In [2]:
import polars as pl
from plotnine import *
from plotnine.themes import theme_set, theme_bw
# Make plotnine's black-and-white theme the default for figures drawn later in the notebook.
theme_set(theme_bw())

In [3]:
# Load the Kingfisher SRA metadata dump; the file is tab-separated despite its .csv.gz name.
# NOTE(review): ignore_errors=True presumably hides values that fail to parse under the
# inferred schema -- confirm that silently losing them is acceptable here.
kingfisher = pl.read_csv(
    "../../data/sra/kingfisher_metadata.20220616.csv.gz",
    separator='\t',
    infer_schema_length=100000,
    ignore_errors=True,
)
print("Read {} rows".format(kingfisher.height))
# Optional IPC copy of the same table (left disabled):
# kingfisher.write_ipc('../../data/sra/kingfisher_metadata.20220616.ipc')

Read 295709 rows


In [4]:
def show_all(df, width=200, max_col_width=True):
    """
    Print a polars dataframe in full, without row or column truncation.

    The display settings are applied through a ``pl.Config`` context
    manager, so they are only in effect while the frame is printed.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to print.
    width : int, optional
        Table width in characters. The same value is used as the string
        length limit when full cell contents are requested.
        Defaults to 200.
    max_col_width : bool, optional
        When truthy, raise the string length limit to ``width`` so cell
        contents are printed in full. Single-column frames get this
        treatment regardless of the flag.
        Defaults to True.
    """
    with pl.Config() as display_cfg:
        # -1 is passed as the row/column limit so nothing is elided.
        display_cfg.set_tbl_cols(-1)
        display_cfg.set_tbl_rows(-1)
        display_cfg.set_tbl_width_chars(width)

        # Decide whether string cells should be widened; the column count
        # is only consulted when the caller did not ask for it explicitly.
        if max_col_width:
            widen_strings = True
        else:
            widen_strings = len(df.columns) == 1

        if widen_strings:
            display_cfg.set_fmt_str_lengths(width)

        print(df)

In [5]:
# Run accession -> bioproject / organism lookup table.
df = pl.read_csv('../../data/sra/acc_organism.csv.gz')
df.head(3)

acc,bioproject,organism
str,str,str
"""SRR13213323""","""PRJNA506850""","""manure metagen…"
"""SRR1535388""","""PRJNA256106""","""human gut meta…"
"""SRR1535357""","""PRJNA256106""","""human gut meta…"


In [6]:
# Gather SMF: per-sample microbial fraction table (tab-separated despite the .csv.gz suffix).
smf = pl.read_csv(
    '../../data/sra/microbial_fractions.with_ave_genome_size.csv.gz',
    separator='\t',
    has_header=True,
)
smf.head(3)

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning
str,i64,f64,str,i64,str
"""SRR8634435""",1178752140,1255000000.0,"""93.92%""",3419746,
"""SRR8640623""",1418455610,3274000000.0,"""43.32%""",1941733,
"""SRR8692214""",7569181622,7729000000.0,"""97.93%""",3353276,


In [7]:
# Attach the microbial-fraction columns to each accession; the inner join keeps
# only runs present in both tables.
m = df.join(
    smf,
    how='inner',
    left_on='acc',
    right_on='sample',
)
m.shape, m.head(3)

((251835, 8),
 shape: (3, 8)
 ┌────────────┬────────────┬────────────┬────────────┬────────────┬───────────┬───────────┬─────────┐
 │ ---        ┆ ---        ┆ ---        ┆ archaeal_b ┆ _size      ┆ tion      ┆ acterial_ ┆ ---     │
 │ str        ┆ str        ┆ str        ┆ ases       ┆ ---        ┆ ---       ┆ archaeal_ ┆ str     │
 │            ┆            ┆            ┆ ---        ┆ f64        ┆ str       ┆ genom…    ┆         │
 │            ┆            ┆            ┆ i64        ┆            ┆           ┆ ---       ┆         │
 │            ┆            ┆            ┆            ┆            ┆           ┆ i64       ┆         │
 ╞════════════╪════════════╪════════════╪════════════╪════════════╪═══════════╪═══════════╪═════════╡
 │ SRR1321332 ┆ PRJNA50685 ┆ manure     ┆ 3748652624 ┆ 6.0250e9   ┆ 62.22%    ┆ 3803576   ┆ null    │
 │ 3          ┆ 0          ┆ metagenome ┆            ┆            ┆           ┆           ┆         │
 │ SRR1535388 ┆ PRJNA25610 ┆ human gut  ┆ 5158374718 

In [10]:
# Restrict to runs whose organism field is exactly 'food metagenome'.
food = m.filter(
    pl.col('organism') == 'food metagenome'
)
food.shape, food.head(3)

((1654, 8),
 shape: (3, 8)
 ┌────────────┬────────────┬────────────┬────────────┬────────────┬───────────┬───────────┬─────────┐
 │ ---        ┆ ---        ┆ ---        ┆ archaeal_b ┆ _size      ┆ tion      ┆ acterial_ ┆ ---     │
 │ str        ┆ str        ┆ str        ┆ ases       ┆ ---        ┆ ---       ┆ archaeal_ ┆ str     │
 │            ┆            ┆            ┆ ---        ┆ f64        ┆ str       ┆ genom…    ┆         │
 │            ┆            ┆            ┆ i64        ┆            ┆           ┆ ---       ┆         │
 │            ┆            ┆            ┆            ┆            ┆           ┆ i64       ┆         │
 ╞════════════╪════════════╪════════════╪════════════╪════════════╪═══════════╪═══════════╪═════════╡
 │ SRR765683  ┆ PRJNA18598 ┆ food       ┆ 669429574  ┆ 1.8060e9   ┆ 37.07%    ┆ 3132420   ┆ null    │
 │            ┆ 1          ┆ metagenome ┆            ┆            ┆           ┆           ┆         │
 │ SRR8451843 ┆ PRJNA20344 ┆ food       ┆ 202843524  ┆ 

In [11]:
# Bring in the Kingfisher metadata columns for the food runs. This is an inner
# join, so food runs with no matching `run` in the metadata table are dropped.
food2 = food.join(
    kingfisher,
    how='inner',
    left_on='acc',
    right_on='run',
)

In [13]:
# Shape plus a small random preview of the joined table. The seed is fixed so
# the same three rows are shown on every Restart & Run All.
food2.shape, food2.sample(3, seed=0)

((1652, 6025),
 shape: (3, 6_025)
 ┌────────────┬────────────┬────────────┬────────────┬───┬─────────┬────────┬───────────┬───────────┐
 │ acc        ┆ bioproject ┆ organism   ┆ bacterial_ ┆ … ┆ mixture ┆ mating ┆ virus_enr ┆ sample_id │
 │ ---        ┆ ---        ┆ ---        ┆ archaeal_b ┆   ┆ ---     ┆ type   ┆ ich_appr2 ┆ _Miseq    │
 │ str        ┆ str        ┆ str        ┆ ases       ┆   ┆ str     ┆ ---    ┆ ---       ┆ ---       │
 │            ┆            ┆            ┆ ---        ┆   ┆         ┆ str    ┆ str       ┆ str       │
 │            ┆            ┆            ┆ i64        ┆   ┆         ┆        ┆           ┆           │
 ╞════════════╪════════════╪════════════╪════════════╪═══╪═════════╪════════╪═══════════╪═══════════╡
 │ SRR6659450 ┆ PRJNA43239 ┆ food       ┆ 281373960  ┆ … ┆ null    ┆ null   ┆ null      ┆ null      │
 │            ┆ 0          ┆ metagenome ┆            ┆   ┆         ┆        ┆           ┆           │
 │ ERR5740243 ┆ PRJEB44083 ┆ food       ┆ 653805

In [24]:
# Number of food-metagenome runs per study title, printed without truncation.
study_title_counts = food2.group_by('study_title').len()
show_all(study_title_counts)

shape: (56, 2)
┌──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────┐
│ study_title                                                                                                                                                                          ┆ len │
│ ---                                                                                                                                                                                  ┆ --- │
│ str                                                                                                                                                                                  ┆ u32 │
╞══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪═════╡
│ Romaine lettuce Raw sequence

In [31]:
# Soy sauce: runs from studies whose title contains 'Soy', with their read fraction.
# The sample is seeded so the displayed rows are reproducible across kernel restarts.
(
    food2
    .filter(pl.col('study_title').str.contains('Soy'))
    .select('acc', 'read_fraction')
    .sample(7, seed=0)
)

acc,read_fraction
str,str
"""SRR648391""","""24.13%"""
"""SRR765682""","""55.80%"""
"""SRR765683""","""37.07%"""
"""SRR765684""","""43.46%"""
"""SRR765685""","""30.70%"""
"""SRR765686""","""35.87%"""
"""SRR765687""","""19.84%"""
