In [1]:
import polars as pl

In [2]:
# Per-run microbial fraction table. Despite the .csv.gz suffix the file is
# read with a tab separator.
df = pl.read_csv(
    '../data/sra/microbial_fractions.with_ave_genome_size.csv.gz',
    separator='\t',
)
# Preview the first rows alongside the overall (rows, columns) shape.
df.head(3), df.shape

(shape: (3, 6)
 ┌────────────┬────────────────────┬─────────────────┬───────────────┬────────────────────┬─────────┐
 │ ---        ┆ _bases             ┆ ---             ┆ ---           ┆ archaeal_genom…    ┆ ---     │
 │ str        ┆ ---                ┆ f64             ┆ str           ┆ ---                ┆ str     │
 │            ┆ i64                ┆                 ┆               ┆ i64                ┆         │
 ╞════════════╪════════════════════╪═════════════════╪═══════════════╪════════════════════╪═════════╡
 │ SRR8634435 ┆ 1178752140         ┆ 1.2550e9        ┆ 93.92%        ┆ 3419746            ┆ null    │
 │ SRR8640623 ┆ 1418455610         ┆ 3.2740e9        ┆ 43.32%        ┆ 1941733            ┆ null    │
 │ SRR8692214 ┆ 7569181622         ┆ 7.7290e9        ┆ 97.93%        ┆ 3353276            ┆ null    │
 └────────────┴────────────────────┴─────────────────┴───────────────┴────────────────────┴─────────┘,
 (251919, 6))

In [3]:
# 'read_fraction' holds percentage strings such as "93.92%"; strip the '%'
# sign and store the number as Float32 in a new 'microbial_fraction (%)' column.
df = df.with_columns(
    pl.col('read_fraction')
    .str.replace('%', '')
    .cast(pl.Float32)
    .alias('microbial_fraction (%)')
)
df.head(3)

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning,microbial_fraction (%)
str,i64,f64,str,i64,str,f32
"""SRR8634435""",1178752140,1255000000.0,"""93.92%""",3419746,,93.919998
"""SRR8640623""",1418455610,3274000000.0,"""43.32%""",1941733,,43.32
"""SRR8692214""",7569181622,7729000000.0,"""97.93%""",3353276,,97.93


In [4]:
# Restrict df to runs whose NCBI metadata passes the library filter.
# Ported from the R snippet that read data/sra/NCBI_method_taxonomy_processed.csv.gz:
#   filter(library_strategy == "WGS" & library_selection == "RANDOM") %>%
#   filter(gbp > 0.5)
extra_metadata = pl.read_csv(
    '../data/sra/extra_metadata_short.tsv.gz',
    separator='\t',
    null_values=['NA', 'null'],
).filter(
    (pl.col('library_strategy') == 'WGS')
    & (pl.col('library_selection') == 'RANDOM')
    & (pl.col('Gbp') > 0.5)
)

# Remember the pre-filter row count so the summary can report both numbers.
original_smf_count = df.height
# Inner join: only samples with a matching 'run' in the filtered metadata survive.
df = df.join(extra_metadata, left_on='sample', right_on='run', how='inner')
print("Found {} SRA runs that pass the metadata filtering out of {} original".format(df.height, original_smf_count))

Found 136284 SRA runs that pass the metadata filtering out of 251919 original


In [5]:
## Export to ../data/Supplementary_Data_1.csv; left commented out to avoid overwriting the existing file
# df.select('sample','bacterial_archaeal_bases','metagenome_size','microbial_fraction (%)','average_bacterial_archaeal_genome_size','warning').write_csv('../data/Supplementary_Data_1.csv')

# How many had warnings?

In [6]:
# Fraction of runs with a non-null 'warning': (total rows - null warnings) / total rows.
(df.height - df['warning'].null_count()) / df.height

0.002839658360482522

# Gather marine info

In [8]:
# Marine samples with their filter-size annotations (comma-separated file,
# path relative to the notebook's working directory).
marine_filter_sizes = pl.read_csv(
    'marine/marine_smf_and_filter_sizes.csv'
)
# Show the (rows, columns) shape followed by a three-row preview.
marine_filter_sizes.shape, marine_filter_sizes.head(3)

((1306, 17),
 shape: (3, 17)
 ┌───────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────┐
 │ sample    ┆ bacterial_ ┆ metagenome ┆ read_fract ┆ … ┆ lower_bou ┆ upper_bou ┆ size_rang ┆ smf   │
 │ ---       ┆ archaeal_b ┆ _size      ┆ ion        ┆   ┆ nd        ┆ nd        ┆ e         ┆ ---   │
 │ str       ┆ ases       ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ ---       ┆ f64   │
 │           ┆ ---        ┆ f64        ┆ str        ┆   ┆ f64       ┆ f64       ┆ str       ┆       │
 │           ┆ i64        ┆            ┆            ┆   ┆           ┆           ┆           ┆       │
 ╞═══════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════╡
 │ ERR598976 ┆ 2804781680 ┆ 3.9094e10  ┆ 71.74%     ┆ … ┆ 0.22      ┆ 3.0       ┆ 0.22-3.0  ┆ 71.74 │
 │           ┆ 0          ┆            ┆            ┆   ┆           ┆           ┆           ┆       │
 │ ERR599295 ┆ 713946683  ┆ 2.9695e10  ┆ 2.40%      ┆

In [13]:
# Attach the marine filter size range to each run. A left join keeps every
# row of df; samples absent from marine_filter_sizes get a null in the new column.
df = df.join(
    marine_filter_sizes.select(
        'sample',
        pl.col('size_range').alias('marine sample filter size range (um)'),
    ),
    on='sample',
    how='left',
)

# Gather soil info

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning,microbial_fraction (%),bioproject,Gbp,library_strategy,library_selection,model,sample_name,taxon_name,marine sample filter size range (um)
str,i64,f64,str,i64,str,f32,str,f64,str,str,str,str,str,str
"""SRR8692214""",7569181622,7.7290e9,"""97.93%""",3353276,,97.93,"""PRJNA296814""",7.73,"""WGS""","""RANDOM""","""NextSeq 500""","""nhbcs_06187001…","""human gut meta…",
"""SRR8675913""",3552674829,5.0900e9,"""69.80%""",4268554,,69.800003,"""PRJNA431482""",5.091,"""WGS""","""RANDOM""","""Illumina HiSeq…","""N-49""","""human gut meta…",
"""SRR8663617""",2365776296,9.2750e9,"""25.51%""",3973690,,25.51,"""PRJNA525405""",9.276,"""WGS""","""RANDOM""","""NextSeq 500""","""AP-DNA 1""","""air metagenome…",
"""SRR8675889""",7309031262,7.8190e9,"""93.48%""",4523391,,93.480003,"""PRJNA431482""",7.82,"""WGS""","""RANDOM""","""Illumina HiSeq…","""N-22""","""human gut meta…",
"""SRR8609101""",3264659414,4.5660e9,"""71.50%""",3823592,,71.5,"""PRJNA523806""",4.567,"""WGS""","""RANDOM""","""NextSeq 500""","""contr_1_sra""","""mouse gut meta…",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SRR9924787""",11597333858,1.4346e10,"""80.84%""",2145981,,80.839996,"""PRJNA559231""",14.347,"""WGS""","""RANDOM""","""Illumina HiSeq…","""6XLD_WA""","""freshwater met…",
"""SRR9965713""",285791506,1.1810e9,"""24.20%""",2693097,,24.200001,"""PRJNA558989""",1.181,"""WGS""","""RANDOM""","""Illumina HiSeq…","""mWGS_124""","""human skin met…",
"""SRR9943783""",1674393030,2.8900e9,"""57.94%""",3056131,,57.939999,"""PRJNA559605""",2.891,"""WGS""","""RANDOM""","""NextSeq 500""","""LP7-M""","""metagenome""",
"""SRR9948921""",4472338018,5.6920e9,"""78.57%""",5028432,,78.57,"""PRJNA555798""",5.693,"""WGS""","""RANDOM""","""Illumina HiSeq…","""Biofilm-B16""","""biofilm metage…",
