In [18]:
import polars as pl

In [19]:
# Load the microbial fractions table; the file is tab-separated despite its .csv.gz name.
df = pl.read_csv(
    '../data/sra/microbial_fractions.with_ave_genome_size.csv.gz',
    separator='\t',
)
# Preview the first three rows together with the (rows, columns) shape.
df.head(3), df.shape

(shape: (3, 6)
 ┌────────────┬────────────────────┬─────────────────┬───────────────┬────────────────────┬─────────┐
 │ ---        ┆ _bases             ┆ ---             ┆ ---           ┆ archaeal_genom…    ┆ ---     │
 │ str        ┆ ---                ┆ f64             ┆ str           ┆ ---                ┆ str     │
 │            ┆ i64                ┆                 ┆               ┆ i64                ┆         │
 ╞════════════╪════════════════════╪═════════════════╪═══════════════╪════════════════════╪═════════╡
 │ SRR8634435 ┆ 1178752140         ┆ 1.2550e9        ┆ 93.92%        ┆ 3419746            ┆ null    │
 │ SRR8640623 ┆ 1418455610         ┆ 3.2740e9        ┆ 43.32%        ┆ 1941733            ┆ null    │
 │ SRR8692214 ┆ 7569181622         ┆ 7.7290e9        ┆ 97.93%        ┆ 3353276            ┆ null    │
 └────────────┴────────────────────┴─────────────────┴───────────────┴────────────────────┴─────────┘,
 (251919, 6))

In [20]:
# Turn the 'read_fraction' strings (e.g. "93.92%") into a numeric column by
# removing the percent sign and casting to Float32.
microbial_fraction_pct = (
    pl.col('read_fraction')
    .str.replace('%', '')
    .cast(pl.Float32)
    .alias('microbial_fraction (%)')
)
df = df.with_columns(microbial_fraction_pct)
df.head(3)

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning,microbial_fraction (%)
str,i64,f64,str,i64,str,f32
"""SRR8634435""",1178752140,1255000000.0,"""93.92%""",3419746,,93.919998
"""SRR8640623""",1418455610,3274000000.0,"""43.32%""",1941733,,43.32
"""SRR8692214""",7569181622,7729000000.0,"""97.93%""",3353276,,97.93


In [21]:
# Filter rows based on NCBI method metadata: keep runs with library_strategy "WGS",
# library_selection "RANDOM" and more than 0.5 Gbp.
# R snippet carried over from the original comment, kept for reference:
# ncbi_method <- read_delim("data/sra/NCBI_method_taxonomy_processed.csv.gz")
#   filter(library_strategy == "WGS" & library_selection == "RANDOM") %>%
#   filter(gbp > 0.5)
extra_metadata = pl.read_csv(
    '../data/sra/extra_metadata_short.tsv.gz',
    separator='\t',
    null_values=['NA', 'null'],
)
# A row is kept only when all three conditions are true; rows where any
# condition is null are dropped, exactly as with successive .filter() calls.
passes_metadata_filter = (
    (pl.col('library_strategy') == 'WGS')
    & (pl.col('library_selection') == 'RANDOM')
    & (pl.col('Gbp') > 0.5)
)
extra_metadata = extra_metadata.filter(passes_metadata_filter)

# Remember the row count before the inner join so the message can report both numbers.
original_smf_count = df.shape[0]
df = df.join(extra_metadata, left_on='sample', right_on='run', how='inner')
print("Found {} SRA runs that pass the metadata filtering out of {} original".format(df.shape[0], original_smf_count))

Found 136284 SRA runs that pass the metadata filtering out of 251919 original


In [22]:
## Commented out to avoid overwriting the file (and now deprecated anyway)
# df.select('sample','bacterial_archaeal_bases','metagenome_size','microbial_fraction (%)','average_bacterial_archaeal_genome_size','warning').write_csv('../data/Supplementary_Data_1.csv')

# How many had warnings?

In [23]:
# Fraction of rows whose 'warning' column is not null.
(df.height - df['warning'].null_count()) / df.height

0.002839658360482522

# Gather marine info

In [24]:
# Load the marine table that carries the per-sample filter size range.
marine_filter_sizes = pl.read_csv(
    'marine/marine_smf_and_filter_sizes.csv',
)
# Show the (rows, columns) shape followed by the first three rows.
marine_filter_sizes.shape, marine_filter_sizes.head(3)

((1306, 17),
 shape: (3, 17)
 ┌───────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────┐
 │ sample    ┆ bacterial_ ┆ metagenome ┆ read_fract ┆ … ┆ lower_bou ┆ upper_bou ┆ size_rang ┆ smf   │
 │ ---       ┆ archaeal_b ┆ _size      ┆ ion        ┆   ┆ nd        ┆ nd        ┆ e         ┆ ---   │
 │ str       ┆ ases       ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ ---       ┆ f64   │
 │           ┆ ---        ┆ f64        ┆ str        ┆   ┆ f64       ┆ f64       ┆ str       ┆       │
 │           ┆ i64        ┆            ┆            ┆   ┆           ┆           ┆           ┆       │
 ╞═══════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════╡
 │ ERR598976 ┆ 2804781680 ┆ 3.9094e10  ┆ 71.74%     ┆ … ┆ 0.22      ┆ 3.0       ┆ 0.22-3.0  ┆ 71.74 │
 │           ┆ 0          ┆            ┆            ┆   ┆           ┆           ┆           ┆       │
 │ ERR599295 ┆ 713946683  ┆ 2.9695e10  ┆ 2.40%      ┆

In [25]:
# Attach the marine filter size range to each run. The left join keeps every
# row of df; samples missing from the marine table get a null.
marine_size_ranges = marine_filter_sizes.select(
    'sample',
    pl.col('size_range').alias('marine sample filter size range (um)'),
)
df = df.join(marine_size_ranges, on='sample', how='left')

In [30]:
# Preview the first three rows of df whose sample also appears in the marine table.
# `is_in` is documented to take an expression, a collection or a Series, so the
# 'sample' column is passed as a Series rather than as a one-column DataFrame.
df.filter(
    pl.col('sample').is_in(marine_filter_sizes.get_column('sample'))
).head(3)

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning,microbial_fraction (%),bioproject,Gbp,library_strategy,library_selection,model,sample_name,taxon_name,marine sample filter size range (um),Fungi to Bacteria Ratio (soil)
str,i64,f64,str,i64,str,f32,str,f64,str,str,str,str,str,str,f64
"""ERR598976""",28047816800,39094000000.0,"""71.74%""",1608800,,71.739998,"""PRJEB1787""",39.095,"""WGS""","""RANDOM""","""Illumina HiSeq…","""TARA_B10000156…","""marine metagen…","""0.22-3.0""",
"""ERR599295""",713946683,29695000000.0,"""2.40%""",3766932,,2.4,"""PRJEB4352""",29.696,"""WGS""","""RANDOM""","""Illumina HiSeq…","""TARA_N00000074…","""marine metagen…","""5.0-20.0""",
"""ERR599069""",7986370080,9401000000.0,"""84.95%""",1677284,,84.949997,"""PRJEB1787""",9.402,"""WGS""","""RANDOM""","""Illumina HiSeq…","""TARA_B10000067…","""marine metagen…","""0.22-3.0""",


# Gather soil info

In [27]:
# Load the soil table that carries the 'Fungi to Bacteria Ratio' column.
soil_fungal_ratios = pl.read_csv(
    'soil/soil_data_for_supplementary.csv',
)
# Show the (rows, columns) shape followed by the first three rows.
soil_fungal_ratios.shape, soil_fungal_ratios.head(3)

((3960, 29),
 shape: (3, 29)
 ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬──────────┬───────────┐
 │ acc       ┆ bioprojec ┆ organism  ┆ bacterial ┆ … ┆ host_or_n ┆ host_or_n ┆ tropical ┆ Fungi to  │
 │ ---       ┆ t         ┆ ---       ┆ _archaeal ┆   ┆ ot_record ┆ ot_mature ┆ ---      ┆ Bacteria  │
 │ str       ┆ ---       ┆ str       ┆ _bases    ┆   ┆ ed        ┆ ---       ┆ bool     ┆ Ratio     │
 │           ┆ str       ┆           ┆ ---       ┆   ┆ ---       ┆ str       ┆          ┆ ---       │
 │           ┆           ┆           ┆ i64       ┆   ┆ str       ┆           ┆          ┆ f64       │
 ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
 │ ERR174226 ┆ PRJEB8420 ┆ soil meta ┆ 914834837 ┆ … ┆ ecologica ┆ ecologica ┆ false    ┆ 0.059587  │
 │ 3         ┆           ┆ genome    ┆ 9         ┆   ┆ l         ┆ l         ┆          ┆           │
 │ SRR747173 ┆ PRJNA4645 ┆ soil meta ┆ 491501756 ┆ … 

In [28]:
# Attach the soil fungi-to-bacteria ratio to each run. The soil table keys its
# rows by 'acc', so that column is exposed as 'sample' for the join. The left
# join keeps every row of df; samples missing from the soil table get a null.
soil_ratio_by_sample = soil_fungal_ratios.select(
    pl.col('acc').alias('sample'),
    pl.col('Fungi to Bacteria Ratio').alias('Fungi to Bacteria Ratio (soil)'),
)
df = df.join(soil_ratio_by_sample, on='sample', how='left')

In [31]:
# Preview the first three rows of df whose sample also appears in the soil table.
# `is_in` is documented to take an expression, a collection or a Series, so the
# 'acc' column is passed as a Series rather than as a one-column DataFrame.
df.filter(
    pl.col('sample').is_in(soil_fungal_ratios.get_column('acc'))
).head(3)

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning,microbial_fraction (%),bioproject,Gbp,library_strategy,library_selection,model,sample_name,taxon_name,marine sample filter size range (um),Fungi to Bacteria Ratio (soil)
str,i64,f64,str,i64,str,f32,str,f64,str,str,str,str,str,str,f64
"""ERR2239870""",635913170,942000000.0,"""67.51%""",4221970,,67.510002,"""PRJEB24343""",0.943,"""WGS""","""RANDOM""","""Illumina HiSeq…","""LEP-SZ-8-T""","""soil metagenom…",,0.057234
"""ERR2233332""",1526053390,2293000000.0,"""66.55%""",4746224,,66.550003,"""PRJEB24179""",2.293,"""WGS""","""RANDOM""","""Illumina HiSeq…","""CJ006""","""soil metagenom…",,0.059775
"""ERR2239869""",631240409,971000000.0,"""65.01%""",4347985,,65.010002,"""PRJEB24343""",0.971,"""WGS""","""RANDOM""","""Illumina HiSeq…","""LEP-SZ-8-B""","""soil metagenom…",,0.064008


In [35]:
import gzip
# Write the annotated table as a gzip-compressed CSV in the working directory.
with gzip.open('supplementary_data_from_python.csv.gz', mode='wb') as gz_out:
    df.write_csv(gz_out)