In [2]:
import polars as pl

In [3]:
# Load the microbial-fraction table; the file is read as tab-separated
# even though its name ends in .csv.gz.
df = pl.read_csv(
    '../data/sra/microbial_fractions.with_ave_genome_size.csv.gz',
    separator='\t',
)
# Show the first three rows together with the (rows, columns) shape.
df.head(3), df.shape

(shape: (3, 6)
 ┌────────────┬────────────────────┬─────────────────┬───────────────┬────────────────────┬─────────┐
 │ ---        ┆ _bases             ┆ ---             ┆ ---           ┆ archaeal_genom…    ┆ ---     │
 │ str        ┆ ---                ┆ f64             ┆ str           ┆ ---                ┆ str     │
 │            ┆ i64                ┆                 ┆               ┆ i64                ┆         │
 ╞════════════╪════════════════════╪═════════════════╪═══════════════╪════════════════════╪═════════╡
 │ SRR8634435 ┆ 1178752140         ┆ 1.2550e9        ┆ 93.92%        ┆ 3419746            ┆ null    │
 │ SRR8640623 ┆ 1418455610         ┆ 3.2740e9        ┆ 43.32%        ┆ 1941733            ┆ null    │
 │ SRR8692214 ┆ 7569181622         ┆ 7.7290e9        ┆ 97.93%        ┆ 3353276            ┆ null    │
 └────────────┴────────────────────┴─────────────────┴───────────────┴────────────────────┴─────────┘,
 (251919, 6))

In [4]:
# Add a numeric 'microbial_fraction (%)' column: strip the '%' sign from the
# string column `read_fraction` and cast the remainder to a float.
# NOTE(review): with Float32 a value such as 93.92 is displayed as 93.919998 in
# this cell's output; confirm that single precision is acceptable downstream.
df = df.with_columns(
    pl.col('read_fraction')
    .str.replace('%', '')
    .cast(pl.Float32)
    .alias('microbial_fraction (%)')
)
df.head(3)

sample,bacterial_archaeal_bases,metagenome_size,read_fraction,average_bacterial_archaeal_genome_size,warning,microbial_fraction (%)
str,i64,f64,str,i64,str,f32
"""SRR8634435""",1178752140,1255000000.0,"""93.92%""",3419746,,93.919998
"""SRR8640623""",1418455610,3274000000.0,"""43.32%""",1941733,,43.32
"""SRR8692214""",7569181622,7729000000.0,"""97.93%""",3353276,,97.93


In [5]:
# Filter rows based on NCBI method metadata, mirroring the R reference filter:
#   library_strategy == "WGS" & library_selection == "RANDOM", then gbp > 0.5
extra_metadata = pl.read_csv(
    '../data/sra/extra_metadata_short.tsv.gz',
    separator='\t',
    null_values=['NA', 'null'],
)
# A row is kept only when all three conditions hold.
extra_metadata = extra_metadata.filter(
    (pl.col('library_strategy') == 'WGS')
    & (pl.col('library_selection') == 'RANDOM')
    & (pl.col('Gbp') > 0.5)
)

# Record the row count before joining so the report can show both numbers.
original_smf_count = df.height
# Inner join of `sample` against `run`: samples without a matching row in the
# filtered metadata are dropped.
df = df.join(extra_metadata, left_on='sample', right_on='run', how='inner')
print("Found {} SRA runs that pass the metadata filtering out of {} original".format(df.height, original_smf_count))

Found 136284 SRA runs that pass the metadata filtering out of 251919 original


In [6]:
# Export of the selected columns to ../data/Supplementary_Data_1.csv.
# Left commented out so that re-running the notebook does not overwrite the
# existing file; uncomment the line below to regenerate it.
# df.select('sample','bacterial_archaeal_bases','metagenome_size','microbial_fraction (%)','average_bacterial_archaeal_genome_size','warning').write_csv('../data/Supplementary_Data_1.csv')

# How many had warnings?

In [9]:
# Fraction of rows whose `warning` column is not null
# (count of non-null warnings divided by the total number of rows).
df['warning'].is_not_null().sum() / df.height

0.002839658360482522