# What to use

## Goal: make a coverage table for each of the gather results
- Make a presence/absence table based on `f_match_orig`
- So we can make an accumulation curve for the MAGs recovered

- `f_match_orig` is approximately the % coverage: the percentage of a genome that is found in a metagenome

- Use about 80% for strain specificity

- 20% f_match_orig for species level


In [190]:
# imports
import pandas as pd
import sys
import csv
import argparse
from glob import glob
import os.path



In [191]:
# Build the list of sample names from the gather CSV files: strip the
# directory, then keep everything before the first '.' of the file name.
acclist = [
    os.path.basename(csv_path).split('.')[0]
    for csv_path in glob('./gatherout/MAGs_gtdbk/*.csv')
]

# report how many gather result files were found
print(len(acclist))

#acclist= ['SRR11183406', 'ERR1135186']

829


In [192]:
# Map each sample name to its gather output, reduced to the two columns
# used downstream: the match 'name' and its 'f_match_orig' value.
sample_dfs = {
    acc: pd.read_csv(f'./gatherout/MAGs_gtdbk/{acc}.csv')[['name', 'f_match_orig']]
    for acc in acclist
}
    



In [193]:
# Seed the combined table with the first sample: remove it from the front
# of acclist and relabel its f_match_orig column with the sample name.
sample = acclist[0]
del acclist[0]
combined_df = sample_dfs[sample]
combined_df.rename(mapper={'f_match_orig': sample}, axis='columns', inplace=True)


In [194]:
# Outer-join every remaining sample onto the combined table so that each
# sample contributes one column (named after the sample) keyed on 'name'.
# acclist is consumed from the front, so it is empty once this finishes.
while acclist:
    sample = acclist.pop(0)
    print(sample)
    # Non-inplace rename: returns a fresh frame instead of mutating a
    # column slice in place.
    sample_df = sample_dfs[sample][['name', 'f_match_orig']].rename(
        columns={'f_match_orig': sample})
    combined_df = combined_df.merge(sample_df, on='name', how='outer')

# The outer joins leave NaN wherever a name has no row in a given sample.
# Fill those gaps with 0 once, after all merges, rather than rescanning
# the whole (growing) table on every iteration.
combined_df = combined_df.fillna(value=0)

SRR11183406
SRR11183374
SRR11183412
ERR1135348
SRR8960911
SRR11125799
SRR11125941
SRR15410036
SRR12795745
ERR1135189
SRR21276820
SRR11185261
SRR22460785
SRR8960285
SRR12795786
SRR11126521
SRR11125766
SRR5240747
SRR8960440
SRR22460801
SRR8960326
SRR14369295
SRR11183770
SRR21977526
ERR3211991
SRR5241539
ERR3211985
SRR8960118
SRR11124915
SRR11124901
ERR1135200
ERR1135214
ERR3212042
SRR11125410
SRR21266986
ERR1135228
SRR11125404
SRR11125405
ERR1135229
ERR8314740
SRR8960864
ERR1135215
ERR1135201
SRR17241510
SRR8960643
ERR3211984
SRR11551371
ERR3211990
ERR3211760
SRR11126332
SRR11126440
SRR11183765
ERR3211947
SRR21274830
SRR11184022
SRR8960327
SRR11125015
SRR11125767
SRR22460753
SRR11126508
SRR11126520
SRR12795787
ERR1135188
SRR21276809
SRR11183639
SRR8960086
SRR14369135
SRR11125559
SRR5976181
SRR11183407
SRR11126093
SRR11124684
SRR11126085
SRR8960721
SRR5976183
SRR5008237
SRR8960709
ERR1135363
SRR12795791
SRR22460745
SRR11125003
ERR3211979
SRR8960319
SRR2329609
ERR3211992
SRR11183559
ERR113

SRR2329775
ERR2020020
ERR1135633
SRR11125779
SRR10209670
SRR11183814
SRR12795772
ERR1135196
ERR1135182
SRR11125547
SRR11183380
SRR14812377
ERR1135357
ERR1135431
SRR8960729
SRR11183431
SRR8960715
ERR1855540
ERR1135390
SRR11183390
SRR17241481
SRR11126062
SRR11185246
SRR11125796
SRR11185252
ERR1135186
ERR1135192
SRR11125966
ERR1135179
SRR11125999
SRR11126506
SRR22460749
SRR21977515
SRR15732368
SRR8960473
SRR11183780
SRR8960498
SRR14369259
SRR5241536
SRR11125392
SRR11125423
SRR11125345
ERR8314772
ERR1135233
ERR1135227
ERR1135226
ERR1135232
SRR11124700
SRR8960880
SRR11124933
SRR11125634
SRR11126329
ERR3211790
ERR2020019
SRR11126513
SRR22460774
ERR1135178
ERR1135193
SRR8960512
ERR1135187
SRR11125797
ERR1135434
SRR11126077
SRR11124890
SRR8960710
SRR11183352
SRR11183434
SRR11124689
SRR11183387
SRR11125226
SRR11125232
ERR1135378
ERR1135191
ERR1135185
SRR22460776
ERR2020027
SRR11183807
SRR10209677
ERR3211792
SRR15410172
ERR3211989
SRR11126303
ERR1135218
ERR7197523
ERR1135224
SRR11124702
ERR11352

In [195]:
# Use the match name as the row index (in place); the 'name' column is
# dropped from the data so only the per-sample columns remain.
combined_df.set_index(keys='name', drop=True, inplace=True)


In [208]:
# Take independent copies of the combined table. The thresholding below
# assigns into these frames in place; with plain assignment both names
# would refer to the same object as combined_df, so binarising one would
# overwrite the raw values that the other threshold still needs.
species = combined_df.copy()
strain = combined_df.copy()

In [227]:
# Binarise in place at the 0.2 cutoff: values at or above 0.2 become 1,
# values below it become 0.
species[species >= 0.2] = 1
species[species < 0.2] = 0

# Keep only the rows that are non-zero in at least one column.
df_s = species[species.ne(0).any(axis='columns')]

len(df_s)

20529

In [228]:
# Write the species-level presence/absence table (index included) to CSV.
df_s.to_csv(path_or_buf='./out_csvs/240123_species_presab.csv', index=True)


In [230]:
# Binarise in place at the 0.75 cutoff: values at or above 0.75 become 1,
# values below it become 0.
strain[strain >= 0.75] = 1
strain[strain < 0.75] = 0

# Keep only the rows that are non-zero in at least one column.
df_st = strain[strain.ne(0).any(axis='columns')]

len(df_st)

20509

In [232]:
# Write the strain-level presence/absence table (index included) to CSV.
df_st.to_csv(path_or_buf='./out_csvs/240123_strain_presab.csv', index=True)