In [None]:
import pandas as pd
import numpy as np
import glob
import os
import re

In [None]:
fmg_csvs = glob.glob("./output.roux2017/tresh/*.csv")
def extract_parameters_from_filename(filename):
    match = re.search(r'k(\d+)-sc(\d+).t(\d+)', filename)
    if match:
        ksize = int(match.group(1))
        scaled = int(match.group(2))
        threshold = int(match.group(3))
        return ksize, scaled, threshold
    else:
        return None, None, None

In [None]:
fmg_dfs = []
for inf in fmg_csvs:
    ksize, scaled, threshold = extract_parameters_from_filename(inf)
    inD = pd.read_csv(inf)
    inD['ksize'] = ksize
    inD['scaled'] = scaled
    inD['threshold'] = threshold
    fmg_dfs.append(inD)

fmg = pd.concat(fmg_dfs)

In [18]:
# open actual sample composition
df_sc = pd.read_csv('samplecomp.csv', sep ='\t', 
                    names=['query_name','gi_num_sc', 'match_name', 'coverage_mapping'], header=None)

# open list of Refseq219 that are present absent in version of refseq they used. 
df_diff_rs = pd.read_csv('refseq_diff.csv')

# smas
df_smash = fmg

In [19]:
# clean up sourmash output so we can compare gi numbers
df_smash[['gi_str','gi_num_smash', 'ref_str', 'ref', 'match_name']] = df_smash.name.str.split("|",expand=True)
#df_smash = df_smash[['query_name','match_name','f_match_query', 'intersect_bp','gi_num_smash', 'ref' ]]
df_smash['match_name'] = df_smash.match_name.str.split(",",n=1).str.get(0)
df_smash['match_name'] = df_smash['match_name'].str.replace(" ", "", n = 1)
df_smash.gi_num_smash = pd.to_numeric(df_smash.gi_num_smash, errors='coerce')


# clean up sample composition df
df_sc['query_name'] = df_sc['query_name'].str.replace(r'_comp.tsv', '', regex=True)
df_sc = df_sc[~df_sc.gi_num_sc.str.contains("## Virus")]
# df_sc.gi_num_sc = pd.to_numeric(df_sc.gi_num_sc, errors='coerce')

# merge presence absense with sample composition
df_sc = df_sc.merge(df_diff_rs, on='match_name', how='inner')

# drop where the sample composition is not in this version of RefSeq
df_sc = df_sc[~df_sc.present_in_rs_v219.str.contains("no")]

In [20]:
# all viruses detected by sourmash (both true and false positives)
df_smashallpos = df_smash.groupby(['ksize', 'scaled', 'threshold']).value_counts(['query_name']).reset_index(name='all_pos_smash')

# reference values (control values)
df_ref = df_sc.value_counts(['query_name']).reset_index(name='present_roux2017')

# merge smash results with sample comp
# all true positives found by smash 
df_smashpos = df_smash.merge(df_sc, on=['match_name', 'query_name'], how='inner')

# allentries (including false neg and false pos)
df_smashall = df_smash.merge(df_sc, on=['match_name', 'query_name'], how='outer')



In [None]:

# count true positives and all positives (including false positives)
df_smashpos = df_smashpos.groupby(['ksize', 'scaled', 'threshold']).value_counts(['query_name']).reset_index(name='true_pos_smash')
df_smashall = df_smashall.groupby(['ksize', 'scaled', 'threshold']).value_counts(['query_name']).reset_index(name='all_entries')

df_all = df_ref.merge(df_smashpos, on='query_name')

df_all = df_all.merge(df_smashall, 
                      on=['ksize', 'scaled', 'threshold', 'query_name']).merge(df_smashallpos, 
                                                                               on=['ksize', 'scaled', 'threshold', 'query_name']) 

   
df_all['false_negative'] = df_all['all_entries'] - df_all['all_pos_smash']
df_all['false_positive'] = df_all['all_pos_smash'] - df_all['true_pos_smash']



In [21]:
# calculate precision, recall and f1 scores for each sample
# precision
df_all['precision'] = df_all['true_pos_smash'] / (df_all['true_pos_smash'] + df_all['false_positive'])

# recall
df_all['recall'] = df_all['true_pos_smash'] / (df_all['true_pos_smash'] + df_all['false_negative'])

# F1
df_all['F1'] = 2 * (df_all['precision'] * df_all['recall']) / (df_all['precision'] + df_all['recall'])


In [22]:
df_all

Unnamed: 0,Sample,present_roux2017,true_pos_smash,all_entries,all_pos_smash,false_negative,false_positive,precision,recall,F1
0,Sample_11,915,420,1382,886,496,466,0.474041,0.458515,0.466149
1,Sample_3,899,330,1492,920,572,590,0.358696,0.365854,0.362239
2,Sample_12,859,357,1199,697,502,340,0.512195,0.4156,0.458869
3,Sample_1,767,240,1489,962,527,722,0.24948,0.312907,0.277617
4,Sample_2,766,300,1380,911,469,611,0.329308,0.390117,0.357143
5,Sample_14,754,263,1450,959,491,696,0.274244,0.348806,0.307064
6,Sample_7,723,236,1431,944,487,708,0.25,0.326418,0.283143
7,Sample_13,658,304,1251,896,355,592,0.339286,0.461305,0.390997
8,Sample_6,649,244,1364,959,405,715,0.254432,0.375963,0.303483
9,Sample_8,607,199,1381,972,409,773,0.204733,0.327303,0.251899
