In [1]:
import os
os.chdir("../Table4")

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
RANKS = ['class', 'order', 'family', 'genus', 'species']

In [4]:
def evaluation(preds, s_score, e_score, step):
    tmp = []
    for cutoff in np.arange(s_score, e_score, step):
        sb = {}
        for rank in RANKS:
        #    subdf = df.dropna(subset=[f"{rank}_label"])

            P = preds.query(f"{rank}_score >= @cutoff")
            TP = P.query(f"{rank}_label == {rank}_pred").shape[0]
            FP = P.query(f"{rank}_label != {rank}_pred").shape[0]

            N = preds.query(f"{rank}_score < @cutoff")
            FN = N.dropna(subset=[f"{rank}_label"]).shape[0]
            TN = N.shape[0] - FN

            recall = TP / (TP + FN)
            precision = TP / (TP + FP)
            f1_score = 2 * recall * precision / (recall + precision)

            sb[rank] = {
                'Recall': recall,
                'Precision': precision,
                'F1-score': f1_score
            }

        sb = pd.DataFrame(sb).T.reset_index()
        sb.columns = ['rank', 'Recall', 'Precision', 'F1-score']
        sb['cutoff'] = cutoff
        tmp.append(sb)

    return pd.concat(tmp)

In [5]:
no_mcm = pd.read_csv("DeepCOI/DeepCOI.no_mcm.summary.csv", sep=',', index_col=0)
no_mcm.head(n=2)

Unnamed: 0,class_label,class_pred,class_score,order_label,order_pred,order_score,family_label,family_pred,family_score,genus_label,genus_pred,genus_score,species_label,species_pred,species_score
RRMFE1128-15,Insecta,Insecta,1.0,Hymenoptera,Hymenoptera,1.0,Eulophidae,Eulophidae,0.993,Aprostocetus,Deutereulophus,0.01,,Deutereulophus Malaise6791,0.0013
RRMPC002-15,Insecta,Insecta,1.0,Hemiptera,Hemiptera,1.0,Cicadellidae,Cicadellidae,1.0,Dikraneura,Dikraneura,0.9313,Dikraneura mali,Dikraneura mali,0.9313


In [6]:
nm_sb = evaluation(no_mcm, 0.5, 1.0, 0.05)

In [7]:
nm_sb.query("cutoff >= 0.9 and cutoff < 0.95")

Unnamed: 0,rank,Recall,Precision,F1-score,cutoff
0,class,0.983704,0.99859,0.991091,0.9
1,order,0.929015,0.991223,0.959111,0.9
2,family,0.739371,0.985715,0.844954,0.9
3,genus,0.597806,0.596414,0.597109,0.9
4,species,0.973973,0.819754,0.890234,0.9


In [8]:
deepcoi = pd.read_csv("DeepCOI/DeepCOI.summary.csv", sep=',', index_col=0)
deepcoi.head(n=2)

Unnamed: 0,class_label,class_pred,class_score,order_label,order_pred,order_score,family_label,family_pred,family_score,genus_label,genus_pred,genus_score,species_label,species_pred,species_score
RRMFE1128-15,Insecta,Insecta,1.0,Hymenoptera,Hymenoptera,1.0,Eulophidae,Eulophidae,0.993,Aprostocetus,Deutereulophus,0.01,,Deutereulophus Malaise6791,0.0013
RRMPC002-15,Insecta,Insecta,1.0,Hemiptera,Hemiptera,1.0,Cicadellidae,Cicadellidae,1.0,Dikraneura,Dikraneura,0.9313,Dikraneura mali,Dikraneura mali,0.9313


In [9]:
dc_sb = evaluation(deepcoi, 0.5, 1.0, 0.05)

In [10]:
dc_sb.query("cutoff >= 0.9 and cutoff < 0.95")

Unnamed: 0,rank,Recall,Precision,F1-score,cutoff
0,class,0.986588,0.99854,0.992528,0.9
1,order,0.937902,0.991421,0.963919,0.9
2,family,0.744214,0.985506,0.84803,0.9
3,genus,0.613642,0.599361,0.606417,0.9
4,species,0.973973,0.820069,0.89042,0.9


In [11]:
rdp = pd.read_csv("RDP/DS-PBBC.rdp.csv", sep=',', index_col=0)
rdp.head(n=2)

Unnamed: 0,class_label,class_pred,class_score,order_label,order_pred,order_score,family_label,family_pred,family_score,genus_label,genus_pred,genus_score,species_label,species_pred,species_score
RRMFA002-15,Insecta,Insecta,0.99,Diptera,Diptera,0.98,Chironomidae,Limoniidae,0.72,Parakiefferiella,Dicranomyia,0.72,,Dicranomyia sp.,0.72
RRMFA010-15,Insecta,Insecta,0.98,Diptera,Diptera,0.88,Chironomidae,Limoniidae,0.55,Micropsectra,Dicranomyia,0.55,,Dicranomyia sp.,0.55


In [12]:
rdp_sb = evaluation(rdp, 0.5, 1.0, 0.05)

In [13]:
rdp_sb.query("cutoff >= 0.8 and cutoff < 0.85")

Unnamed: 0,rank,Recall,Precision,F1-score,cutoff
0,class,0.804323,0.978268,0.882809,0.8
1,order,0.621081,0.963621,0.75533,0.8
2,family,0.324572,0.795029,0.460958,0.8
3,genus,0.637632,0.482873,0.549566,0.8
4,species,0.999544,0.473014,0.642145,0.8


In [14]:
blast = pd.read_csv("BLAST/DS-PBBC.blast.csv", sep=',', index_col=0)
blast.head(n=2)

Unnamed: 0,class_label,class_pred,class_score,order_label,order_pred,order_score,family_label,family_pred,family_score,genus_label,genus_pred,genus_score,species_label,species_pred,species_score
RRMFA002-15,Insecta,Insecta,0.88432,Diptera,Diptera,0.88432,Chironomidae,Chironomidae,0.88432,Parakiefferiella,Parakiefferiella,0.88432,,Parakiefferiella scandica,0.88432
RRMFA010-15,Insecta,Insecta,0.87386,Diptera,Diptera,0.87386,Chironomidae,Chironomidae,0.87386,Micropsectra,Micropsectra,0.87386,,Micropsectra recurvata,0.87386


In [15]:
blast_sb = pd.concat([
    evaluation(blast, 0.5, 1.0, 0.05),
    evaluation(blast, 0.94, 0.95, 0.05),
])

In [16]:
b1 = blast_sb.query("rank in ['class', 'order', 'family'] and cutoff >= 0.75 and cutoff < 0.8")
b2 = blast_sb.query("rank == 'genus' and cutoff >= 0.9 and cutoff < 0.94")
b3 = blast_sb.query("rank == 'species' and cutoff >= 0.94 and cutoff < 0.95")

pd.concat([b1, b2, b3])

Unnamed: 0,rank,Recall,Precision,F1-score,cutoff
0,class,0.988334,0.992389,0.990357,0.75
1,order,0.987801,0.931012,0.958566,0.75
2,family,0.996496,0.752281,0.857336,0.75
3,genus,0.685884,0.483202,0.566974,0.9
4,species,0.999543,0.783745,0.878587,0.94


# Supplementary Table S5

In [17]:
def get_exact_counts(df, rank, cutoff):
    labeled = df.dropna(subset=[f"{rank}_label"])
    novel = df.loc[df[f"{rank}_label"].isna()]
    
    TP = labeled.query(f"{rank}_score >= @cutoff and {rank}_pred == {rank}_label").shape[0]
    FP = labeled.query(f"{rank}_score >= @cutoff and {rank}_pred != {rank}_label").shape[0]
    FN = labeled.query(f"{rank}_score < @cutoff").shape[0]
    
    FP_novel = novel.query(f"{rank}_score >= @cutoff").shape[0]
    TN = novel.query(f"{rank}_score < @cutoff").shape[0]
    
    return (labeled.shape[0], novel.shape[0]), (TP, FP, FN, FP_novel, TN)

In [18]:
rank='class'
print(get_exact_counts(no_mcm, rank, 0.9))
print(get_exact_counts(deepcoi, rank, 0.9))
print(get_exact_counts(rdp, rank, 0.8))
print(get_exact_counts(blast, rank, 0.75))

((18742, 0), (18411, 26, 305, 0, 0))
((18742, 0), (18464, 27, 251, 0, 0))
((18742, 0), (14951, 368, 3423, 0, 0))
((18742, 0), (18385, 141, 216, 0, 0))


In [19]:
rank='order'
print(get_exact_counts(no_mcm, rank, 0.9))
print(get_exact_counts(deepcoi, rank, 0.9))
print(get_exact_counts(rdp, rank, 0.8))
print(get_exact_counts(blast, rank, 0.75))

((18506, 236), (17053, 150, 1303, 1, 235))
((18506, 236), (17218, 148, 1140, 1, 235))
((18506, 236), (11357, 557, 6592, 1, 235))
((18506, 236), (17248, 1046, 212, 232, 4))


In [20]:
rank='family'
print(get_exact_counts(no_mcm, rank, 0.9))
print(get_exact_counts(deepcoi, rank, 0.9))
print(get_exact_counts(rdp, rank, 0.8))
print(get_exact_counts(blast, rank, 0.75))

((17692, 1050), (12973, 146, 4573, 42, 1008))
((17692, 1050), (13055, 150, 4487, 42, 1008))
((17692, 1050), (5412, 1511, 10769, 42, 1008))
((17692, 1050), (13936, 3708, 48, 882, 168))


In [21]:
rank='genus'
print(get_exact_counts(no_mcm, rank, 0.9))
print(get_exact_counts(deepcoi, rank, 0.9))
print(get_exact_counts(rdp, rank, 0.8))
print(get_exact_counts(blast, rank, 0.9))

((4295, 14447), (2561, 11, 1723, 1722, 12725))
((4295, 14447), (2627, 14, 1654, 1742, 12705))
((4295, 14447), (2472, 520, 1303, 2357, 12090))
((4295, 14447), (2891, 82, 1322, 3043, 11404))


In [22]:
rank='species'
print(get_exact_counts(no_mcm, rank, 0.9))
print(get_exact_counts(deepcoi, rank, 0.9))
print(get_exact_counts(rdp, rank, 0.8))
print(get_exact_counts(blast, rank, 0.94))

((2274, 16468), (2133, 84, 57, 385, 16083))
((2274, 16468), (2133, 84, 57, 384, 16084))
((2274, 16468), (2191, 82, 1, 2581, 13887))
((2274, 16468), (2189, 84, 1, 520, 15948))
