In [1]:
import pandas as pd
from Bio import SeqIO
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import os
os.chdir("../Sup_Table_S6")

In [2]:
PHYLA = ['Actinobacteria', 'Bacteroidetes', 'Cyanobacteria', 'Firmicutes', 'Proteobacteria']

In [5]:
# Total number of queries
def load_nseqs(file_name):
    parser = SeqIO.index(file_name, format='fasta')
    
    return len(parser.keys())

n_queries = {
    phylum: load_nseqs(f"data/{phylum}.COI.fasta")
    for phylum in PHYLA
}

In [6]:
# DeepCOI-phylum
low_score = {}
outgroups = {}

for phylum in PHYLA:
    preds = pd.read_csv(f"DeepCOI/{phylum}.phylum.txt", sep='\t', header=None)
    preds.columns = ['sid', 'rank', 'pred', 'score']
    
    low_score[phylum] = preds.query("score < 0.9").shape[0]
    outgroups[phylum] = preds.query("score >= 0.9 and pred == 'outgroups'").shape[0]

In [7]:
n_deeper = {}
n_order = {}
n_class = {}
n_uncl = {}

for phylum in PHYLA:
    df = pd.read_csv(f"DeepCOI/lower_ranks/{phylum}.lower.txt", sep='\t', header=None)
    df.columns = ['sid', 'rank_c', 'class', 'score_c', 'rank_o', 'order', 'score_o', 'rank_f', 'family', 'score_f', 'rank_g', 'genus', 'score_g', 'rank_s', 'species', 'score_s']
    
    u = df.query("score_c < 0.9").shape[0]
    c = df.query("score_c > 0.9").shape[0]
    o = df.query("score_o > 0.9").shape[0]
    d = df.shape[0] - u - c
    
    n_deeper[phylum] = d
    n_order[phylum] = o
    n_class[phylum] = c - o
    n_uncl[phylum] = u

In [8]:
df = pd.DataFrame({
    'n_query': n_queries,
    'low_score': low_score,
    'outgroups': outgroups,
    'n_uncl': n_uncl,
    'n_class': n_class,
    'n_order': n_order,
    'n_deeper': n_deeper
})
df['FP'] = df['n_query'] - df['outgroups'] - df['low_score']

In [9]:
df

Unnamed: 0,n_query,low_score,outgroups,n_uncl,n_class,n_order,n_deeper,FP
Actinobacteria,658,635,1,5,6,11,0,22
Bacteroidetes,104,96,7,0,1,0,0,1
Cyanobacteria,50,42,7,1,0,0,0,1
Firmicutes,195,190,2,3,0,0,0,3
Proteobacteria,4550,4215,46,52,64,173,0,289


# Sup_Table_S6

In [10]:
df[['n_query', 'FP', 'n_deeper']]

Unnamed: 0,n_query,FP,n_deeper
Actinobacteria,658,22,0
Bacteroidetes,104,1,0
Cyanobacteria,50,1,0
Firmicutes,195,3,0
Proteobacteria,4550,289,0
