# Imports

In [None]:
import pandas as pd
import numpy as np

## Levels to handle

In [None]:
# Taxonomic levels handled by this notebook, listed from the broadest to the
# most specific one. The helper functions iterate over them in this order.
levels = [
    "domain",
    "class",
    "order",
    "family",
    "genus",
    "species",
]

## Select the type of split data

In [None]:
# Name of the data split to evaluate; it is used as a directory name when the
# prediction and reference file paths are built.
splitter = "IsolatedRandomSplit"

## Load files of feature-classifier predictions

In [None]:
# Location of the classifier's taxonomy.tsv output for every level that has
# predictions, keyed by level name (there is no entry for "domain").
results_paths = {
    rank: f"./{splitter}/{rank}/results/data/taxonomy.tsv"
    for rank in ("class", "order", "family", "genus", "species")
}

## Format the taxonomic classification data and split it into level columns

In [None]:
def SplitLevels(row):
    """Spread a row's "Taxon" string over one entry per taxonomic level.

    The "Taxon" value is read as a ";"-separated list of rank entries such
    as ``d__Eukaryota; c__Dinophyceae``. Every entry whose three-character
    prefix is known is stored under the matching level name (``domain``,
    ``class``, ``order``, ``family``, ``genus`` or ``species``).

    Entries are skipped when they carry no name after the prefix, when they
    start with ``Una`` (e.g. ``Unassigned``), or when their prefix has no
    matching level (such entries used to raise ``KeyError``).

    Parameters
    ----------
    row : pandas.Series or dict
        Record holding a "Taxon" entry. It is left unmodified.

    Returns
    -------
    pandas.Series or dict
        A copy of ``row`` with the recognised level entries filled in. The
        copy is returned unchanged when "Taxon" is not a string (e.g. NaN).
    """
    levels_columns = {
        "d__": "domain",
        "c__": "class",
        "o__": "order",
        "f__": "family",
        "g__": "genus",
        "s__": "species",
    }

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    filled = row.copy()

    taxon = filled["Taxon"]
    if not isinstance(taxon, str):
        # Missing taxonomy: there is nothing to split.
        return filled

    # Split on ";" and strip, so both "a; b" and "a;b" layouts are accepted.
    for entry in taxon.split(";"):
        entry = entry.strip()
        prefix, name = entry[:3], entry[3:]

        if name == "" or prefix == "Una":
            continue

        column = levels_columns.get(prefix)
        if column is None:
            # Rank prefix that has no level column here.
            continue

        filled[column] = name

    return filled

In [None]:
def LoadLevelsColumns(dataset, level):
    """Return ``dataset`` with one column per taxonomic level parsed from "Taxon".

    Level columns are created, in the order of the module-level ``levels``
    list, up to and including ``level`` (all of them when ``level`` is not in
    the list). Each row is then passed through ``SplitLevels`` to fill them.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding a "Taxon" column. It is left unmodified; the work is
        done on a copy.
    level : str
        Last level for which a column is created up front.

    Returns
    -------
    pandas.DataFrame
        The parsed frame, without any column whose values are all missing.
    """
    # insert() works in place, so copy first to keep the caller's frame intact.
    dataset = dataset.copy()

    # Pre-create the level columns (as NaN) so they exist before parsing.
    for name in levels:
        # insert() raises ValueError on an existing column; keep it instead.
        if name not in dataset.columns:
            dataset.insert(dataset.shape[1], name, np.nan)
        if name == level:
            break

    parsed = dataset.apply(SplitLevels, axis=1)
    return parsed.dropna(axis="columns", how="all")

## Check, for each row, whether the prediction matches the reference

In [None]:
def CheckResults(row, level):
    """Flag whether a joined prediction/reference row agrees down to ``level``.

    Walks the module-level ``levels`` list in order and, for every level that
    has a ``<name>_pred`` entry in the row, compares it with ``<name>_ref``.
    The walk stops after ``level``. Levels without a ``_pred`` entry are
    ignored.

    Parameters
    ----------
    row : pandas.Series
        One row of the joined frame. It is left unmodified.
    level : str
        Last level to compare.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with a boolean "correct" entry that is True only
        when every compared level matched.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    checked = row.copy()

    matches = True
    for name in levels:
        pred_col = name + "_pred"
        if pred_col in checked.index:
            # NOTE(review): a missing value (NaN) never compares equal, so a
            # level missing on both sides counts as a mismatch. Confirm that
            # this is the intended scoring.
            matches = bool(checked[pred_col] == checked[name + "_ref"]) and matches

        if name == level:
            break

    checked["correct"] = matches
    return checked

## Execute results analysis for each level

In [None]:
# For every level that has predictions: load the predictions and the reference
# taxonomy, compare them row by row and print the summary counts.
for level, result_path in results_paths.items():

    # Classifier output, indexed by its first column.
    results = pd.read_csv(result_path, sep="\t", index_col=0)
    results = LoadLevelsColumns(results, level)

    # Reference taxonomy of the test split; column names are supplied here.
    reference_path = f"../new_data/{splitter}/{level}/pr2_test_taxonomy.txt"
    reference = pd.read_csv(
        reference_path,
        names=["Feature ID", "Taxon"],
        sep="\t",
        index_col=0,
    )
    reference = LoadLevelsColumns(reference, level)

    # Columns present on both sides get the "_pred" / "_ref" suffixes that
    # CheckResults looks up. join() keeps every row of `results`.
    data = results.join(reference, lsuffix="_pred", rsuffix="_ref")
    data = data.apply(CheckResults, axis=1, level=level)

    # Build the mask once and derive all counts from it.
    is_correct = data["correct"]
    n_total = data.shape[0]
    n_correct = data.loc[is_correct].shape[0]
    n_wrong = data.loc[~is_correct].shape[0]

    print(f"Level: {level}")
    print(f"Total of sequences: {n_total}")
    print(f"Correct predictions: {n_correct}")
    print(f"Wrong predictions: {n_wrong}")
    print(f"Accuracy: {n_correct / n_total}")
    print("\n")
