In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.metrics import roc_curve, auc
import imageio

In [2]:
# Choose the comparison to analyse by leaving exactly one assignment active.
# analysis_type = "Single-Family-Cross-Genus"
# analysis_type = "Single-Genus-Cross-Species"
analysis_type = "Single-Family-Cross-Species"

# The results CSV is named after the selected analysis type.
data_file_name = f"../../data_temp/{analysis_type}.csv"

# Read the "Mask" column as text: masks are long strings of 0/1 digits and
# must not be parsed as numbers (that would drop their leading zeros).
df = pd.read_csv(
    data_file_name,
    dtype={'Mask':'str'},
)

In [3]:
# File names look like "../../data_temp/<ANALYSIS-TYPE>/<SPECIES-NAME>/<FILE>", where <ANALYSIS-TYPE> is the value of analysis_type.

def obtain_species_name(filename, analysis=None):
    """Extract the species directory name from a data file path.

    The path is expected to contain "/<analysis>/<SPECIES-NAME>/<FILE>";
    the component immediately following "/<analysis>/" is returned.

    Parameters
    ----------
    filename : str
        Path of a data file.
    analysis : str, optional
        Name of the analysis directory to split on. Defaults to the
        notebook-level ``analysis_type``.

    Returns
    -------
    str or None
        The species name, or ``None`` when ``filename`` is not a string or
        does not contain the "/<analysis>/" component. In that case the
        offending value is printed so it can be inspected.
    """
    if analysis is None:
        analysis = analysis_type
    try:
        after_analysis_dir = filename.split(f'/{analysis}/')[1]
        return after_analysis_dir.split('/')[0]
    except (IndexError, AttributeError):
        # IndexError: the analysis directory is not part of the path.
        # AttributeError: the value is not a string (e.g. a NaN cell).
        # Anything else (KeyboardInterrupt, genuine bugs) must propagate.
        print(filename)
        return None

# print(obtain_species_name('../../data_temp/Single-Family-Cross-Species/Streptococcus-mutans/GCA_012641825.1.fna'))
# print(obtain_species_name('../../data_temp/Cross-Species/aiwfawgfoafabgaw/9.fna'))

In [4]:
# For every k-mer mask in the results table: label each row, compute the ROC
# curve and its area, save a ROC plot and an ECDF plot, and collect the
# per-mask AUROC values for the summary scatter plot at the end.

# k-mer size (x) and AUROC (y) per mask, split by spaced / contiguous masks.
spaced_x = []
spaced_y = []

nonspaced_x = []
nonspaced_y = []

# Paths of the saved ECDF plots, split the same way.
ecdf_spaced_filenames = []
ecdf_nonspaced_filenames = []

masks = set(df["Mask"])
for mask in masks:
    # Skip rows whose mask is the literal column name
    # (presumably repeated header lines in the CSV — TODO confirm).
    if mask == "Mask":
        continue
    print(mask)
    mask_filtered_df = df[df["Mask"] == mask]

    # Number of '1' characters in the mask, halved.
    # NOTE(review): presumably each k-mer position takes two mask characters — confirm.
    mask_char_count = len(re.findall('(?=1)',str(mask)))/2
    # Number of (overlapping) adjacent "11" pairs in the mask.
    mask_double_counts = len(re.findall('(?=11)',str(mask)))
    # A single unbroken run of n ones contains exactly n - 1 adjacent pairs;
    # any other pair count means the ones are separated by gaps (spaced mask).
    spaced = mask_double_counts + 1 != mask_char_count * 2
    print(mask_char_count, mask_double_counts,mask,spaced)

    # Columns are read by position: 0 and 1 are the two file paths being
    # compared, 2 is the score (plotted below as "Reported ANI").
    y_true = []
    y_prob = []
    for row in mask_filtered_df.to_numpy():
        # Positive label when both files belong to the same species directory.
        same_species = obtain_species_name(row[0]) == obtain_species_name(row[1])
        y_true.append(1 if same_species else 0)
        y_prob.append(float(row[2]))
    print(y_true[:50])
    print(y_prob[:50])
    fpr, tpr, _ = roc_curve(y_true,y_prob)
    roc_auc = auc(fpr, tpr)

    auroc_filename = f"../../plots/AUROC/{analysis_type}/AUROC-{analysis_type}-{spaced=}-{mask_char_count}mer.png"
    ecdf_filename = f"../../plots/AUROC/{analysis_type}/ECDF-{analysis_type}-{spaced=}-{mask_char_count}mer.png"

    if spaced:
        spaced_x.append(mask_char_count)
        spaced_y.append(roc_auc)
        ecdf_spaced_filenames.append(ecdf_filename)
    else:
        nonspaced_x.append(mask_char_count)
        nonspaced_y.append(roc_auc)
        ecdf_nonspaced_filenames.append(ecdf_filename)

    # ROC curve, zoomed into the top-left corner of the unit square.
    plt.figure(figsize=(10,10))
    plt.plot(fpr, tpr, color="orange", lw=2, label=f"area = {roc_auc}")
    plt.plot([0,1],[0,1], color="navy", lw=2, linestyle='-')
    plt.xlim([0.0, 0.3])
    plt.ylim([0.7, 1.01])
    # fpr is plotted on the x axis and tpr on the y axis.
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC curve in {analysis_type} for kmer-size = {mask_char_count}")
    plt.legend(loc="lower right")
    plt.savefig(auroc_filename)
    plt.close()

    # ECDF of the score: all rows, then negatives and positives separately.
    plt.figure(figsize=(10,10))
    plt.ecdf(y_prob, label="ECDF Total")
    plt.ecdf([y for y,ty in zip(y_prob,y_true) if ty == 0], label="ECDF False")
    plt.ecdf([y for y,ty in zip(y_prob,y_true) if ty == 1], label="ECDF True")
    plt.legend(loc="upper left")

    plt.xlim([0.7, 1.0])
    plt.ylim([0.0, 1.0])
    plt.title(f"ECDF in {analysis_type} for kmer-size = {mask_char_count} with respect to reported ANI")
    plt.xlabel("Reported ANI")
    plt.savefig(ecdf_filename)
    plt.close()

# Summary: AUROC against k-mer size, spaced vs. contiguous masks.
plt.figure(figsize=(10,10))
plt.scatter(spaced_x, spaced_y, label="Spaced k-mers")
plt.scatter(nonspaced_x, nonspaced_y, label="Contiguous k-mers")
plt.ylabel("AUROC")
plt.xlabel("Length of k-mer")
plt.title("AUROC against Length of k-mer")
plt.legend(loc="lower right")
plt.savefig(f"../../plots/AUROC/{analysis_type}/{analysis_type}-AUROC-by-length.png")
plt.close()



00000000000000000000000000000000000000000000000000000000000000000000000011110011000011111111111100001100111111111100111100001100
18.0 29 00000000000000000000000000000000000000000000000000000000000000000000000011110011000011111111111100001100111111111100111100001100 True
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1.0, 0.956262, 0.999547, 0.992508, 0.999525, 0.956895, 0.999973, 0.99996, 0.999529, 0.993586, 0.990626, 0.992906, 0.992768, 0.996671, 0.956281, 0.952748, 0.999953, 0.999982, 0.991207, 0.999384, 0.722514, 0.722514, 0.713198, 0.722514, 0.722514, 0.721953, 0.721953, 0.720808, 0.722514, 0.723068, 0.722514, 0.722514, 0.721953, 0.721384, 0.723068, 0.723068, 0.723068, 0.721953, 0.723068, 0.720224, 0.818307, 0.817083, 0.817289, 0.817358, 0.812534, 0.817289, 0.81722, 0.817767, 0.814172, 0.817289]
000000000000000000000000000000000000000000000000000000000000000011111111111111111111

In [5]:
def _kmer_size_sort_key(filename):
    """Sort key that orders ECDF plot files by the k-mer size in their name.

    The plot files end in "-<size>mer.png". A plain string sort would place
    "18.0mer" before "8.0mer", so the size is parsed and compared as a number.
    Names without a parsable size sort last, alphabetically among themselves.
    """
    size_match = re.search(r'-(\d+(?:\.\d+)?)mer\.png$', filename)
    size = float(size_match.group(1)) if size_match else float('inf')
    return (size, filename)


def _load_frames(filenames):
    """Read every image in ``filenames`` (in the given order) as a GIF frame."""
    return [imageio.v3.imread(filename) for filename in filenames]


# Animated GIF of the ECDF plots for spaced masks, in increasing k-mer size.
ecdf_spaced_gif_filename = f"../../plots/AUROC/{analysis_type}/ECDF-{analysis_type}-spaced.gif"
ecdf_spaced_filenames.sort(key=_kmer_size_sort_key)
ecdf_spaced_files = _load_frames(ecdf_spaced_filenames)
imageio.mimsave(ecdf_spaced_gif_filename,ecdf_spaced_files,duration=200)

# Same for contiguous (non-spaced) masks.
ecdf_nonspaced_gif_filename = f"../../plots/AUROC/{analysis_type}/ECDF-{analysis_type}-nonspaced.gif"
ecdf_nonspaced_filenames.sort(key=_kmer_size_sort_key)
ecdf_nonspaced_files = _load_frames(ecdf_nonspaced_filenames)
imageio.mimsave(ecdf_nonspaced_gif_filename,ecdf_nonspaced_files,duration=200)
