In [1]:
import pandas as pd

In [2]:
# Locations of the per-method e-value tables (tab-separated files),
# given relative to the directory the notebook is run from.

# Individual search methods
DALI_EVALUES_PATH = "./data/evalues/dali.tsv"
FOLDSEEK_EVALUES_PATH = "./data/evalues/foldseek.tsv"
PDBE_EVALUES_PATH = "./data/evalues/pdbe.tsv"

# Combined result sets
UNION_EVALUES_PATH = "./data/evalues/union.tsv"
INTERSECTION_EVALUES_PATH = "./data/evalues/intersection.tsv"

In [4]:
# Load every e-value table into its own pandas DataFrame.
def _read_tsv(path):
    """Read the tab-separated file at `path` into a DataFrame."""
    return pd.read_csv(path, sep="\t")


pdb_evalues = _read_tsv(PDBE_EVALUES_PATH)
foldseek_evalues = _read_tsv(FOLDSEEK_EVALUES_PATH)
dali_evalues = _read_tsv(DALI_EVALUES_PATH)
intersection_evalues = _read_tsv(INTERSECTION_EVALUES_PATH)
union_evalues = _read_tsv(UNION_EVALUES_PATH)

In [20]:
# Columns in each table: evalue, f1, accuracy, precision, recall, mcc, TP, FP, TN, FN, MCC
# This cell plots MCC and F1 against the e-value threshold for each method.
import plotly.express as px
import plotly.graph_objects as go

fig = go.Figure()

# (legend label, table) pairs in the order the traces are added.
# NOTE(review): union_evalues is loaded but not drawn in this figure —
# confirm that leaving it out is intentional.
mcc_f1_curves = [
    ("PDB", pdb_evalues),
    ("Foldseek", foldseek_evalues),
    ("DALI", dali_evalues),
    ("Intersection", intersection_evalues),
]

for method_label, method_evalues in mcc_f1_curves:
    # Solid line: MCC. The legend name has to match the plotted column
    # (the labels were previously swapped with the F1 ones).
    fig.add_trace(
        go.Scatter(
            x=method_evalues["evalue"],
            y=method_evalues["MCC"],
            mode="lines",
            name=method_label + " (MCC)",
        )
    )
    # Dotted line: F1.
    fig.add_trace(
        go.Scatter(
            x=method_evalues["evalue"],
            y=method_evalues["f1"],
            mode="lines",
            name=method_label + " (F1)",
            line=dict(dash="dot"),
        )
    )

# Log-scaled x axis; the tick format avoids SI unit suffixes (like p, n, m).
fig.update_xaxes(type="log", tickformat=".2")

# Axis/legend titles, and no padding around the plot.
fig.update_layout(
    xaxis_title="E-Value",
    yaxis_title="MCC / F1",
    legend_title="Method",
    margin=dict(l=0, r=0, b=0, t=0),
)

# Save the image as a png to the image folder
# IMAGES_FOLDER = "./doc/images/"
# fig.write_image(IMAGES_FOLDER + "evalue_vs_accuracy.png", width=800, height=400, scale=2)
fig.show()

In [21]:
# Number of incorrectly classified proteins (false positives plus false
# negatives) at each e-value threshold, one curve per method.
fig = go.Figure()

# (legend label, table) pairs in the order the traces are added.
misclassified_curves = (
    ("PDB", pdb_evalues),
    ("Foldseek", foldseek_evalues),
    ("DALI", dali_evalues),
    ("Intersection", intersection_evalues),
    ("Union", union_evalues),
)

for curve_label, curve_table in misclassified_curves:
    # Each method is drawn as a line with a marker at every threshold.
    wrong_calls = curve_table["FP"] + curve_table["FN"]
    fig.add_trace(
        go.Scatter(
            x=curve_table["evalue"],
            y=wrong_calls,
            mode="lines+markers",
            name=curve_label,
        )
    )

# Log-scaled x axis with a two-significant-digit tick format.
fig.update_xaxes(type="log", tickformat=".2")

# No padding around the plot, plus axis and legend titles.
fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=0),
    xaxis_title="Evalue",
    yaxis_title="Incorrectly Classified Proteins",
    legend_title="Method",
)

# Save the image as a png to the image folder
# fig.write_image(IMAGES_FOLDER + "false_negatives_vs_false_positives.png", width=800, height=400, scale=2)

fig.show()

In [59]:
# We fix the e-value threshold at 10^-5 and look for the best method at that
# threshold, i.e. the one with the lowest number of misclassified proteins (FP + FN).
EVALUE_THRESHOLD = 1e-5


def _rows_at_evalue(evalues, threshold, rel_tol=1e-9):
    """Return the rows of `evalues` whose "evalue" matches `threshold`.

    The comparison uses a relative tolerance instead of `==`, because the
    column is parsed from text and may not be bit-identical to a float
    computed in Python.

    Raises:
        ValueError: if no row matches the threshold.
    """
    is_match = (evalues["evalue"] - threshold).abs() <= rel_tol * abs(threshold)
    rows = evalues.loc[is_match]
    if rows.empty:
        raise ValueError("no row found for evalue threshold " + str(threshold))
    return rows


def _misclassified_count(rows):
    """Return FP + FN taken from the first row of `rows`."""
    # Index the columns first so the integer dtype of FP/FN is preserved.
    return rows["FP"].iloc[0] + rows["FN"].iloc[0]


pdbe_evalues_row = _rows_at_evalue(pdb_evalues, EVALUE_THRESHOLD)
pdbe_score = _misclassified_count(pdbe_evalues_row)

foldseek_evalues_row = _rows_at_evalue(foldseek_evalues, EVALUE_THRESHOLD)
foldseek_score = _misclassified_count(foldseek_evalues_row)

dali_evalues_row = _rows_at_evalue(dali_evalues, EVALUE_THRESHOLD)
dali_score = _misclassified_count(dali_evalues_row)

intersection_evalues_row = _rows_at_evalue(intersection_evalues, EVALUE_THRESHOLD)
intersection_score = _misclassified_count(intersection_evalues_row)

union_evalues_row = _rows_at_evalue(union_evalues, EVALUE_THRESHOLD)
union_score = _misclassified_count(union_evalues_row)

# Print the result for each method.
method_scores = [
    ("PDBe", pdbe_score),
    ("Foldseek", foldseek_score),
    ("DALI", dali_score),
    ("Intersection", intersection_score),
    ("Union", union_score),
]
for method_label, method_score in method_scores:
    print(method_label + ": " + str(method_score))


PDBe: 1
Foldseek: 1
DALI: 1
Intersection: 1
Union: 1


In [60]:
# Display the PDBe row(s) selected at EVALUE_THRESHOLD.
pdbe_evalues_row

Unnamed: 0,evalue,f1,accuracy,precision,recall,mcc,TP,FP,TN,FN,MCC
3,1e-05,0.998496,0.999998,1.0,0.996997,0,332,0,553177,1,0.998496


In [61]:
# Display the Foldseek row(s) selected at EVALUE_THRESHOLD.
foldseek_evalues_row

Unnamed: 0,evalue,f1,accuracy,precision,recall,mcc,TP,FP,TN,FN,MCC
3,1e-05,0.998492,0.999998,1.0,0.996988,0,331,0,553177,1,0.998492


In [62]:
# Display the DALI row(s) selected at EVALUE_THRESHOLD.
dali_evalues_row

Unnamed: 0,evalue,f1,accuracy,precision,recall,mcc,TP,FP,TN,FN,MCC
3,1e-05,0.998487,0.999998,1.0,0.996979,0.0,330,0,553177,1,0.998487


In [63]:
# Display the intersection row(s) selected at EVALUE_THRESHOLD.
intersection_evalues_row

Unnamed: 0,evalue,f1,accuracy,precision,recall,mcc,TP,FP,TN,FN,MCC
3,1e-05,0.998514,0.999998,1.0,0.997033,0.0,336,0,553177,1,0.998514


In [57]:
# Display the union row(s) selected at EVALUE_THRESHOLD.
# NOTE(review): the saved output of this cell has an older execution count (57)
# than the threshold cell (59) and shows evalue 1e-06, so it appears to predate
# the current EVALUE_THRESHOLD — re-run the notebook top to bottom to refresh it.
union_evalues_row

Unnamed: 0,evalue,f1,accuracy,precision,recall,mcc,TP,FP,TN,FN,MCC
4,1e-06,0.998487,0.999998,1.0,0.996979,0.0,330,0,553177,1,0.998487
