# Comparison of TANGO, Aggrescan and Aggrescan3D

In [None]:
#Import stuff
import os
import re
import sys
import time
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sn
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from scipy.stats import linregress
from sklearn.model_selection import GroupKFold
import pickle
from statistics import mean


In [None]:
# Load the experimental measurements; any row with a missing value is dropped.
exp = pd.read_csv("../4_FineTuning/jain_full.csv", sep=";")
exp.dropna(inplace=True)

# Cluster table without a header row: column 0 is renamed 'rep', column 1 'id'.
# NOTE(review): the original comment stated "min seq ID for clustering: 0.65"
# while the file is named ABDB_clu_80.tsv - confirm which threshold was used.
clusters = pd.read_csv("../4_FineTuning/ABDB_clu_80.tsv", sep="\t", header=None)
clusters = clusters.rename(columns={0: 'rep', 1: 'id'})

# Number every distinct 'rep' value in order of first appearance
# (cluster_temp_dict) and map each 'id' to the number of its 'rep' (cluster_dict).
cluster_temp_dict = {}
cluster_dict = {}
count = 0
for rep, member in zip(clusters["rep"], clusters["id"]):
    if rep not in cluster_temp_dict:
        cluster_temp_dict[rep] = count
        count += 1
    cluster_dict[member] = cluster_temp_dict[rep]

print(f"Total amount of clusters: {count}")

# Append the cluster number of every row to the experimental table.
# A Name that is absent from the cluster table raises KeyError here.
clusters = [cluster_dict[name] for name in exp["Name"]]
exp["cluster"] = clusters

# Min-max normalise AC-SINS; the extrema are computed once instead of once
# per element.
ac_sins = exp["AC-SINS"]
ac_lo, ac_hi = min(ac_sins), max(ac_sins)
norm_ac = [(data - ac_lo) / (ac_hi - ac_lo) for data in ac_sins]
exp["norm_AC-SINS"] = norm_ac

# Min-max normalise HIC in the same way.
hic = exp["HIC"]
hic_lo, hic_hi = min(hic), max(hic)
norm_hic = [(data - hic_lo) / (hic_hi - hic_lo) for data in hic]
exp["norm_HIC"] = norm_hic

# Add fake labels for testing: one seeded uniform random number per row.
rng = np.random.default_rng(12345)
rand = rng.random(len(norm_ac))
exp["fake"] = rand

# Binary classification label: 0 when AC-SINS <= 5, otherwise 1.
bc = [0 if val <= 5 else 1 for val in exp["AC-SINS"]]
exp["BC"] = bc

In [None]:
# Bare expression: lets the notebook render the experimental table as the cell output.
exp

In [None]:
# Read the table of aggregation-predictor results and discard every row and
# every column that is completely empty.
agc = (
    pd.read_csv("./AggrescanComparison.csv", sep=";")
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
# Remove the last two remaining rows.
# NOTE(review): presumably trailing summary/footer rows - confirm against the CSV.
agc = agc.drop(index=agc.tail(2).index)

In [None]:
# Bare expression: lets the notebook render the cleaned predictor table as the cell output.
agc

In [None]:
def normalize(lst, mx, mn):
    """Min-max rescale the values of ``lst`` onto the interval [mn, mx].

    The smallest input value maps to ``mn`` and the largest to ``mx``.

    Args:
        lst: Iterable of numbers to rescale.
        mx: Upper bound of the target interval.
        mn: Lower bound of the target interval.

    Returns:
        A new list with the rescaled values, in input order. An empty input
        gives an empty list. If all input values are equal there is no range
        to scale by, so every element is mapped to ``mn``.
    """
    values = list(lst)
    if not values:
        return []

    # Compute the extrema once rather than once per element.
    lo, hi = min(values), max(values)
    span = hi - lo
    if span == 0:
        return [mn for _ in values]

    # Honour the requested target width; the scale used to be hard-coded to 1,
    # which silently ignored ``mx``.
    scale = mx - mn
    return [mn + scale * (x - lo) / span for x in values]

In [None]:
# Raw predictor scores taken from the comparison table, plus versions
# rescaled with normalize(..., 1, 0).
a3v = list(agc["a3v Sequence Average (a3vSA):"])
a3vnn = normalize(a3v, 1, 0)
a3d = list(agc["Average Aggrescan3D score"])
a3dnn = normalize(a3d, 1, 0)
tango = list(agc["TANGO"])
tangonn = normalize(tango, 1, 0)

# Binary reference labels, cast to int.
bc = [int(x) for x in agc["BC"]]

# ROC AUC is computed from the continuous scores and does not involve the
# cut-off, so it is evaluated once per predictor instead of once per threshold.
a3v_auc_score = roc_auc_score(bc, a3vnn)
a3d_auc_score = roc_auc_score(bc, a3dnn)
tango_auc_score = roc_auc_score(bc, tangonn)

# One entry per threshold in each of these lists.
a3v_auc, a3v_acc, a3v_mcc = [], [], []
a3d_auc, a3d_acc, a3d_mcc = [], [], []
tango_auc, tango_acc, tango_mcc = [], [], []

# Sweep the cut-off from -0.01 to 1.01 in steps of 0.01; a score at or below
# the cut-off is predicted as class 0, anything above it as class 1.
for i in range(-1, 102):
    cutoff = i / 100
    a3vn = [0 if x <= cutoff else 1 for x in a3vnn]
    a3dn = [0 if x <= cutoff else 1 for x in a3dnn]
    tangon = [0 if x <= cutoff else 1 for x in tangonn]

    a3v_mcc.append(matthews_corrcoef(bc, a3vn))
    a3v_acc.append(accuracy_score(bc, a3vn))
    # Kept at one (identical) value per threshold so the list length is unchanged.
    a3v_auc.append(a3v_auc_score)

    a3d_mcc.append(matthews_corrcoef(bc, a3dn))
    a3d_acc.append(accuracy_score(bc, a3dn))
    a3d_auc.append(a3d_auc_score)

    tango_mcc.append(matthews_corrcoef(bc, tangon))
    tango_acc.append(accuracy_score(bc, tangon))
    tango_auc.append(tango_auc_score)

In [None]:
# Write the best accuracy, AUC and MCC of each predictor to a plain-text report.
with open("COMPARISON.txt", "w") as outfile:
    # (section header, accuracy values, AUC values, MCC values) per predictor
    report_sections = (
        ("#### a3vSA ####\n", a3v_acc, a3v_auc, a3v_mcc),
        ("#### A3D ####\n", a3d_acc, a3d_auc, a3d_mcc),
        ("#### TANGO ####\n", tango_acc, tango_auc, tango_mcc),
    )
    for header, acc_values, auc_values, mcc_values in report_sections:
        outfile.write(header)
        outfile.write(f"Max ACC: {max(acc_values)}\n")
        outfile.write(f"Max AUC: {max(auc_values)}\n")
        outfile.write(f"Max MCC: {max(mcc_values)}\n")

In [None]:
# x-axis values: the thresholds -0.01 .. 1.01 in steps of 0.01.
x = [y / 100 for y in range(-1, 102)]

# Figure-wide defaults (these also affect figures created afterwards).
plt.rcParams['figure.figsize'] = [10, 15]
plt.rcParams['font.size'] = 15

# Two stacked panels: MCC on top, accuracy below.
fig, (ax1, ax2) = plt.subplots(2, 1)
fig.patch.set_facecolor('#FAFAFA')
fig.patch.set_alpha(0.7)
fig.suptitle("Performance Across Aggregation Proneness Threshold \n Aggrescan, Aggrescan3D and TANGO")

# (legend name, colour, MCC values, accuracy values), listed in drawing order.
curves = (
    ("TANGO", "green", tango_mcc, tango_acc),
    ("Aggrescan3D", "red", a3d_mcc, a3d_acc),
    ("Aggrescan", "blue", a3v_mcc, a3v_acc),
)
for curve_name, colour, mcc_values, acc_values in curves:
    best_mcc = max(mcc_values)
    # Accuracy at the first threshold where the MCC reaches its maximum.
    acc_at_best_mcc = acc_values[mcc_values.index(best_mcc)]
    ax1.plot(x, mcc_values, c=colour,
             label=f"{curve_name} (Max value: {round(best_mcc, 3)}) ")
    ax2.plot(x, acc_values, c=colour,
             label=f"{curve_name} (value at max MCC: {round(acc_at_best_mcc, 3)}) ")

threshold_label = "Aggregation proneness threshold for normalized prediction values"

# MCC panel
ax1.grid()
ax1.set_xlabel(threshold_label)
ax1.set_ylabel("MCC")
ax1.legend()

# Accuracy panel
ax2.grid()
ax2.legend()
ax2.set_ylabel("Accuracy")
ax2.set_xlabel(threshold_label)

# A single layout pass; the original called tight_layout twice in a row.
fig.tight_layout(pad=1)
fig.savefig('./COMPARISON.png', facecolor=fig.get_facecolor(), edgecolor='none')
#plt.show()



In [None]:
# Draw a two-set Venn diagram titled "Collected database for pre-training".
# The export call at the bottom is commented out, so nothing is written to disk.
# NOTE(review): venn2 is not imported anywhere in the visible source
# (presumably it comes from the matplotlib_venn package) - confirm it is in
# scope before running this cell.
plt.figure(figsize=(10, 10))
v = venn2(
    subsets=(11049, 9349, 2220),
    set_labels=('PSI-Biology Dataset', 'NESG Dataset'),
)

# NOTE(review): '100' is a three-digit region id used with a two-set diagram -
# confirm it addresses the intended region.
v.get_patch_by_id('100').set_alpha(0.5)
for region_id, region_colour in (('10', 'tab:orange'), ('01', 'dodgerblue')):
    v.get_patch_by_id(region_id).set_color(region_colour)

plt.title("Collected database for pre-training", fontsize=25)
#plt.savefig(f'../0_DataPreprocessing/venn1.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Draw a two-set Venn diagram titled "AlphaFold represented structures".
# The export call at the bottom is commented out, so nothing is written to disk.
# NOTE(review): venn2 is not imported anywhere in the visible source
# (presumably it comes from the matplotlib_venn package) - confirm it is in
# scope before running this cell.
plt.figure(figsize=(10, 10))
v = venn2(
    subsets=(2005, 2072, 443),
    set_labels=('PSI-Biology Dataset', 'NESG Dataset'),
)

# NOTE(review): '100' is a three-digit region id used with a two-set diagram -
# confirm it addresses the intended region.
v.get_patch_by_id('100').set_alpha(0.5)
for region_id, region_colour in (('10', 'tab:orange'), ('01', 'dodgerblue')):
    v.get_patch_by_id(region_id).set_color(region_colour)

plt.title("AlphaFold represented structures", fontsize=25)
#plt.savefig(f'../0_DataPreprocessing/venn2.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Scatter each predictor's raw score against the AC-SINS column and report the
# Spearman rank correlation in the legend of each panel.
acsins = list(agc["AC-SINS"])
# Not used in this cell; kept so the notebook-level name stays defined.
x = [y / 100 for y in range(-1, 102)]

# Figure-wide defaults (these also affect figures created afterwards).
plt.rcParams['figure.figsize'] = [10, 15]
plt.rcParams['font.size'] = 15

# Three stacked panels, one per predictor.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1)
fig.patch.set_facecolor('#FAFAFA')
fig.patch.set_alpha(0.7)
fig.suptitle("Correlation of AC-SINS values and aggregation predictions \n Aggrescan, Aggrescan3D and TANGO")

# Spearman correlation coefficients; p is overwritten each time and only the
# coefficients are displayed.
r1, p = spearmanr(acsins, tango)
r2, p = spearmanr(acsins, a3d)
r3, p = spearmanr(acsins, a3v)

# (axes, scores, colour, legend text, y-axis label) for each panel
panels = (
    (ax1, tango, "green", f"TANGO (Spearmanr: {round(r1,3)}) ", "TANGO Prediction"),
    (ax2, a3d, "red", f"Aggrescan3D (Spearmanr: {round(r2,3)}) ", "Aggrescan3D Prediction"),
    (ax3, a3v, "blue", f"Aggrescan (Spearmanr: {round(r3,3)}) ", "Aggrescan Prediction"),
)
for axis, scores, colour, legend_text, y_label in panels:
    axis.scatter(acsins, scores, c=colour, label=legend_text)
    axis.grid()
    axis.set_xlabel("AC-SINS")
    axis.set_ylabel(y_label)
    axis.legend(loc="upper center")

fig.tight_layout(pad=1)

fig.savefig(f'./COMPARISON_correlation.png', facecolor=fig.get_facecolor(), edgecolor='none')
#plt.show()
#plt.show()

