# Load Data

In [None]:
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("data/y2hEdgotyping/y2hWithMutPred2Info.csv",index_col=0)

In [None]:
df.head()

In [None]:
# Difference between the "_wt" and "_mt" value of each LW* column pair.
df = df.assign(delta1=df.LWH1_f_wt - df.LWH1_f_mt,
               delta2=df.LWH10_f_wt - df.LWH10_f_mt,
               delta3=df.LWH25_f_wt - df.LWH25_f_mt,
               delta4=df.LWA_f_wt - df.LWA_f_mt,
               delta5=df.LWAH1_f_wt - df.LWAH1_f_mt)
# y is True when at least one of the five differences is >= 2.
# The comparison is done column-wise (NaN compares as False, exactly as in a
# per-row `v >= 2`), which avoids a slow row-by-row apply.
df = df.assign(y=df[["delta1", "delta2", "delta3", "delta4", "delta5"]].ge(2).any(axis=1))

In [None]:
import pickle


def _readGraphPickle(path):
    """Load a pickled graph object from ``path``.

    ``nx.read_gpickle`` was removed in NetworkX 3.0; for an uncompressed
    ``.gpickle`` file it was a thin wrapper around ``pickle.load``, so this
    works on both old and new NetworkX versions.

    NOTE: unpickling can execute arbitrary code -- only load files you trust.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


edgotype_train = _readGraphPickle("data/y2hEdgotyping/edgotype_train.gpickle")
edgotype_val = _readGraphPickle("data/y2hEdgotyping/edgotype_val.gpickle")
edgotype_test = _readGraphPickle("data/y2hEdgotyping/edgotype_test.gpickle")

In [None]:
next(iter(edgotype_train.nodes(data=True)))

In [None]:
edgotype_train.edges("ENSG00000185900",data=True)

# Can I train a classifier to predict whether a variant will lead to any loss of PPI?

In [None]:
import itertools

In [None]:
import scipy
from tqdm.notebook import tqdm
_LW_COLUMNS = ("LWH1_f", "LWH10_f", "LWH25_f", "LWA_f", "LWAH1_f")
_FEATURE_SETS = ("deepFRI", "mutpredFeatures", "mutpredScore")


def _edgeExceedsThreshold(edge, threshold):
    """Return True if any ``<col>_wt - <col>_mt`` difference on ``edge`` is >= threshold."""
    return any((edge[f"{lvl}_wt"] - edge[f"{lvl}_mt"]) >= threshold
               for lvl in _LW_COLUMNS)


def makeFeats(G,FEATURE_SET="mutpredFeatures",groupFilter=None,
              mutpredDir="/data/dzeiberg/ppi/y2hEdgotyping/mutpred2Results",
              threshold=2):
    """Build one feature row and one binary label per variant in ``G``.

    Edges are grouped by their ``"db_mut_id_mt"`` attribute and every group
    produces one sample. The label is 1.0 when, for any edge of the group, any
    of the LWH1_f / LWH10_f / LWH25_f / LWA_f / LWAH1_f ``_wt`` minus ``_mt``
    differences is >= ``threshold``; otherwise 0.0. Features are taken from the
    first edge of the (sorted) group.

    Parameters
    ----------
    G : graph-like
        Must provide ``edges(data=True)`` yielding ``(u, v, attrs)`` triples
        and ``nodes[u]`` attribute lookup (e.g. a networkx graph).
    FEATURE_SET : {"mutpredFeatures", "mutpredScore", "deepFRI"}
        * ``"mutpredFeatures"``: row ``fileRowNum`` of the ``"feats"`` matrix
          stored in ``<mutpredDir>/variants.faa.out.feats_<featFileNum>``.
        * ``"mutpredScore"``: the edge's ``"MutPred2 score"`` as a 1-column row.
        * ``"deepFRI"``: concatenated ``featModel`` outputs for both endpoint
          proteins. This branch relies on module-level ``predictor``,
          ``featModel`` and ``CMT`` objects that are not defined in this
          function; they must exist in the calling session.
    groupFilter : callable or None
        If given, called with the list of edge triples of a group; the group
        is skipped when it returns a falsy value.
    mutpredDir : str
        Directory holding the MutPred2 feature ``.mat`` files.
    threshold : number
        Minimum ``_wt - _mt`` difference that marks a variant as positive.

    Returns
    -------
    X : numpy.ndarray
        Feature rows stacked along axis 0, aligned with ``y``.
    y : numpy.ndarray
        Float labels (0.0 / 1.0), one per row of ``X``.

    Raises
    ------
    ValueError
        If ``FEATURE_SET`` is not recognised, or no group yields features.
    """
    if FEATURE_SET not in _FEATURE_SETS:
        raise ValueError(f"Unknown FEATURE_SET {FEATURE_SET!r}; expected one of {_FEATURE_SETS}")

    # groupby only merges adjacent items, so sort by the same key first.
    variantKey = lambda t: t[2]["db_mut_id_mt"]
    groupedEdges = [list(g) for _, g in itertools.groupby(sorted(G.edges(data=True),
                                                                 key=variantKey),
                                                          key=variantKey)]
    featsCache = {}  # feature-file number -> loaded "feats" matrix
    X = []
    y = []
    for edgeGroup in groupedEdges:
        if groupFilter is not None and not groupFilter(edgeGroup):
            continue

        ensg_i,ensg_j,edge = edgeGroup[0]
        if FEATURE_SET == "deepFRI":
            node_i = G.nodes[ensg_i]
            node_j = G.nodes[ensg_j]
            if not (len(node_i["alphafoldStructures"]) and len(node_j["alphafoldStructures"])):
                # No structure for one endpoint -> no features. Skip the whole
                # sample (label included) so X and y stay row-aligned.
                continue
            Ai,Si,seq_i = predictor._load_cmap(node_i["alphafoldStructures"][0],
                                               cmap_thresh=CMT)
            Aj,Sj,seq_j = predictor._load_cmap(node_j["alphafoldStructures"][0],
                                               cmap_thresh=CMT)
            Xi = featModel([Ai,Si],training=False)
            Xj = featModel([Aj,Sj],training=False)
            row = np.concatenate((Xi,Xj),axis=-1)
        elif FEATURE_SET == "mutpredFeatures":
            fnum = str(int(edge["featFileNum"]))
            if fnum not in featsCache:
                # Import the submodule explicitly: a bare `import scipy` does
                # not guarantee that `scipy.io` is loaded.
                from scipy.io import loadmat
                pth = f"{mutpredDir}/variants.faa.out.feats_{fnum}"
                # Each file is read once per call instead of once per variant.
                featsCache[fnum] = loadmat(pth)["feats"]
            row = np.array(featsCache[fnum][int(edge["fileRowNum"])]).reshape((1,-1))
        else:  # "mutpredScore"
            row = np.array([edge["MutPred2 score"]]).reshape((1,-1))

        X.append(row)
        y.append(any(_edgeExceedsThreshold(e, threshold) for (_, _, e) in edgeGroup))

    if not X:
        raise ValueError("makeFeats produced no samples: the graph has no edges, "
                         "groupFilter rejected every group, or no features were available")
    return np.concatenate(X),np.array(y).astype(float)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One sample per variant. Pass groupFilter=lambda l: len(l) == 1 to keep only
# variants whose group contains a single edge.
XTrain, yTrain = makeFeats(edgotype_train, "mutpredFeatures")
XVal, yVal = makeFeats(edgotype_val, "mutpredFeatures")

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler()
XTr = scaler.fit(XTrain).transform(XTrain)
XV = scaler.transform(XVal)

# Project both splits onto the 16 leading principal components of the
# training data.
pca = PCA(n_components=16)
XTr = pca.fit_transform(XTr)
XV = pca.transform(XV)

In [None]:
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# MODEL = "nn"
MODEL = "rf"
# MODEL = "lr"

In [None]:
# Fit the classifier selected by MODEL on the PCA-reduced training features.
if MODEL == "rf":
    clf = RandomForestClassifier(n_jobs=16)
    clf.fit(XTr,yTrain)
elif MODEL == "lr":
    clf = LogisticRegression(max_iter=1000)
    clf.fit(XTr,yTrain)
elif MODEL == "nn":
    # Three Dense -> BatchNorm -> ReLU blocks followed by a single-logit head.
    # Each block needs its own layer objects: repeating one tuple of layers
    # would put the *same* instances into the model several times (shared
    # weights / duplicate layer names in the Sequential).
    hiddenLayers = []
    for _ in range(3):
        hiddenLayers.extend((tf.keras.layers.Dense(64),
                             tf.keras.layers.BatchNormalization(),
                             tf.keras.layers.ReLU()))
    clf = tf.keras.Sequential(hiddenLayers + [tf.keras.layers.Dense(1,activation=None)])

    clf.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
               metrics=[tf.keras.metrics.AUC(from_logits=True)])

    # Validate on XV (scaled + PCA-projected), the same representation as XTr;
    # the raw XVal has a different feature space.
    clf.fit(XTr,yTrain,
            validation_data=(XV,yVal),
            batch_size=32,epochs=100)
else:
    raise ValueError("Invalid Model")

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Positive-class scores for the validation split.
if hasattr(clf, "predict_proba"):
    # scikit-learn classifiers: column 1 is the probability of the positive class.
    valPreds = clf.predict_proba(XV)[:,1]
else:
    # The Keras model (MODEL == "nn") has no predict_proba and outputs one
    # logit per sample; squash it to a probability and flatten to 1-D.
    valPreds = tf.sigmoid(clf.predict(XV)).numpy().ravel()

In [None]:
roc_auc_score(yVal, valPreds)

In [None]:
# Node identifiers present in each split's graph.
trainEnsemblIDs = set(edgotype_train.nodes())
valEnsemblIDs = set(edgotype_val.nodes())
testEnsemblIDs = set(edgotype_test.nodes())

In [None]:
def _rowsWithinSplit(ensemblIDs):
    """Rows of df whose db_ and ad_ Ensembl gene IDs are both in ensemblIDs."""
    dbInSplit = df.db_ensembl_gene_id_mt.isin(ensemblIDs)
    adInSplit = df.ad_ensembl_gene_id_mt.isin(ensemblIDs)
    return df[dbInSplit & adInSplit]


dfTrain = _rowsWithinSplit(trainEnsemblIDs)

dfVal = _rowsWithinSplit(valEnsemblIDs)

dfTest = _rowsWithinSplit(testEnsemblIDs)

In [None]:
def makeVariantTable(df):
    """Collapse an edge-level table to one row per (gene, variant).

    Rows are grouped by ``db_ensembl_gene_id_mt`` and ``db_mut_id_mt``; the
    result is indexed by that pair and has the columns, in this order:

    * ``y`` -- True if any row of the group has a truthy ``y``.
    * ``MutPred2 score`` -- maximum score within the group.
    * ``n_edges`` -- number of non-null ``ad_orf_id`` values in the group.
    * ``clinical_significance_mt`` -- the value on the group's first row.

    A single ``groupby(...).agg`` replaces four separate groupbys joined by
    three merges, and the string aggregator ``"any"`` is used instead of
    passing ``np.any`` (which recent pandas versions warn about).
    """
    groupKeys = ["db_ensembl_gene_id_mt", "db_mut_id_mt"]
    vt = df.groupby(groupKeys).agg({
        "y": "any",
        "MutPred2 score": "max",
        "ad_orf_id": "count",
        # First value in row order (NaN included), i.e. `unique()[0]`.
        "clinical_significance_mt": lambda s: s.iloc[0] if len(s) else None,
    })
    return vt.rename(columns={"ad_orf_id": "n_edges"})

In [None]:
variantTableTrain = makeVariantTable(dfTrain)

In [None]:
variantTableTrain

In [None]:
variantTableTrain[variantTableTrain["clinical_significance_mt"] == "Pathogenic"].y.value_counts()

In [None]:
variantTableTrain[variantTableTrain.y].clinical_significance_mt.value_counts()

In [None]:
variantTableVal = makeVariantTable(dfVal)

In [None]:
variantTableVal

In [None]:
variantTableTest = makeVariantTable(dfTest)

In [None]:
variantTableTest

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
table = pd.concat((variantTableTrain,variantTableVal,variantTableTest))

In [None]:
table

# Disease-Causing Mutations Association with PPI Loss

In [None]:
table[table["clinical_significance_mt"] == "Pathogenic"].y.value_counts(normalize=True)

In [None]:
table[(table["clinical_significance_mt"] == "Pathogenic") & (table.n_edges > 1)].y.value_counts(normalize=True)

In [None]:
table[(table["clinical_significance_mt"] == "Pathogenic") & (table.n_edges == 1)].y.value_counts(normalize=True)

# Clinical Annotations of Interaction Perturbing Variants

In [None]:
table[table.y].clinical_significance_mt.value_counts(normalize=True)

# Overall Performance

In [None]:
table.sort_values(by="MutPred2 score",ascending=False)

## All variants

In [None]:
roc_auc_score(table.y, table["MutPred2 score"])

In [None]:
# Overlaid density histograms of MutPred2 scores, split by the label y.
scoresPositive = table.loc[table.y, "MutPred2 score"].values
scoresNegative = table.loc[~table.y, "MutPred2 score"].values
plt.hist(scoresPositive, label="Edgetic/Quasi-null", density=True)
plt.hist(scoresNegative, color="red", alpha=.5, label="Quasi-WT", density=True)
plt.legend()
plt.xlabel("MutPred2 Score")
plt.ylabel("Density")

## Variants on WT proteins with a single partner

In [None]:
# AUC of the MutPred2 score restricted to variants with exactly one edge.
singleEdgeTable = table[table.n_edges == 1]
roc_auc_score(singleEdgeTable.y, singleEdgeTable["MutPred2 score"])

In [None]:
# Same overlay as for all variants, restricted to variants with exactly one edge.
hasOneEdge = table.n_edges == 1
scoresPositiveSingle = table.loc[table.y & hasOneEdge, "MutPred2 score"].values
scoresNegativeSingle = table.loc[~table.y & hasOneEdge, "MutPred2 score"].values
plt.hist(scoresPositiveSingle, label="Edgetic/Quasi-null", density=True)
plt.hist(scoresNegativeSingle, color="red", alpha=.5, label="Quasi-WT", density=True)
plt.legend()
plt.xlabel("MutPred2 Score")
plt.ylabel("Density")

## Variants on WT proteins with multiple partners

In [None]:
# AUC of the MutPred2 score restricted to variants with more than one edge.
multiEdgeTable = table[table.n_edges > 1]
roc_auc_score(multiEdgeTable.y, multiEdgeTable["MutPred2 score"])

In [None]:
# Same overlay as for all variants, restricted to variants with more than one edge.
hasManyEdges = table.n_edges > 1
scoresPositiveMulti = table.loc[table.y & hasManyEdges, "MutPred2 score"].values
scoresNegativeMulti = table.loc[~table.y & hasManyEdges, "MutPred2 score"].values
plt.hist(scoresPositiveMulti, label="Edgetic/Quasi-null", density=True)
plt.hist(scoresNegativeMulti, color="red", alpha=.5, label="Quasi-WT", density=True)
plt.legend()
plt.xlabel("MutPred2 Score")
plt.ylabel("Density")

# Performance per Split

## All variants

### Train

In [None]:
roc_auc_score(variantTableTrain.y, 
              variantTableTrain["MutPred2 score"])

### Val

In [None]:
roc_auc_score(variantTableVal.y, 
              variantTableVal["MutPred2 score"])

### Test

In [None]:
roc_auc_score(variantTableTest.y, 
              variantTableTest["MutPred2 score"])

## Only variants on WT nodes with single partner

### Train

In [None]:
# Train split, variants with exactly one edge.
trainSingleEdge = variantTableTrain[variantTableTrain.n_edges == 1]
roc_auc_score(trainSingleEdge.y, trainSingleEdge["MutPred2 score"])

### Val

In [None]:
# Validation split, variants with exactly one edge.
valSingleEdge = variantTableVal[variantTableVal.n_edges == 1]
roc_auc_score(valSingleEdge.y, valSingleEdge["MutPred2 score"])

### Test

In [None]:
# Test split, variants with exactly one edge.
testSingleEdge = variantTableTest[variantTableTest.n_edges == 1]
roc_auc_score(testSingleEdge.y, testSingleEdge["MutPred2 score"])

## Only variants on WT nodes with multiple partners

### Train

In [None]:
# Train split, variants with more than one edge.
trainMultiEdge = variantTableTrain[variantTableTrain.n_edges > 1]
roc_auc_score(trainMultiEdge.y, trainMultiEdge["MutPred2 score"])

### Val

In [None]:
# Validation split, variants with more than one edge.
valMultiEdge = variantTableVal[variantTableVal.n_edges > 1]
roc_auc_score(valMultiEdge.y, valMultiEdge["MutPred2 score"])

### Test

In [None]:
# Test split, variants with more than one edge.
testMultiEdge = variantTableTest[variantTableTest.n_edges > 1]
roc_auc_score(testMultiEdge.y, testMultiEdge["MutPred2 score"])

In [None]:
next(iter(edgotype_train.edges(data=True)))