In [None]:
import os
# Pin GPU enumeration to PCI bus order and expose only devices 1, 2 and 3.
# This runs before TensorFlow is imported in the following cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1,2,3",
})

In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import tensorflow as tf
from sklearn.metrics import roc_auc_score

In [None]:
# Display the GPUs TensorFlow can see (restricted by CUDA_VISIBLE_DEVICES set in the first cell).
tf.config.list_physical_devices('GPU')

In [None]:
# Display the TensorFlow version in use, for provenance.
tf.__version__

In [None]:
# Make the local DeepFRI checkout importable (needed for `deepfrier`).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
DEEPFRI_ROOT = "/home/dzeiberg/DeepFRI/"
if DEEPFRI_ROOT not in sys.path:
    sys.path.append(DEEPFRI_ROOT)

In [None]:
from deepfrier.Predictor import Predictor

In [None]:
# Root directory of the DeepFRI data; the model config and model paths below are joined onto it.
DATA_DIR = "/data/dzeiberg/DeepFRI/data/"

In [None]:
import os

import json

In [None]:
# Read the DeepFRI trained-model configuration into `params`.
modelConfigPath = os.path.join(DATA_DIR,"trained_models/model_config.json")
with open(modelConfigPath) as configFile:
    params = json.load(configFile)

In [None]:
# Build the DeepFRI GCN predictor from the "bp" model entry of the config.
bpModelPath = os.path.join(DATA_DIR,params["gcn"]["models"]["bp"])
predictor = Predictor(bpModelPath,gcn=True)

In [None]:
# Feature extractor: same inputs as the DeepFRI model, but outputs the
# activations of its fourth-from-last layer instead of the final predictions.
# NOTE(review): which layer index -4 corresponds to depends on the DeepFRI
# architecture — confirm it is the intended embedding layer.
embeddingLayer = predictor.model.layers[-4]
featModel = tf.keras.Model(
    inputs=predictor.model.input,
    outputs=embeddingLayer.output,
)

In [None]:
import networkx as nx

In [None]:
# Load the train / validation / test edgotyping graphs from pickled networkx graphs.
# NOTE(review): nx.read_gpickle is pickle-based — only load files from a trusted source.
# NOTE(review): read_gpickle is reported removed in networkx 3.0; confirm the pinned
# networkx version or switch to pickle.load when upgrading.
edgotype_train = nx.read_gpickle("data/y2hEdgotyping/edgotype_train.gpickle")
edgotype_val = nx.read_gpickle("data/y2hEdgotyping/edgotype_val.gpickle")
edgotype_test = nx.read_gpickle("data/y2hEdgotyping/edgotype_test.gpickle")

In [None]:
def _collectNodeLabels(G, level="LWAH1_f", minDrop=2):
    """Group binary edge labels by node.

    Every node of ``G`` gets a (possibly empty) list. Each edge contributes one
    boolean, ``edge[f"{level}_wt"] - edge[f"{level}_mt"] >= minDrop``, which is
    filed under the smaller of its two endpoint ids.

    Args:
        G: graph exposing ``nodes()`` and ``edges(data=True)``.
        level: prefix of the ``*_wt`` / ``*_mt`` edge attributes to compare.
        minDrop: minimum wild-type minus mutant difference for a positive label.

    Returns:
        dict mapping node id -> list of booleans.
    """
    labels = {n:[] for n in G.nodes()}
    for u,v,edge in G.edges(data=True):
        # Same key the original used: the first endpoint in sorted order.
        id_ = sorted([u,v])[0]
        y = (edge[f"{level}_wt"] - edge[f"{level}_mt"]) >= minDrop
        labels[id_].append(y)
    return labels


# One helper replaces the two copy-pasted loops for the train and validation graphs.
nodeLabels = _collectNodeLabels(edgotype_train)
nodeLabelsVal = _collectNodeLabels(edgotype_val)

**TODO:** make these labels specific to each node pair, instead of pooling them per node across all interaction partners and variants.

In [None]:
# Per node with at least one label: (log10 of the label count, mean label).
trainPoints = [(np.log10(len(labels)),np.mean(labels))
               for labels in nodeLabels.values() if len(labels)]
valPoints = [(np.log10(len(labels)),np.mean(labels))
             for labels in nodeLabelsVal.values() if len(labels)]

# Train points first, validation points overlaid semi-transparently.
plt.scatter(*zip(*trainPoints))
plt.scatter(*zip(*valPoints),alpha=.5)
plt.xlabel(r"$log_{10}(deg(n))$")
plt.ylabel("Edge Prior")

In [None]:
import scipy
from tqdm.notebook import tqdm
def makeFeats(G,FEATURE_SET="mutpredFeatures"):
    """Build a per-edge feature matrix and binary label vector from graph ``G``.

    An edge is labelled positive when the wild-type minus mutant value is >= 2
    at any of the levels LWH1_f, LWH10_f, LWH25_f, LWA_f or LWAH1_f.

    Args:
        G: graph exposing ``nodes[...]``, ``edges(data=True)`` and
            ``number_of_edges()``; edge data must hold the ``*_wt`` / ``*_mt``
            attributes plus whatever the chosen feature set reads.
        FEATURE_SET: one of
            "deepFRI" - concatenated ``featModel`` outputs for the first
                AlphaFold structure of each endpoint; edges where either
                endpoint has no structure are skipped;
            "mutpredFeatures" - one row of the "feats" matrix in the MutPred2
                ``.mat`` file selected by the edge's ``featFileNum`` and
                ``fileRowNum``;
            "mutpredScore" - the scalar "MutPred2 score" edge attribute.

    Returns:
        ``(X, y)``: ``X`` is the per-edge rows concatenated along axis 0 and
        ``y`` is a float array of 0/1 labels, one per row of ``X``. Labels are
        only recorded for edges that produced a feature row, so the two always
        have the same length.

    Raises:
        ValueError: if ``FEATURE_SET`` is not recognised or no edge produced
            any features.
    """
    VALID_FEATURE_SETS = ("deepFRI","mutpredFeatures","mutpredScore")
    if FEATURE_SET not in VALID_FEATURE_SETS:
        raise ValueError(f"Unknown FEATURE_SET {FEATURE_SET!r}; expected one of {VALID_FEATURE_SETS}")

    CMT = 10.0  # contact-map threshold passed to predictor._load_cmap
    LEVELS = ["LWH1_f","LWH10_f","LWH25_f","LWA_f","LWAH1_f"]
    X = []
    y = []
    # Each MutPred2 feature file is read once, not once per edge.
    featsByFile = {}
    for ensg_i,ensg_j,edge in tqdm(G.edges(data=True),
                                   total=G.number_of_edges()):
        if FEATURE_SET == "deepFRI":
            node_i = G.nodes[ensg_i]
            node_j = G.nodes[ensg_j]
            if not (len(node_i["alphafoldStructures"]) and len(node_j["alphafoldStructures"])):
                # No structure for one endpoint: skip the edge entirely so that
                # X and y stay aligned.
                continue
            Ai,Si,seq_i = predictor._load_cmap(node_i["alphafoldStructures"][0],
                                               cmap_thresh=CMT)
            Aj,Sj,seq_j = predictor._load_cmap(node_j["alphafoldStructures"][0],
                                               cmap_thresh=CMT)
            Xi = featModel([Ai,Si],training=False)
            Xj = featModel([Aj,Sj],training=False)
            row = np.concatenate((Xi,Xj),axis=-1)
        elif FEATURE_SET == "mutpredFeatures":
            fnum = str(int(edge["featFileNum"]))
            if fnum not in featsByFile:
                pth = f"/data/dzeiberg/ppi/y2hEdgotyping/mutpred2Results/variants.faa.out.feats_{fnum}"
                featsByFile[fnum] = scipy.io.loadmat(pth)["feats"]
            row = np.array(featsByFile[fnum][int(edge["fileRowNum"])]).reshape((1,-1))
        else:  # "mutpredScore"
            row = np.array([edge["MutPred2 score"]]).reshape((1,-1))

        yij = np.any([(edge[f"{lvl}_wt"] - edge[f"{lvl}_mt"]) >= 2 for lvl in LEVELS])
        X.append(row)
        y.append(yij)

    if not X:
        raise ValueError(f"No edges produced features for FEATURE_SET {FEATURE_SET!r}")
    X = np.concatenate(X)
    return X,np.array(y).astype(float)

In [None]:
# Training features and labels, using only the scalar "MutPred2 score" edge attribute.
XTrain,yTrain = makeFeats(edgotype_train,FEATURE_SET="mutpredScore")

In [None]:
# Sanity check: display the training feature and label array shapes.
XTrain.shape,yTrain.shape

In [None]:
# Validation features and labels, built with the same feature set as the training split.
XVal,yVal = makeFeats(edgotype_val,FEATURE_SET="mutpredScore")

In [None]:
# Sanity check: display the validation feature and label array shapes.
XVal.shape,yVal.shape

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise features with statistics estimated on the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(XTrain)
XTr = scaler.transform(XTrain)
XV = scaler.transform(XVal)

In [None]:
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
import itertools

In [None]:
# With a single feature column, the raw (scaled) feature can be scored directly
# as a ranking baseline: print validation AUC, then training AUC.
hasSingleFeature = XTr.shape[1] == 1
if hasSingleFeature:
    valAuc = roc_auc_score(yVal,XV)
    trainAuc = roc_auc_score(yTrain,XTr)
    print(valAuc,trainAuc)

In [None]:
# Classifier to train in the next cell. Accepted values:
#   "nn" - Keras multilayer perceptron
#   "rf" - scikit-learn RandomForestClassifier
#   "lr" - scikit-learn LogisticRegression
MODEL = "lr"

In [None]:
# Fit the classifier selected by MODEL on the standardised training features.
if MODEL == "rf":
    clf = RandomForestClassifier(n_jobs=16)
    clf.fit(XTr,yTrain)
elif MODEL == "lr":
    clf = LogisticRegression(max_iter=1000)
    clf.fit(XTr,yTrain)
elif MODEL == "nn":
    # Three Dense(64) -> BatchNorm -> ReLU stages followed by a single logit.
    # Each stage gets freshly constructed layers: the previous
    # itertools.repeat(...) form repeated the *same* three layer objects, so
    # the stages were not independent layers.
    hiddenLayers = []
    for _ in range(3):
        hiddenLayers.extend([tf.keras.layers.Dense(64),
                             tf.keras.layers.BatchNormalization(),
                             tf.keras.layers.ReLU()])
    clf = tf.keras.Sequential(hiddenLayers + [tf.keras.layers.Dense(1,activation=None)])

    # The output layer has no activation, so loss and metric both work on logits.
    clf.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
               metrics=[tf.keras.metrics.AUC(from_logits=True)])

    # Validate on the standardised validation features (XV), matching the
    # scaling of the training inputs; the raw XVal was passed here before.
    clf.fit(XTr,yTrain,
            validation_data=(XV,yVal),
            batch_size=32,epochs=100)
else:
    raise ValueError("Invalid Model")

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Positive-class probability for every validation edge.
if MODEL == "nn":
    # The Keras model outputs one logit per row (Dense(1, activation=None)),
    # so squash with a sigmoid and flatten to a 1-D array.
    valPreds = tf.sigmoid(clf.predict(XV)).numpy().ravel()
else:
    # scikit-learn classifiers: column 1 of predict_proba is the positive class.
    valPreds = clf.predict_proba(XV)[:,1]

In [None]:
# Display the validation ROC AUC of the fitted classifier.
roc_auc_score(yVal, valPreds)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of predicted probabilities: positives first (default colour),
# negatives overlaid in semi-transparent red.
isPositive = yVal.astype(bool)
plt.hist(valPreds[isPositive])
plt.hist(valPreds[~isPositive],color="red",alpha=.5)