In [None]:
import os
import re

import numpy as np
import requests
import torch
from tqdm.auto import tqdm
from transformers import AlbertModel, AlbertTokenizer

In [None]:
# Tokenizer for the "Rostlab/prot_albert" checkpoint; lower-casing of the input is switched off.
tokenizer = AlbertTokenizer.from_pretrained("Rostlab/prot_albert", do_lower_case=False)

In [None]:
# Pretrained ALBERT encoder from the same checkpoint as the tokenizer.
model = AlbertModel.from_pretrained("Rostlab/prot_albert")

In [None]:
# Pick the compute device. GPU index 2 is preferred (the original choice), but
# only when the machine actually exposes at least three GPUs; otherwise fall
# back to the first GPU, or to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# Move the encoder to the selected device and switch it to inference mode.
model = model.to(device).eval()

In [None]:
import networkx as nx
# Load the edgotyping graph; later cells read a "seq" attribute from its nodes
# and substitution / growth-score attributes from its edges.
# NOTE(review): the extension is spelled "gefx" rather than the usual "gexf" —
# confirm it matches the file name on disk.
edgotype = nx.read_gexf("data/y2hEdgotyping/edgotype.gefx")

In [None]:
import os
import pandas as pd
# Read the six tab-separated UniProt scan chunks and combine them into one table.
seqFiles = [
    pd.read_csv(f"data/y2hEdgotyping/uniprotScan/sequence_{chunk}.tsv", sep="\t")
    for chunk in range(6)
]

uniprotMatches = pd.concat(seqFiles)
def mergeWithUniprot(graph, matches=None,
                     structure_dir="/data/dzeiberg/alphafold/predictions"):
    """Annotate every node of ``graph`` with its UniProt hits and AlphaFold files.

    For each node, the rows of ``matches`` whose ``Sequence`` equals the node's
    ``"seq"`` attribute, whose ``Reviewed`` column is ``"reviewed"`` and whose
    ``Organism`` is ``"Homo sapiens (Human)"`` are stored under the node
    attribute ``"uniprotMatches"``.  The paths of the files
    ``AF-<Entry>-F1-model_v4.pdb.gz`` that exist in ``structure_dir`` are
    stored, in row order, under ``"alphafoldStructures"``.

    Parameters
    ----------
    graph : graph object exposing ``nodes(data=True)`` and ``nodes[node_id]``
        Modified in place.
    matches : pandas.DataFrame, optional
        Table with ``Entry``, ``Sequence``, ``Reviewed`` and ``Organism``
        columns.  Defaults to the notebook-level ``uniprotMatches`` table.
    structure_dir : str, optional
        Directory searched for AlphaFold prediction files.

    Returns
    -------
    The same ``graph`` object that was passed in.
    """
    if matches is None:
        matches = uniprotMatches
    # The reviewed/human restriction does not depend on the node, so apply it
    # once instead of re-scanning the whole table for every node.
    candidates = matches[(matches.Reviewed == "reviewed") &
                         (matches.Organism == "Homo sapiens (Human)")]
    for node_id, attrs in graph.nodes(data=True):
        up = candidates[candidates.Sequence == attrs["seq"]]
        graph.nodes[node_id]["uniprotMatches"] = up
        alphafoldStructures = []
        for uniprot_id in up["Entry"]:
            fp = os.path.join(structure_dir, f"AF-{uniprot_id}-F1-model_v4.pdb.gz")
            if os.path.isfile(fp):
                alphafoldStructures.append(fp)
        graph.nodes[node_id]["alphafoldStructures"] = alphafoldStructures
    return graph
# mergeWithUniprot annotates the graph in place and returns it, so edgotype_x
# and edgotype refer to the same object.
edgotype_x = mergeWithUniprot(edgotype)

In [None]:
def makeMut(seq,mut):
    """Return ``seq`` with one residue replaced according to ``mut``.

    ``mut`` is a substitution string such as ``"A123G"``: its first character
    is ignored, the digits in between are a 1-based position, and its last
    character is the replacement residue.  The first character is not compared
    against ``seq``.

    Raises
    ------
    ValueError
        If the position is not an integer or lies outside ``seq``.  Without
        this check an out-of-range position would silently yield a sequence
        that is not the requested mutant.
    """
    loc, var = int(mut[1:-1]) - 1, mut[-1]
    if not 0 <= loc < len(seq):
        raise ValueError(
            f"substitution {mut!r} is outside the sequence (length {len(seq)})")
    return seq[:loc] + var + seq[loc+1:]

In [None]:
# One record per edge: the mutated sequence of the edge's "db" gene (residues
# separated by single spaces), that gene's id, and the substitution applied.
node_attrs = edgotype_x.nodes(data=True)
edge_records = []
for _u, _v, e in edgotype_x.edges(data=True):
    mutant = makeMut(node_attrs[e["db_ensembl_gene_id_mt"]]["seq"], e["Substitution"])
    edge_records.append((" ".join(mutant), e["db_ensembl_gene_id_mt"], e["Substitution"]))
sequences, ensg_ids, substitutions = zip(*edge_records)

In [None]:
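# Wild-type variant of the cell above: embeds each node's unmutated sequence instead of the per-edge mutants.
# NOTE(review): the training cells below read "{ensg}.pt" wild-type files, which no active cell writes;
# presumably they came from a run of this variant with a matching save path — confirm.
# sequences,ensg_ids = list(zip(*[(" ".join(list(n["seq"])),ensg) for ensg,n in edgotype_x.nodes(data=True)]))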

In [None]:
# Replace the residue codes U, Z, O and B with X in every sequence.
sequences = list(map(lambda spaced: re.sub(r"[UZOB]", "X", spaced), sequences))

In [None]:
# Number of sequences to embed (one per edge of the graph).
len(sequences)

In [None]:
# Number of substitutions; built per edge alongside `sequences`, so the two
# lengths should agree.
len(substitutions)

In [None]:
# Tokenize all sequences in one call, adding the special tokens and padding
# every sequence to the length of the longest one. The result is indexed by
# 'input_ids' and 'attention_mask' in the next cell.
ids = tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding='longest')

In [None]:
# Turn the token ids and the attention mask into tensors on the compute device.
input_ids, attention_mask = (
    torch.tensor(ids[field]).to(device) for field in ('input_ids', 'attention_mask')
)

In [None]:
import torch.utils.data as data_utils

# Pair each row of token ids with its attention mask.
ds = data_utils.TensorDataset(input_ids,attention_mask)
# shuffle=False keeps the batches in input order, which the later cell that
# zips ensg_ids / substitutions with the extracted features relies on.
loader = data_utils.DataLoader(ds, batch_size=8, shuffle=False)

In [None]:
# Number of sequences in the dataset.
len(ds)

In [None]:
# Number of batches the loader yields (batch_size=8).
len(loader)

In [None]:
# Run the encoder batch by batch without tracking gradients and keep the first
# model output of every batch as a NumPy array on the host.
embeddings = []
with torch.no_grad():
    for batch_ids, batch_mask in tqdm(loader, total=len(loader)):
        hidden = model(input_ids=batch_ids, attention_mask=batch_mask)[0]
        embeddings.append(hidden.cpu().numpy())

In [None]:
# Join the per-batch arrays along their first axis so that row k holds the
# output for sequence k.
embeddingsMat = np.concatenate(embeddings)

In [None]:
# Trim every padded embedding to its real residues: keep positions
# 1 .. n_tokens-2, i.e. drop the first and last unmasked position (the special
# tokens added by the tokenizer) and all padding.
# The unmasked lengths are computed once and converted to plain Python ints, so
# the loop neither launches one device reduction per sequence nor slices a
# NumPy array with a (possibly CUDA) tensor.
seq_lens = (attention_mask == 1).sum(dim=1).tolist()
features = [seq_emb[1:n_tokens - 1] for seq_emb, n_tokens in zip(embeddingsMat, seq_lens)]

In [None]:
# Write one file per (gene, substitution) pair, named "<ensg>_<substitution>.pt".
# The output directory is created when missing so a fresh machine does not fail
# on the first save, and progress is shown with a bar instead of one printed
# path per file.
embedding_dir = "/data/dzeiberg/ppi/y2hEdgotyping/protAlbertEmbeddings"
os.makedirs(embedding_dir, exist_ok=True)
for ensg,sub_i,f in tqdm(zip(ensg_ids,substitutions,features), total=len(features)):
    fp = f"{embedding_dir}/{ensg}_{sub_i}.pt"
    torch.save(f,fp)

In [None]:
def calc_score(e):
    """Sum, over the five growth media, of the wild-type minus mutant value.

    ``e`` is a mapping holding a ``<medium>wt`` and a ``<medium>mt`` entry for
    each of the media prefixes listed below.
    """
    media = ("LWH1_f_", "LWH10_f_", "LWH25_f_", "LWA_f_", "LWAH1_f_")
    return sum(e[prefix + "wt"] - e[prefix + "mt"] for prefix in media)

def calc_label(e):
    """Tell whether the mutant falls at least 2 below wild type on any medium.

    The media are checked in the order listed and checking stops at the first
    medium that satisfies ``wt - 2 >= mt``.
    """
    media = ("LWH1_f_", "LWH10_f_", "LWH25_f_", "LWA_f_", "LWAH1_f_")
    return any(e[prefix + "wt"] - 2 >= e[prefix + "mt"] for prefix in media)

In [None]:
import os

In [None]:
# For every edge, pair the mutant embedding of the "db" gene with the embedding
# file of the "ad" gene. Edges with both files on disk contribute a file pair,
# a score and a label; the others are reported and left out.
files, scores, labels = [], [], []
for u, v, edge in edgotype_x.edges(data=True):
    db = edge["db_ensembl_gene_id_mt"]
    mut = edge["Substitution"]
    ad = edge["ad_ensembl_gene_id_mt"]
    fi = f"/data/dzeiberg/ppi/y2hEdgotyping/protAlbertEmbeddings/{db}_{mut}.pt"
    fj = f"/data/dzeiberg/ppi/y2hEdgotyping/protAlbertEmbeddings/{ad}.pt"
    have_fi, have_fj = os.path.isfile(fi), os.path.isfile(fj)
    if not (have_fi and have_fj):
        print(fi,have_fi,fj,have_fj)
        continue
    files.append((fi,fj))
    scores.append(calc_score(edge))
    labels.append(calc_label(edge))

In [None]:
def loadInst(fi,fj):
    """Build one feature vector for a pair of saved embeddings.

    Each file is loaded, averaged over its first axis, and the two averages
    are multiplied element-wise.

    Parameters
    ----------
    fi, fj : str
        Paths of two files written with ``torch.save``.

    Returns
    -------
    The element-wise product of the two averaged embeddings.
    """
    # The embedding files hold pickled NumPy arrays (torch.save is called on
    # array slices when they are written). Since PyTorch 2.6, torch.load
    # defaults to weights_only=True and rejects such files, so the full
    # unpickler is requested explicitly.
    # SECURITY: this executes pickle; only load files produced by this notebook.
    Xi = torch.load(fi, weights_only=False).mean(0)
    Xj = torch.load(fj, weights_only=False).mean(0)
    return np.multiply(Xi,Xj)

In [None]:
# One feature vector per (mutant, partner) file pair, in the order of `files`.
X = [loadInst(fi, fj) for fi, fj in tqdm(files)]

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, LinearRegression,LogisticRegression
from sklearn.svm import SVC

In [None]:
# Stack the per-pair feature vectors into one matrix, one row per entry of `files`.
XMat = np.stack(X)

In [None]:
# Shape of the feature matrix.
XMat.shape

In [None]:
# Convert the label list to an array so it can be indexed with the fold index
# arrays in the cross-validation cell.
labels = np.array(labels)

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Cross-validated logistic regression on the pair features.
# The figure has one column per fold and four rows of predicted-probability
# histograms: train positives, train negatives, validation positives and
# validation negatives.
# The subplot grid and KFold share n_folds so the two cannot drift apart.
n_folds = 5
fig,ax = plt.subplots(4,n_folds,figsize=(12,8),sharex=True)
valIndices = []
valPreds = []
aucs = []
for i,(trainInds, valInds) in enumerate(KFold(n_splits=n_folds).split(XMat,labels)):
    XTrain = XMat[trainInds]
    yTrain = labels[trainInds]
    # Drop rows whose target is NaN.
    tmask = ~np.isnan(yTrain)
    XTrain,yTrain = XTrain[tmask],yTrain[tmask]
    XVal = XMat[valInds]
    yVal = labels[valInds]
    vmask = ~np.isnan(yVal)
    XVal,yVal = XVal[vmask],yVal[vmask]
    # Bound to `clf`, not `model`, so the ALBERT encoder loaded at the top of
    # the notebook stays available and the embedding cells can be re-run.
    # (Ridge and SVC were tried here as alternatives.)
    clf = LogisticRegression(C=.1,max_iter=1000)
    clf.fit(XTrain,yTrain)
    yHat = clf.predict_proba(XVal)[:,1]
    aucs.append(roc_auc_score(yVal.ravel(),yHat.ravel()))
    print(aucs[-1])
    # Store the indices that survived the NaN mask so they line up with yHat.
    valIndices.append(valInds[vmask])
    valPreds.append(yHat)
    yTHat = clf.predict_proba(XTrain)[:,1]
    # Boolean-mask indexing: the labels select the positive / negative rows.
    ax[0,i].hist(yTHat[yTrain])
    ax[1,i].hist(yTHat[~yTrain])
    ax[2,i].hist(yHat[yVal])
    ax[3,i].hist(yHat[~yVal])
    ax[0,i].set_title(f"fold {i}")
for row,row_name in enumerate(["train +","train -","val +","val -"]):
    ax[row,0].set_ylabel(row_name)
print(np.mean(aucs))