In [None]:
import pickle
import numpy as np
import pandas as pd

In [None]:
# Load the precomputed embeddings. Later cells index this object by dataframe row label and
# read element [0] of each entry as that row's representation matrix.
# NOTE(security): pickle.load can execute arbitrary code -- only load this file from a trusted source.
# The context manager closes the file handle, which the bare open(...) call leaked.
with open("/data/projects/processBio/clinvar/clinvar/parseClinvarRepresentations.pkl", "rb") as embeddingFile:
    embeddings = pickle.load(embeddingFile)

In [None]:
# Load the variant table; later cells read its `variant`, `variantSeq`, `label` and `GeneID_x` columns.
df = pd.read_csv("/data/projects/processBio/clinvar/clinvar/parseClinvarWithVariant.csv")

In [None]:
# export utilities
def getRep(rep,L,originalWindow=510, W=3):
    """
    Slice the rows of an embedding that surround a variant.

    The embedding `rep` is assumed to cover only a subset of the original sequence: the
    subset that starts `originalWindow` positions before the variant, or at position 0
    when the variant lies closer than that to the start. The variant's index is shifted
    into the coordinates of `rep`, and up to W rows on each side of it are returned
    together with the variant row itself.

    Arguments:
    - rep : embedding of the sequence subset; anything that supports len() and slicing
    - L : index of the variant in the ORIGINAL sequence
    - originalWindow : number of positions kept on each side of the variant when the
      embedded subset was cut out of the original sequence
    - W : number of rows to keep on each side of the variant

    Returns:
    A slice of rep with at most 2*W + 1 rows. The slice is clipped at both ends of rep,
    so it is shorter near the boundaries and may be empty if the shifted window starts
    at or past the end of rep.

    Example (originalWindow=2, W=1):

    Original sequence : 0 1 2 3 4 5 6
    Variant           :       L          (L = 3)
    rep               :  [1 2 3 4 5]     (starts at original position 1)
    Returned          :    [2 3 4]
    """
    # Position in the original sequence at which the embedded subset begins.
    subsetStart = max(0, L - originalWindow)
    # Index of the variant expressed in rep's own coordinates.
    center = L - subsetStart
    # Clip the requested window to the rows that actually exist in rep.
    lo = max(0, center - W)
    hi = min(len(rep), center + W + 1)
    return rep[lo:hi]

# Testing getRep

In [None]:
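Below, the bracketed values are the subset passed to the model (I to J), L marks the variant, and K to M marks the window that getRep should return: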

0[1 2 3 4 5]6
  I   L   J
    K   M

In [None]:
# NOTE(review): scratch array; no later cell visible here reads this value before `xi` is reassigned -- confirm it can be dropped.
xi = np.array([1,2,3,4])

In [None]:
# Embedded subset [1..5] starts at sequence offset 1; variant at 3 with W=1 -> array([2, 3, 4]).
getRep(np.arange(1,6), 3, W=1,originalWindow=2)

0 1 2 3 4 5 6
I   L     J
  K   M  

In [None]:
# The variant maps to local index 2, so W=2 spans the whole 5-element input -> array([0, 1, 2, 3, 4]).
getRep(np.arange(0,5), 3, W=2,originalWindow=2)

0[1 2 3 4 5 6]
  I     L   J
    K       M

In [None]:
# Embedded subset [1..6] starts at sequence offset 1; variant at 4 with W=2, clipped at the right edge -> array([2, 3, 4, 5, 6]).
getRep(np.arange(1,7),4, originalWindow=3,W=2)

In [None]:
# export utilities


def prepSeq(representationMatrix, locationOfVariant, originalWindowSize=510, windowSizes=2**np.arange(1,9)):
    """
    Build one feature vector by mean-pooling nested windows around a variant.

    For each entry w of windowSizes, getRep selects the rows of representationMatrix
    within w positions of the variant; those rows are averaged over axis 0. The
    per-window means are concatenated in the order given by windowSizes.

    Arguments:
    - representationMatrix : embedding matrix handed to getRep as `rep`
    - locationOfVariant : index of the variant, handed to getRep as `L`
    - originalWindowSize : forwarded to getRep as `originalWindow`
    - windowSizes : iterable of window half-widths, each forwarded to getRep as `W`
      (default: 2, 4, ..., 256)

    Returns:
    The concatenation of one mean vector per window size.
    """
    return np.concatenate([
        getRep(representationMatrix, locationOfVariant, originalWindow=originalWindowSize, W=halfWidth).mean(0)
        for halfWidth in windowSizes
    ])

In [None]:

# def prepSeq(idx,dfi=df,windowSizes=[2,4,8,16,32,64,128,256]):
#     rep = embeddings[idx][0]
#     L = int(dfi.loc[idx,"variant"][3:-3]) - 1
#     reps = []
#     for w in windowSizes:
#         r = getRep(rep,L, originalWindow=510,W=w).mean(0)
#         if np.any(np.isnan(r)):
#             print(w,L, len(rep))
#         reps.append(r)
#     xi = np.concatenate(reps)
#     return xi

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Spot-check one row: the length of its sequence next to its variant string.
len(df.loc[5290].variantSeq), df.loc[5290].variant

In [None]:
# Feature vectors and their labels, accumulated in matching order.
Xs, Ys = [], []
for idx in tqdm(df.index,total=df.shape[0]):
    row = df.loc[idx]
    # Drop the first and last three characters of the variant string and read the rest as
    # an integer position; the -1 presumably converts a 1-based position to a 0-based index.
    L = int(row.variant[3:-3]) - 1
    repsForRow = embeddings[idx]
    # Skip rows with no embedding entry, an empty representation, or a position past the sequence end.
    if not (len(repsForRow) > 0 and len(repsForRow[0]) >= 1 and L < len(row.variantSeq)):
        continue

    xi = prepSeq(repsForRow[0], L)
    if np.any(np.isnan(xi)):
        # NaN features are reported, but the row is still kept.
        print(idx)
    Xs.append(xi)
    Ys.append(df.loc[idx,"label"])

In [None]:
# Stack the per-variant feature vectors collected in Xs into a single array.
X = np.array(Xs)

In [None]:
# Sanity check: dimensions of the feature array.
X.shape

In [None]:
# True if any feature value is NaN.
np.any(np.isnan(X))

In [None]:
# Number of collected labels; Xs and Ys are appended together, so this should equal the row count of X.
len(Ys)

In [None]:
# Persist features and labels so later cells can reload them without recomputing.
# np.save converts the Ys list to an array before writing.
np.save("/data/projects/processBio/clinvar/clinvar/X.npy",X)
np.save("/data/projects/processBio/clinvar/clinvar/y.npy",Ys)

In [None]:
# Reload the saved arrays; from here on the labels are referred to as `y` rather than the `Ys` list.
X = np.load("/data/projects/processBio/clinvar/clinvar/X.npy",)
y = np.load("/data/projects/processBio/clinvar/clinvar/y.npy")

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline classifier with scikit-learn's default settings; used for the random split.
lr = LogisticRegression()

In [None]:
from sklearn.model_selection import train_test_split

# Random Train/Val Split

In [None]:
# Hold out 20% of the variants at random. A fixed random_state makes the split, and therefore
# the reported AUC, reproducible across kernel restarts.
xtrain,xval,ytrain,yval = train_test_split(X,y,test_size=.2,random_state=0)

In [None]:
# Fit the baseline on the training portion of the random split.
lr.fit(xtrain,ytrain)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Validation ROC AUC, scored with column 1 of predict_proba.
roc_auc_score(yval, lr.predict_proba(xval)[:,1])

# Per-Gene Train/Val Split

In [None]:
# Gene ID for every row that contributed a feature vector to X. The filter repeats the
# conditions of the feature-building loop so that geneIDs lines up one-to-one with the rows
# of X; filtering on non-empty embeddings alone can keep rows that the loop skipped.
keptRows = [
    idx for idx in df.index
    if len(embeddings[idx]) > 0
    and len(embeddings[idx][0]) >= 1
    and int(df.loc[idx, "variant"][3:-3]) - 1 < len(df.loc[idx, "variantSeq"])
]
geneIDs = df.loc[keptRows].GeneID_x
# Fail loudly here rather than with a confusing mask-shape error further down.
assert len(geneIDs) == len(X), "geneIDs must align one-to-one with the rows of X"

In [None]:
# Split at the gene level so that no gene contributes variants to both partitions.
# unique() keeps order of first appearance (set iteration order is not guaranteed stable),
# and the fixed random_state makes the gene assignment reproducible.
trainGeneIDs, valGeneIDs = train_test_split(list(geneIDs.unique()),test_size=.2,random_state=0)

In [None]:
# One boolean per row of geneIDs: True when the row's gene was assigned to the training genes.
# Membership is tested against a set, so each lookup is O(1) instead of a scan of the gene list.
trainGeneSet = set(trainGeneIDs)
isTrain = [geneID in trainGeneSet for geneID in geneIDs]

In [None]:
# Turn the per-row flags into a boolean mask once, then carve features and labels with it.
trainMask = np.array(isTrain)
xTrain, yTrain = X[trainMask], y[trainMask]
# The complement of the mask selects the validation genes' variants.
xVal, yVal = X[~trainMask], y[~trainMask]

In [None]:
# Sizes of the gene-wise train and validation partitions.
xTrain.shape, xVal.shape

In [None]:
# Total rows across both partitions; the masks are complementary, so this should equal X.shape[0].
xTrain.shape[0] + xVal.shape[0]

In [None]:
# Separate classifier instance for the gene-wise split, so it shares no fitted state with `lr`.
lr2 = LogisticRegression()

In [None]:
# Fit on variants from the training genes only.
lr2.fit(xTrain,yTrain)

In [None]:
# Score the model fit on the training genes (lr2). Using `lr` here would evaluate the
# random-split model, which was fit on rows drawn from all genes, including the validation genes.
roc_auc_score(yVal, lr2.predict_proba(xVal)[:,1])