In [None]:
# Lookup from three-letter amino-acid code to one-letter symbol.
# Only the 20 entries listed here are known; any other code raises KeyError.
aaTable = dict([
    ("Ala", "A"),
    ("Arg", "R"),
    ("Asn", "N"),
    ("Asp", "D"),
    ("Cys", "C"),
    ("Gln", "Q"),
    ("Glu", "E"),
    ("Gly", "G"),
    ("His", "H"),
    ("Ile", "I"),
    ("Leu", "L"),
    ("Lys", "K"),
    ("Met", "M"),
    ("Phe", "F"),
    ("Pro", "P"),
    ("Ser", "S"),
    ("Thr", "T"),
    ("Trp", "W"),
    ("Tyr", "Y"),
    ("Val", "V"),
])

In [None]:
import pandas as pd
import torch
import esm
from tqdm.notebook import tqdm

In [None]:
from processBioDBs.utilities import getSequence

In [None]:
# Load ESM-1b model
# `esm1b_t33_650M_UR50S` returns the pretrained network together with its
# alphabet (tokenizer vocabulary).
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
# The batch converter turns a list of (label, sequence) pairs into the
# (labels, strings, token tensor) triple consumed by getSequenceRepresentation.
batch_converter = alphabet.get_batch_converter()


In [None]:
# Move the network to the second GPU and put it in inference mode.
# `.eval()` follows the ESM usage example: it switches off training-only
# behaviour (e.g. dropout) so repeated runs give identical embeddings.
# Both `.to()` and `.eval()` return the module itself, so chaining is safe.
model = model.to("cuda:1").eval()

In [None]:
def getSequenceRepresentation(Data):
    """Embed a batch of sequences and return per-residue representations.

    Parameters
    ----------
    Data : list of (label, sequence) tuples
        Passed unchanged to the module-level ``batch_converter``.

    Returns
    -------
    list of numpy.ndarray
        One array per input, in input order. Each array holds the layer-33
        representation of that sequence's residues: the leading
        beginning-of-sequence token and everything past ``len(sequence)``
        (end token, padding) are dropped. No averaging over residues is done.

    Notes
    -----
    The forward pass runs on whatever device the module-level ``model``
    currently lives on; results are copied back to host memory.
    """
    _, _, batch_tokens = batch_converter(Data)
    # Follow the model's device instead of hard-coding one, so the function
    # keeps working if the model is moved.
    device = next(model.parameters()).device
    batch_tokens = batch_tokens.to(device)
    with torch.no_grad():
        # Contact prediction is not requested: its output was never used and
        # it forces the model to materialise attention maps for every layer.
        results = model(batch_tokens, repr_layers=[33])
    token_representations = results["representations"][33].cpu()
    # Release the device-side tensors before building the output list.
    del results, batch_tokens
    # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
    return [
        token_representations[i, 1 : len(seq) + 1].numpy()
        for i, (_, seq) in enumerate(Data)
    ]

In [None]:
def prepSeq(s,loc,windowSize=510):
    """Crop ``s`` to the stretch surrounding position ``loc``.

    The slice begins ``windowSize`` characters before ``loc`` (clamped to the
    start of the string) and stops just before ``loc + windowSize`` (clamped
    to the end), so the result is never longer than ``2 * windowSize``.
    """
    start = max(0, loc - windowSize)
    stop = min(len(s), loc + windowSize)
    return s[start:stop]

# PTEN

In [None]:
# Load the PTEN variant table.
pten = pd.read_csv("/data/projects/processBio/ycaData/parse_pten.csv")

# Replace each variant string with a tuple
# (one-letter original residue, 0-based position, one-letter new residue).
# The slicing assumes the layout <3-letter code><1-based position><3-letter code>;
# a code missing from aaTable raises KeyError.
pten=pten.assign(variant=pten.variant.apply(lambda s: (aaTable[s[:3]], int(s[3:-3]) - 1, aaTable[s[-3:]])))

In [None]:
# Reference sequence for PTEN from the project helper
# (presumably the amino-acid string; it is indexed by residue position below).
ptenSeq = getSequence("pten")

In [None]:
# Build the full-length mutated sequence for every PTEN variant.
# A variant whose stated original residue disagrees with the reference
# sequence gets an empty string as a placeholder.
variants = []
for idx,row in pten.iterrows():
    og,loc,var = row["variant"]
    if ptenSeq[loc] == og:
        variants.append(ptenSeq[:loc] + var + ptenSeq[loc+1:])
    else:
        variants.append("")

In [None]:
# Attach the mutated sequences; later cells read this column as `varSeqe`.
pten = pten.assign(varSeqe=variants)

In [None]:
# Embed every PTEN variant, one sequence per forward pass.
# Each list entry is the one-element list returned by getSequenceRepresentation.
# Caution: `prepSeq` here must be the (sequence, position) cropping helper;
# a later cell redefines `prepSeq` with a row-based signature, so this cell
# fails if re-run after that definition.
representations = []
batchSize=1  # not used by the loop below
for idx,row in tqdm(pten.iterrows(),total=len(pten)):
    windowSeq = prepSeq(row.varSeqe, row.variant[1])
    rep = getSequenceRepresentation([(idx, windowSeq)])
    representations.append(rep)

In [None]:
# Store the embeddings next to their variants (one list entry per row, in row order).
pten = pten.assign(representation=representations)

In [None]:
# Persist the table; pickle keeps the tuple and array columns intact.
pten.to_pickle("/data/projects/processBio/ycaData/parse_pten.pkl")

# CALM1

In [None]:
# Load the CALM1 variant table.
calm = pd.read_csv("/data/projects/processBio/ycaData/parse_calm1.csv")

# Replace each variant string with a tuple
# (one-letter original residue, 0-based position, one-letter new residue).
# The slicing assumes the layout <3-letter code><1-based position><3-letter code>;
# a code missing from aaTable raises KeyError.
calm=calm.assign(variant=calm.variant.apply(lambda s: (aaTable[s[:3]], int(s[3:-3]) - 1, aaTable[s[-3:]])))

In [None]:
# Reference sequence for CALM1 from the project helper
# (presumably the amino-acid string; it is indexed by residue position below).
calmSeq = getSequence("calm1")

In [None]:
# Build the full-length mutated sequence for every CALM1 variant.
# A variant whose stated original residue disagrees with the reference
# sequence gets an empty string as a placeholder.
variants = []
for idx,row in calm.iterrows():
    og,loc,var = row["variant"]
    if calmSeq[loc] == og:
        variants.append(calmSeq[:loc] + var + calmSeq[loc+1:])
    else:
        variants.append("")

In [None]:
# Attach the mutated sequences; later cells read this column as `varSeqe`.
calm = calm.assign(varSeqe=variants)

In [None]:
# Embed every CALM1 variant, one sequence per forward pass.
# Each list entry is the one-element list returned by getSequenceRepresentation.
# Caution: `prepSeq` here must be the (sequence, position) cropping helper;
# a later cell redefines `prepSeq` with a row-based signature, so this cell
# fails if re-run after that definition.
representations = []
batchSize=1  # not used by the loop below
for idx,row in tqdm(calm.iterrows(),total=len(calm)):
    windowSeq = prepSeq(row.varSeqe, row.variant[1])
    rep = getSequenceRepresentation([(idx, windowSeq)])
    representations.append(rep)


In [None]:
# Store the embeddings next to their variants (one list entry per row, in row order).
calm = calm.assign(representation=representations)

In [None]:
# Persist the table; pickle keeps the tuple and array columns intact.
calm.to_pickle("/data/projects/processBio/ycaData/parse_calm1.pkl")

# SUMO

In [None]:
# Load the SUMO variant table.
sumo = pd.read_csv("/data/projects/processBio/ycaData/parse_sumo.csv")
# Replace each variant string with a tuple
# (one-letter original residue, 0-based position, one-letter new residue).
# The slicing assumes the layout <3-letter code><1-based position><3-letter code>;
# a code missing from aaTable raises KeyError.
sumo = sumo.assign(variant=sumo.variant.apply(lambda s: (aaTable[s[:3]], int(s[3:-3]) - 1, aaTable[s[-3:]])))

In [None]:
# Reference sequence from the project helper; note the lookup key is "sumo1"
# (presumably the amino-acid string; it is indexed by residue position below).
sumoSeq = getSequence("sumo1")

In [None]:
# Build the full-length mutated sequence for every SUMO variant.
# A variant whose stated original residue disagrees with the reference
# sequence gets an empty string as a placeholder.
variants = []
for idx,row in sumo.iterrows():
    og,loc,var = row["variant"]
    if sumoSeq[loc] == og:
        variants.append(sumoSeq[:loc] + var + sumoSeq[loc+1:])
    else:
        variants.append("")

In [None]:
# Attach the mutated sequences; later cells read this column as `varSeqe`.
sumo = sumo.assign(varSeqe=variants)

In [None]:
# Embed every SUMO variant, one sequence per forward pass.
# Each list entry is the one-element list returned by getSequenceRepresentation.
# Caution: `prepSeq` here must be the (sequence, position) cropping helper;
# a later cell redefines `prepSeq` with a row-based signature, so this cell
# fails if re-run after that definition.
representations = []
batchSize=1  # not used by the loop below
for idx,row in tqdm(sumo.iterrows(),total=len(sumo)):
    windowSeq = prepSeq(row.varSeqe, row.variant[1])
    rep = getSequenceRepresentation([(idx, windowSeq)])
    representations.append(rep)



In [None]:
# Store the embeddings next to their variants (one list entry per row, in row order).
sumo = sumo.assign(representation=representations)

In [None]:
# Persist the table; pickle keeps the tuple and array columns intact.
sumo.to_pickle("/data/projects/processBio/ycaData/parse_sumo.pkl")

# TPMT

In [None]:
# Load the TPMT variant table.
tpmt = pd.read_csv("/data/projects/processBio/ycaData/parse_tpmt.csv")
# Replace each variant string with a tuple
# (one-letter original residue, 0-based position, one-letter new residue).
# The slicing assumes the layout <3-letter code><1-based position><3-letter code>;
# a code missing from aaTable raises KeyError.
tpmt = tpmt.assign(variant=tpmt.variant.apply(lambda s: (aaTable[s[:3]], int(s[3:-3]) - 1, aaTable[s[-3:]])))

In [None]:
# Reference sequence for TPMT from the project helper
# (presumably the amino-acid string; it is indexed by residue position below).
tpmtSeq = getSequence("tpmt")

In [None]:
# Build the full-length mutated sequence for every TPMT variant.
# A variant whose stated original residue disagrees with the reference
# sequence gets an empty string as a placeholder.
variants = []
for idx,row in tpmt.iterrows():
    og,loc,var = row["variant"]
    if tpmtSeq[loc] == og:
        variants.append(tpmtSeq[:loc] + var + tpmtSeq[loc+1:])
    else:
        variants.append("")

In [None]:
# Attach the mutated sequences; later cells read this column as `varSeqe`.
tpmt = tpmt.assign(varSeqe=variants)

In [None]:
# Embed every TPMT variant, one sequence per forward pass.
# Each list entry is the one-element list returned by getSequenceRepresentation.
# Caution: `prepSeq` here must be the (sequence, position) cropping helper;
# a later cell redefines `prepSeq` with a row-based signature, so this cell
# fails if re-run after that definition.
representations = []
batchSize=1  # not used by the loop below
for idx,row in tqdm(tpmt.iterrows(),total=len(tpmt)):
    windowSeq = prepSeq(row.varSeqe, row.variant[1])
    rep = getSequenceRepresentation([(idx, windowSeq)])
    representations.append(rep)




In [None]:
# Store the embeddings next to their variants (one list entry per row, in row order).
tpmt = tpmt.assign(representation=representations)

In [None]:
# Persist the table; pickle keeps the tuple and array columns intact.
tpmt.to_pickle("/data/projects/processBio/ycaData/parse_tpmt.pkl")

In [None]:
def getRep(rep,L,originalWindow=510, W=3):
    """Return the rows of ``rep`` within ``W`` positions of the variant site.

    ``rep`` is assumed to cover a crop of the full sequence that started at
    ``max(0, L - originalWindow)``; that offset is subtracted from ``L`` to
    find the variant's row inside ``rep``.

    Parameters
    ----------
    rep : sliceable sequence (e.g. a 2-D array with one row per residue)
    L : int
        0-based variant position in the full-length sequence.
    originalWindow : int
        Half-width used when the sequence was cropped before embedding.
    W : int
        Half-width of the neighbourhood to extract.

    Returns
    -------
    ``rep[k:m]`` - at most ``2 * W + 1`` rows, fewer when the neighbourhood
    runs past either end of ``rep``.
    """
    # Index in the full sequence at which the crop began.
    cropStart = max(0, L - originalWindow)
    # Variant position translated into crop coordinates, then widened by W
    # on each side and clamped to the bounds of `rep`.
    k = max(0, L - W - cropStart)
    m = min(len(rep), L + W - cropStart + 1)
    return rep[k : m]

In [None]:
# Quick visual check of the first rows of the TPMT table.
tpmt.head()

In [None]:
import numpy as np

In [None]:
def prepSeq(row, windowSizes=(2,4,8,16,32,64,128,256)):
    """Collapse one row's per-residue embedding into a single feature vector.

    For every half-width ``w`` in ``windowSizes`` the embedding rows within
    ``w`` positions of the variant site are averaged; the resulting mean
    vectors are concatenated in the order given.

    Parameters
    ----------
    row : object with ``representation`` and ``variant`` attributes
        ``representation[0]`` is the per-residue embedding matrix and
        ``variant[1]`` the 0-based variant position.
    windowSizes : iterable of int
        Neighbourhood half-widths. The default is an immutable tuple so it
        cannot be altered between calls.

    Returns
    -------
    numpy.ndarray
        Concatenation of one mean vector per entry of ``windowSizes``.

    Notes
    -----
    * This definition reuses the name of the earlier
      ``prepSeq(s, loc, windowSize)`` cropping helper and replaces it once
      this cell runs; the embedding cells that call
      ``prepSeq(sequence, position)`` will fail until that helper is
      re-defined.
    * NOTE(review): rows whose embedding is empty (variants stored with an
      empty sequence) presumably produce NaN means - confirm downstream
      handling.
    """
    rep = np.array(row.representation[0])
    L = row.variant[1]
    # originalWindow must match the half-width used to crop the sequence
    # before it was embedded (510 in the cropping helper's default).
    reps = [getRep(rep, L, originalWindow=510, W=w).mean(0) for w in windowSizes]
    return np.concatenate(reps)

In [None]:
# Compute the multi-window feature vector for every TPMT row and overwrite
# the pickle so it now includes the `xi` column.
tpmt = tpmt.assign(xi=tpmt.apply(prepSeq, axis=1))

tpmt.to_pickle("/data/projects/processBio/ycaData/parse_tpmt.pkl")

In [None]:
# Compute the multi-window feature vector for every SUMO row and overwrite
# the pickle so it now includes the `xi` column.
sumo = sumo.assign(xi=sumo.apply(prepSeq, axis=1))

sumo.to_pickle("/data/projects/processBio/ycaData/parse_sumo.pkl")

In [None]:
# Compute the multi-window feature vector for every CALM1 row and overwrite
# the pickle so it now includes the `xi` column.
calm = calm.assign(xi=calm.apply(prepSeq, axis=1))

calm.to_pickle("/data/projects/processBio/ycaData/parse_calm1.pkl")

In [None]:
# Compute the multi-window feature vector for every PTEN row and overwrite
# the pickle so it now includes the `xi` column.
pten = pten.assign(xi=pten.apply(prepSeq, axis=1))

pten.to_pickle("/data/projects/processBio/ycaData/parse_pten.pkl")

In [None]:
# Reload the PTEN table from the pickle written above.
# Unpickling executes arbitrary code, so only load files this notebook produced.
pten = pd.read_pickle("/data/projects/processBio/ycaData/parse_pten.pkl")

# gnomAD Data

In [None]:
from glob import glob

In [None]:
# One CSV per gene; the file name (minus ".csv") is later used as the gene symbol.
# glob returns paths in arbitrary order.
gnomadFiles = glob("/data/projects/processBio/gnomad/yca/*.csv")

In [None]:
# Accumulator for the per-gene tables; filled in the same order as gnomadFiles.
gnomadDFs = []

In [None]:
def makeVariedSequence(row,seq):
    """Apply the single-residue substitution in ``row.variant`` to ``seq``.

    Parameters
    ----------
    row : object with a ``variant`` attribute
        ``variant`` is ``(original residue, 0-based position, new residue)``.
    seq : str
        Reference sequence.

    Returns
    -------
    str
        ``seq`` with the residue at the given position replaced, or ``""``
        when the variant cannot be applied: the position is the ``-1``
        sentinel, lies outside ``seq``, or the residue found there differs
        from the stated original.
    """
    og, loc, var = row.variant
    # Reject the -1 sentinel, any other negative index (which would silently
    # wrap around), and positions past the end (which would raise IndexError).
    if not 0 <= loc < len(seq):
        return ""
    if seq[loc] != og:
        return ""
    return seq[:loc] + var + seq[loc+1:]

In [None]:
def parseConsequence(c):
    """Parse a consequence string into ``(original, position, new)``.

    The string is sliced as: two leading characters skipped, a three-letter
    residue code, a 1-based integer position, and a trailing three-letter
    residue code.

    Parameters
    ----------
    c : str
        Consequence string. Non-string values (for example the NaN pandas
        produces for an empty CSV field) are treated as unparseable.

    Returns
    -------
    tuple
        ``(one-letter original, 0-based position, one-letter new)``, or the
        sentinel ``("X", -1, "X")`` when either residue code is missing from
        ``aaTable``, the position is not an integer, or ``c`` is not a string.
    """
    invalid = ("X", -1, "X")
    # Missing values arrive as float NaN and cannot be sliced.
    if not isinstance(c, str):
        return invalid
    try:
        og = aaTable[c[2:5]]
        loc = int(c[5:-3]) - 1
        var = aaTable[c[-3:]]
    except (KeyError, ValueError):
        # KeyError: unknown residue code; ValueError: non-numeric position.
        return invalid
    return (og, loc, var)

In [None]:
# Parse, mutate and embed every gnomAD table.
# The accumulator is reset here so that re-running this cell (for instance
# after a failure part-way through) cannot leave stale frames in the list;
# a later cell pairs gnomadDFs with gnomadFiles by position, so leftovers
# would be saved under the wrong gene's file name.
gnomadDFs = []
for file in gnomadFiles:
    df = pd.read_csv(file)
    # Gene symbol = file name without directory and without the 4-character ".csv" suffix.
    geneSymbol = file[file.rfind("/") + 1 : -4]
    seq = getSequence(geneSymbol)
    df = df.assign(variant=df["HGVS Consequence"].apply(lambda s: parseConsequence(s)))
    df = df.assign(seq = df.apply(lambda row: makeVariedSequence(row,seq),axis=1))
    # NOTE(review): the whole sequence is embedded without cropping, and rows
    # with an empty sequence are still sent through the model - confirm the
    # genes here fit the model's maximum input length.
    df = df.assign(embedding=df.seq.apply(lambda s: getSequenceRepresentation([(-1, s)])))
    gnomadDFs.append(df)

In [None]:
# Inspect the first processed gnomAD table.
gnomadDFs[0]

In [None]:
# Save each table next to its source CSV with a ".pkl" extension.
# Only the trailing ".csv" is swapped (every path came from a "*.csv" glob);
# str.replace would also rewrite ".csv" occurring earlier in the path.
for name, df in zip(gnomadFiles, gnomadDFs):
    df.to_pickle(name[: -len(".csv")] + ".pkl")

# Generate feature vectors for the gnomAD files

In [None]:
# Reload every pickled gnomAD table, remembering which file each came from
# so the tables can be written back to the same paths.
gnomadFileNames = glob("/data/projects/processBio/gnomad/yca/*.pkl")
gnomadDFs = []
for file in gnomadFileNames:
    gnomadDFs.append(pd.read_pickle(file))

In [None]:
from processBioDBs.utilities import prepSeq as convertEmbeddingMatrixToVector

In [None]:
import numpy as np

In [None]:
# Add an `xi` feature vector to every gnomAD table, replacing each list entry
# with the extended frame. The imported helper receives the embedding matrix
# (first element of the stored list) and the 0-based variant position.
for i, dfi in enumerate(gnomadDFs):
    xiColumn = dfi.apply(
        lambda row: convertEmbeddingMatrixToVector(row["embedding"][0], row.variant[1]),
        axis=1,
    )
    gnomadDFs[i] = dfi.assign(xi=xiColumn)

In [None]:
# Show which file the first table came from.
gnomadFileNames[0]

In [None]:
# Inspect the first table, now including the `xi` column.
gnomadDFs[0]

In [None]:
# Write every extended table back over the pickle it was loaded from.
for fn, dfi in zip(gnomadFileNames, gnomadDFs):
    dfi.to_pickle(fn)