In [None]:
# from Bio import Entrez

# Entrez.email = "zeiberg.d@northeastern.edu"

import pandas as pd

from easydict import EasyDict

In [None]:
from tqdm.notebook import tqdm

import numpy as np
import pickle

In [None]:
def getSequenceFromNPID(npid):
    """Return the protein sequence for a protein accession as one string.

    Fetches the FASTA record for ``npid`` from the Entrez ``protein``
    database, drops the first (header) line and concatenates the remaining
    lines with surrounding whitespace stripped.

    NOTE: this relies on a module-level ``Entrez`` name (``from Bio import
    Entrez``); that import is currently commented out at the top of the
    notebook and must be re-enabled before calling this function.

    Args:
        npid: identifier passed as ``id`` to ``Entrez.efetch``.

    Returns:
        str: the sequence without line breaks ("" when the response has no
        lines after the header).
    """
    handle = Entrez.efetch(db="protein", id=npid, rettype="fasta", retmode="text")
    try:
        lines = [line.strip() for line in handle.readlines()]
    finally:
        # Always release the handle, even if reading the response fails.
        handle.close()
    return "".join(lines[1:])

In [None]:
# Read the HGMD VCF export as a tab-separated table. Row 14 (0-based) holds
# the column names; the lines before it are skipped by pandas.
# Previous location of the file: /ssdata/hgmd/HGMD_PRO_2019_1_hg19.vcf
df = pd.read_csv(
    "/data/projects/processBio/hgmd/HGMD_PRO_2019_1_hg19.vcf",
    sep="\t",
    header=14,
)

In [None]:
# Show the raw VCF table for a quick visual check.
df

In [None]:
def getInfo(row):
    """Parse the ``INFO`` field of one VCF row into an ``EasyDict``.

    ``INFO`` is split on ``;`` into ``key=value`` entries, each stored under
    its key. When a ``PROT`` entry of the form ``<pid>:<prefix>.<variant>``
    is present and ``<variant>`` looks like ``<ref><position><alt>`` (for
    example ``R175H``), the following keys are added:

    * ``pid``       - text before the ``:`` in ``PROT``
    * ``variant``   - text after the ``.`` (e.g. ``R175H``)
    * ``reference`` - first character of ``variant``
    * ``missense``  - last character of ``variant``
    * ``loc``       - zero-based position (``position - 1``)

    Rows that only carry a ``DB`` entry get empty ``pid``/``variant`` and no
    ``missense`` key. Callers use ``"missense" in info`` to detect a
    successfully parsed substitution.

    Args:
        row: mapping-like object with an ``"INFO"`` string (a DataFrame row).

    Returns:
        EasyDict: the parsed entries plus the derived keys described above.
    """
    d = EasyDict()
    for vals in row["INFO"].split(";"):
        if not vals:
            continue  # tolerate empty segments such as a trailing ';'
        # Split on the first '=' only, so values may contain '=' themselves
        # and bare flags (no '=') are stored with an empty value.
        k, _, v = vals.partition("=")
        d[k] = v

    if "PROT" in d:
        try:
            d.pid, d.variant = d.PROT.split(":")
        except ValueError:
            # PROT without exactly one ':' cannot be interpreted.
            return d
    elif "DB" in d:
        # Only a database cross-reference is available: no protein change.
        d.pid = ""
        d.variant = ""
        return d
    else:
        return d

    try:
        _, d.variant = d.variant.split(".")
        reference = d.variant[0]
        position = int(d.variant[1:-1])
        missense = d.variant[-1]
    except (ValueError, IndexError):
        # Not a simple <ref><position><alt> substitution (or empty variant).
        return d

    d.reference = reference
    d.missense = missense
    d.loc = position - 1  # convert the 1-based position to a 0-based index
    return d

# Original Stats

In [None]:
# Walk over every VCF row once, collecting:
#   allGenes                   - every GENE value seen
#   genesWithAtleastOneVariant - genes with at least one parsed substitution
#   validInfo                  - the parsed INFO dicts of those substitutions
allGenes = set()
genesWithAtleastOneVariant = set()
validInfo = []
# iterrows() is a generator without a length, so hand tqdm the row count
# explicitly (as the later variant loop does) to get a real progress bar.
for _idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    info = getInfo(row)
    allGenes.add(info.GENE)
    # getInfo only sets "missense" when the protein change parsed cleanly.
    if "missense" in info:
        genesWithAtleastOneVariant.add(info.GENE)
        validInfo.append(info)

# All missense variants

In [None]:
# One row per parsed substitution; columns are the keys of the INFO dicts.
variantDF = pd.DataFrame(validInfo)

In [None]:
# Show the table of parsed substitutions.
variantDF

Number of genes with at least one variant

In [None]:
# 1-tuple whose single entry is the number of distinct GENE values.
variantDF.GENE.unique().shape

In [None]:
# Cache mapping protein accession -> sequence string, filled by the fetch loop.
sequences = EasyDict()

In [None]:
# `protein_ids` is not defined anywhere else in this notebook, so derive it
# here: every accession referenced by the variant table needs a sequence,
# because the variant-building loop looks up `sequences[row.pid]`.
protein_ids = variantDF["pid"].unique()
for npid in tqdm(protein_ids):
    if npid in sequences:
        continue  # already fetched; keeps the cell cheap to re-run
    sequences[npid] = getSequenceFromNPID(npid)

In [None]:
# Persist the fetched sequences. Write to the same location the notebook
# loads them back from (the old /ssdata/hgmd path is no longer read anywhere),
# and close the file deterministically via the context manager.
with open("/data/projects/processBio/hgmd/sequences.pkl", "wb") as fh:
    pickle.dump(sequences, fh)

In [None]:
import pickle
from easydict import EasyDict

In [None]:
# Reload the cached accession -> sequence mapping.
# Previous location: /ssdata/hgmd/sequences.pkl
# NOTE: unpickling can execute arbitrary code; only load files you produced.
with open("/data/projects/processBio/hgmd/sequences.pkl", "rb") as fh:
    sequences = pickle.load(fh)

In [None]:
# Build the mutated sequence for every variant.
#   seqs            - one entry per variantDF row: the mutated sequence, or
#                     NaN when the reference residue cannot be confirmed
#   errors["match"] - index labels of the rows that failed that check
seqs = []
errors = {"match": []}
for idx, row in tqdm(variantDF.iterrows(), total=variantDF.shape[0]):
    s = sequences[row.pid]
    loc = row["loc"]
    # Require 0 <= loc as well: a negative index would silently wrap around
    # to the end of the sequence and could "match" the wrong residue.
    if 0 <= loc < len(s) and s[loc] == row.reference:
        # Substitute the residue at `loc` with the alternate amino acid.
        seqs.append(s[:loc] + row.missense + s[loc + 1:])
    else:
        errors["match"].append(idx)
        seqs.append(np.nan)

In [None]:
# Attach the mutated sequences (NaN where the reference check failed);
# `seqs` has one entry per row, in row order.
variantDF = variantDF.assign(seq=seqs)

In [None]:
# (rows, columns) of the variant table before filtering.
variantDF.shape

Pedja says DM only: keep variants whose `CLASS` is `DM`, and drop entries whose `PROT` annotation contains `*`.

In [None]:
# Keep only:
#   * CLASS == "DM" entries,
#   * protein changes without a literal '*' in PROT (regex=False avoids the
#     invalid "\*" escape sequence in a non-raw string),
#   * rows whose mutated sequence could be built - NaN sequences cannot be
#     sliced into windows or embedded by the cells below.
variantDF = variantDF[
    (variantDF.CLASS == "DM")
    & ~variantDF.PROT.str.contains("*", regex=False)
    & variantDF.seq.notna()
]

In [None]:
# Show the filtered variant table.
variantDF

In [None]:
# Persist the filtered variant table; with to_csv defaults the index is
# written as the first column.
variantDF.to_csv("/data/projects/processBio/hgmd/variant_df.csv")

In [None]:
# Show the table that was just written to disk.
variantDF

In [None]:
import torch
import esm
import os
import torch.nn as nn
# Enumerate GPUs in PCI bus order and expose only physical devices 1, 2 and 3
# to this process. This takes effect only if set before the first CUDA call.
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']='1,2,3'


# Pretrained ESM-1b (33 layers, 650M parameters) and its tokenizer/batcher.
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
batch_converter = alphabet.get_batch_converter()
# The model is only used for inference below: switch to eval mode so that
# dropout is disabled and the extracted embeddings are deterministic.
model.eval()

# model = nn.DataParallel(model.cuda())

In [None]:
# Move the model to the GPU used for inference.
# NOTE(review): indices are relative to CUDA_VISIBLE_DEVICES ('1,2,3' above),
# so "cuda:1" is the second visible device - confirm that is the intended GPU.
model = model.to("cuda:1")

In [None]:
# Number of residues kept on each side of the mutated position.
WINDOW_SIZE = 500

# List of (row index label, windowed sequence) pairs, one per variant, in row
# order. Built with a plain comprehension rather than a row-wise `apply`,
# which returns a DataFrame (not a Series) when the frame is empty and would
# then make `.items()` yield columns instead of rows.
# The start must be clamped at 0 (a negative start would wrap around); the
# end needs no clamp because slicing past the end of a string is safe.
Data = [
    (idx, seq[max(0, loc - WINDOW_SIZE) : loc + WINDOW_SIZE + 1])
    for idx, seq, loc in zip(variantDF.index, variantDF["seq"], variantDF["loc"])
]

In [None]:
# Peek at the first (index label, windowed sequence) pair.
Data[0]

In [None]:
from tqdm.notebook import trange

In [None]:
# Embed every windowed sequence with the last (33rd) layer of the model.
# `representations` receives one (sequence_length, hidden_dim) NumPy array per
# entry of `Data`, in the same order.
BATCHSIZE = 1
representations = []
# Run on whatever device the model already lives on instead of repeating a
# hard-coded device string here.
device = next(model.parameters()).device
for start in trange(0, len(Data), BATCHSIZE):
    _, _, batch_tokens = batch_converter(Data[start : start + BATCHSIZE])
    # Number of non-padding tokens per sequence (includes the BOS/EOS tokens).
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
    with torch.no_grad():
        # Contact maps were requested before but never used; skipping them
        # avoids materialising attention maps for every layer and head.
        results = model(batch_tokens.to(device), repr_layers=[33])
    token_representations = results["representations"][33].cpu()
    # Keep every sequence of the batch (not only item 0) and strip the
    # BOS/EOS positions as well as any padding, so BATCHSIZE > 1 is correct.
    for i, tokens_len in enumerate(batch_lens.tolist()):
        representations.append(
            token_representations[i, 1 : tokens_len - 1].clone().numpy()
        )
    # Drop references eagerly to keep GPU/CPU memory flat across iterations.
    del results, batch_tokens, token_representations

In [None]:
# Attach the per-residue embeddings; `representations` was filled in the same
# order as `Data`, which follows the row order of variantDF.
variantDF = variantDF.assign(representation=representations)

In [None]:
# Show the table with the new `representation` column.
variantDF

In [None]:
from processBioDBs.utilities import prepSeq

In [None]:
# Derive the per-variant feature `xi` by handing each row's embedding, its
# mutation index and the window size to `prepSeq`.
# NOTE(review): prepSeq is a project helper not shown here - confirm what it
# returns (np.stack below requires every `xi` to have the same shape).
variantDF = variantDF.assign(
    xi=variantDF.apply(
        lambda record: prepSeq(
            record.representation,
            record["loc"],
            originalWindowSize=WINDOW_SIZE,
        ),
        axis=1,
    )
)

In [None]:
# Stack the per-variant `xi` arrays along a new leading axis (one entry per
# row); np.stack requires all of them to share the same shape.
X = np.stack(variantDF.xi.values)

In [None]:
# Persist the stacked feature array in NumPy's .npy format.
np.save("/data/projects/processBio/hgmd/X.npy",X)

In [None]:
# Sanity check: the first dimension of X should equal the number of rows.
X.shape, variantDF.shape