In [None]:
from Bio import Entrez

# Contact address attached to Entrez requests; NCBI asks API users to identify themselves this way.
Entrez.email = "zeiberg.d@northeastern.edu"

import pandas as pd

from easydict import EasyDict

In [None]:
from tqdm.notebook import tqdm

In [None]:
import numpy as np
import pickle

In [None]:
import matlab.engine
import matlab
# Start a MATLAB session driven from this kernel; `eng` is used further down to call get_pssm.
eng = matlab.engine.start_matlab()
# Add the MutPred2 function folder to the MATLAB search path (presumably where get_pssm is
# defined — confirm). `path` keeps whatever addpath returns; per the MATLAB docs that is the
# search path as it was before this call.
path = eng.addpath("~/mutpred2/all_functions/")

In [None]:
def getSequenceFromNPID(npid):
    """Fetch a protein sequence from the NCBI "protein" database.

    Requests the FASTA record for ``npid`` through ``Entrez.efetch`` and
    returns the sequence with the header line and all line breaks removed.

    Parameters
    ----------
    npid : str
        Protein accession to look up, e.g. ``"NP_689699"``.

    Returns
    -------
    str
        The sequence as a single string; empty if the response contains
        nothing after the first (header) line.
    """
    # Look up the accession supplied by the caller rather than a fixed ID.
    handle = Entrez.efetch(db="protein", id=npid, rettype="fasta", retmode="text")
    try:
        lines = [line.strip() for line in handle.readlines()]
    finally:
        # Release the handle even if reading the response fails.
        handle.close()
    # The first line is the FASTA ">" header; the remaining lines are the wrapped sequence.
    return "".join(lines[1:])

In [None]:
# Load the HGMD VCF as a tab-separated table. header=14 takes row 14 (0-based) as the column
# names and drops everything above it — presumably the VCF "##" meta lines; re-check this
# offset if the file version changes.
df = pd.read_csv("/data/common/hgmd/HGMD_PRO_2019.1_hg19.vcf",delimiter="\t",header=14)

In [None]:
# Display the loaded table for a quick visual check.
df

In [None]:
def getInfo(row):
    """Parse the INFO field of one table row into an attribute dictionary.

    The INFO string is split on ``;`` into entries. Each ``KEY=VALUE`` entry
    is stored under ``KEY``; only the first ``=`` separates key from value,
    so values may themselves contain ``=``. An entry without ``=`` is stored
    as a flag with the value ``True``, and empty entries are ignored.

    The ``PROT`` entry, expected to look like ``NP_000005.2:p.R1297C``, is
    then broken down into four extra keys:

    * ``pid``       - accession without the version suffix (``NP_000005``)
    * ``reference`` - first character of the change (``R``)
    * ``loc``       - position converted to a 0-based index (``1296``)
    * ``missense``  - last character of the change (``C``)

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"INFO"`` that yields the INFO string
        (for example a row produced by ``DataFrame.iterrows``).

    Returns
    -------
    EasyDict
        All parsed INFO entries plus the four derived keys above.

    Raises
    ------
    ValueError
        If ``PROT`` does not have the ``<accession>.<version>:p.<change>``
        shape or the position is not an integer. A missing ``PROT`` entry
        also raises (the lookup on the EasyDict fails).
    """
    d = EasyDict()
    for entry in row["INFO"].split(";"):
        if not entry:
            # Tolerate empty entries such as the one produced by a trailing ';'.
            continue
        # partition splits on the first '=' only, so '=' inside a value is preserved.
        key, sep, value = entry.partition("=")
        d[key] = value if sep else True
    versioned_id, change = d.PROT.split(":")
    pid, _ = versioned_id.split(".")
    _, change = change.split(".")
    reference, location, missense = change[0], int(change[1:-1]), change[-1]
    d.pid = pid
    d.reference = reference
    # Stored 0-based so it can index directly into a sequence string.
    d.loc = location - 1
    d.missense = missense
    return d

In [None]:
 # Try the parser on the row labelled 0 and show the result.
 getInfo(df.loc[0])

In [None]:
def addSeqToInfo(info):
    """Attach the protein sequence for ``info.pid`` to ``info`` as ``info.seq``.

    The sequence is obtained from ``getSequenceFromNPID``. ``info`` is
    modified in place and the same object is returned.
    """
    sequence = getSequenceFromNPID(info.pid)
    info.seq = sequence
    return info

In [None]:
# Collect the distinct protein accessions referenced in the table.
# `skipped` records the index label of every row whose INFO field could not be parsed.
protein_ids = set()
skipped = []
for i,r in tqdm(df.iterrows(),total=df.shape[0]):
    try:
        info = getInfo(r)
    except Exception:
        # Catch Exception instead of using a bare except so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long loop.
        skipped.append(i)
    else:
        protein_ids.add(info.pid)

In [None]:
# Build one record per protein: accession, sequence and pickled PSSM.
proteins = []
# Iterate in sorted order so the resulting list is reproducible between runs
# (plain set iteration order is arbitrary).
for pid in tqdm(sorted(protein_ids),total=len(protein_ids)):
    # Create a fresh instance per protein. Assigning the EasyDict class itself would
    # write the attributes onto the class and make every list entry the same object.
    d = EasyDict()
    d.pid = pid
    d.seq = getSequenceFromNPID(pid)
    pssm = np.array(eng.get_pssm(d.seq))
    if not np.all(pssm == 0):
        d.pssm = pickle.dumps(pssm)
    else:
        # An all-zero matrix is stored as NaN instead (presumably get_pssm's
        # "no profile available" signal — confirm).
        d.pssm = pickle.dumps(np.nan)
    proteins.append(d)