In [103]:
from Bio import Entrez

Entrez.email = "zeiberg.d@northeastern.edu"

import pandas as pd

from easydict import EasyDict

In [104]:
from tqdm.notebook import tqdm

import numpy as np
import pickle

In [117]:
def getSequenceFromNPID(npid):
    """Fetch one record from the Entrez "protein" database and return its sequence.

    The record is requested as FASTA text. The first line of the response
    (the FASTA header) is dropped and the remaining lines are stripped and
    concatenated into a single string.

    Parameters
    ----------
    npid : str
        Identifier passed to ``Entrez.efetch`` as ``id`` (e.g. "NP_689699.2").

    Returns
    -------
    str
        The sequence without line breaks or surrounding whitespace; an empty
        string if the response contains nothing after its first line.
    """
    handle = Entrez.efetch(db="protein", id=npid, rettype="fasta", retmode="text")
    try:
        lines = [line.strip() for line in handle.readlines()]
    finally:
        # Release the underlying connection even if reading fails part-way.
        handle.close()
    return "".join(lines[1:])

In [4]:
# Load the HGMD VCF as a tab-separated table. header=14 takes the column names
# from row 14 (0-based) of the file and skips everything above it.
# "#CHROM" holds both numeric labels and letters (e.g. 1 and Y), so it is read
# as text; otherwise pandas infers its type chunk by chunk and warns about
# mixed types.
df = pd.read_csv(
    "/ssdata/hgmd/HGMD_PRO_2019_1_hg19.vcf",
    delimiter="\t",
    header=14,
    dtype={"#CHROM": str},
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Preview the raw table; the columns of interest downstream are in INFO.
df

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,865595,CM1613956,A,G,.,.,CLASS=DM?;MUT=ALT;GENE=SAMD11;STRAND=+;DNA=NM_...
1,1,874491,CM1613954,C,T,.,.,CLASS=DM?;MUT=ALT;GENE=SAMD11;STRAND=+;DNA=NM_...
2,1,877523,CM1511864,C,G,.,.,CLASS=DM?;MUT=ALT;GENE=SAMD11;STRAND=+;DNA=NM_...
3,1,879286,CS1613955,A,C,.,.,CLASS=DM?;MUT=ALT;GENE=SAMD11;STRAND=+;DNA=NM_...
4,1,879375,CM1613953,C,T,.,.,CLASS=DM;MUT=ALT;GENE=SAMD11;STRAND=+;DNA=NM_1...
...,...,...,...,...,...,...,...,...
229156,Y,2655774,CD056857,CCCT,C,.,.,CLASS=DM;MUT=ALT;GENE=SRY;STRAND=-;DNA=NM_0031...
229157,Y,6931938,CM121018,G,C,.,.,CLASS=DM?;MUT=ALT;GENE=TBL1Y;STRAND=+;DNA=NM_0...
229158,Y,6938305,CM121019,C,T,.,.,CLASS=DM?;MUT=ALT;GENE=TBL1Y;STRAND=+;DNA=NM_0...
229159,Y,14847658,CD993525,TTAAG,T,.,.,CLASS=DM;MUT=ALT;GENE=USP9Y;STRAND=+;DNA=NM_00...


In [6]:
def getInfo(row):
    """Parse the ``INFO`` field of one VCF row into an ``EasyDict``.

    ``row["INFO"]`` is a ";"-separated list of ``KEY=VALUE`` fields. Every
    field becomes an entry of the result. When a ``PROT`` entry of the form
    ``<protein id>:p.<change>`` is present, these derived entries are added:

    * ``pid``       - the part of ``PROT`` before the first ":".
    * ``variant``   - the change with its "p." prefix removed (e.g. "K45E").
    * ``reference`` - first character of the change.
    * ``missense``  - last character of the change (this may be "*"; the key
      is set for any change of the form <char><integer><char>).
    * ``loc``       - 0-based position, i.e. the integer in the change minus 1.

    ``reference``, ``missense`` and ``loc`` are only set when the change
    parses as <char><positive integer><char>; callers test
    ``"missense" in info`` to detect a successfully parsed change.
    Rows with ``DB`` but no ``PROT`` get empty ``pid`` and ``variant``.

    Parameters
    ----------
    row : mapping
        Anything indexable with ``"INFO"`` that yields a string
        (e.g. a row from ``DataFrame.iterrows()``).

    Returns
    -------
    EasyDict
        The parsed fields plus whichever derived entries could be computed.
    """
    d = EasyDict()
    for field in row["INFO"].split(";"):
        if not field:
            # Tolerate empty fields, e.g. produced by a trailing ";".
            continue
        # Split on the first "=" only so values that contain "=" stay intact.
        k, _, v = field.partition("=")
        d[k] = v

    if "PROT" in d:
        # partition() never raises, unlike unpacking split(":") on odd input.
        d.pid, _, d.variant = d.PROT.partition(":")
    elif "DB" in d:
        d.pid = ""
        d.variant = ""
    else:
        return d

    # Expect exactly "<prefix>.<change>", e.g. "p.K45E".
    parts = d.variant.split(".")
    if len(parts) != 2:
        return d
    change = parts[1]
    d.variant = change

    try:
        reference, position, alternate = change[0], int(change[1:-1]), change[-1]
    except (ValueError, IndexError):
        # ValueError: the middle is not an integer (e.g. "K45fs").
        # IndexError: the change is empty (e.g. PROT ended in "p.").
        return d
    if position < 1:
        # A non-positive position would give a negative 0-based index, which
        # would silently wrap around when used to index a sequence.
        return d

    d.reference, d.missense = reference, alternate
    d.loc = position - 1
    return d

# Original Stats

In [11]:
# Walk every row once, recording each gene symbol seen and, separately, the
# genes and parsed records whose protein change could be fully parsed
# (getInfo sets the "missense" key only in that case).
allGenes = set()
genesWithAtleastOneVariant = set()
validInfo = []
# iterrows() is a generator without a length, so pass total= explicitly;
# without it tqdm cannot show how far along the loop is.
for i, row in tqdm(df.iterrows(), total=len(df)):
    info = getInfo(row)
    allGenes.add(info.GENE)
    if "missense" in info:
        genesWithAtleastOneVariant.add(info.GENE)
        validInfo.append(info)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [12]:
# One row per successfully parsed variant; the columns are the keys of the
# parsed INFO records collected in validInfo.
variantDF = pd.DataFrame(validInfo)

In [13]:
# Inspect the parsed-variant table (raw INFO keys plus pid/variant/reference/missense/loc).
variantDF

Unnamed: 0,CLASS,MUT,GENE,STRAND,DNA,PROT,DB,PHEN,RANKSCORE,pid,variant,reference,missense,loc
0,DM?,ALT,SAMD11,+,NM_152486.2:c.133A>G,NP_689699.2:p.K45E,rs903331232,"""Retinitis_pigmentosa""",0.21,NP_689699.2,K45E,K,E,44
1,DM?,ALT,SAMD11,+,NM_152486.2:c.502C>T,NP_689699.2:p.R168*,rs1441881282,"""Retinitis_pigmentosa""",0.99,NP_689699.2,R168*,R,*,167
2,DM?,ALT,SAMD11,+,NM_152486.2:c.877C>G,NP_689699.2:p.P293A,rs200195897,"""Autism_spectrum_disorder""",0.1,NP_689699.2,P293A,P,A,292
3,DM,ALT,SAMD11,+,NM_152486.2:c.1888C>T,NP_689699.2:p.R630*,rs761448939,"""Retinitis_pigmentosa""",0.57,NP_689699.2,R630*,R,*,629
4,DM,ALT,ISG15,+,NM_005101.3:c.163C>T,NP_005092.1:p.Q55*,rs786201005,"""Idiopathic_basal_ganglia_calcification""",0.99,NP_005092.1,Q55*,Q,*,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146445,DM,ALT,SRY,-,NM_003140.2:c.8C>T,NP_003131.1:p.S3L,,"""Gonadal_dysgenesis""",0.78,NP_003131.1,S3L,S,L,2
146446,DM,ALT,SRY,-,NM_003140.2:c.4C>T,NP_003131.1:p.Q2*,rs104894977,"""XY_sex_reversal""",0.93,NP_003131.1,Q2*,Q,*,1
146447,DM?,ALT,TBL1Y,+,NM_033284.1:c.205G>C,NP_150600.1:p.D69H,rs1297787063,"""Coarctation_of_the_aorta_non-syndromic""",0.32,NP_150600.1,D69H,D,H,68
146448,DM?,ALT,TBL1Y,+,NM_033284.1:c.526C>T,NP_150600.1:p.R176W,rs766714719,"""Coarctation_of_the_aorta_non-syndromic""",0.18,NP_150600.1,R176W,R,W,175


Number of distinct protein IDs (`pid`) with at least one parsed variant

In [14]:
# unique() keeps one entry per distinct pid; the shape of that array is the
# count, shown as a 1-tuple.
variantDF.pid.unique().shape

(9234,)

In [15]:
# Container mapping protein ID -> sequence string; filled by the download loop below.
sequences = EasyDict()

In [36]:
# Download one sequence per distinct protein ID among the parsed variants.
# protein_ids is derived here so the cell does not depend on a variable left
# over from an earlier kernel session.
protein_ids = variantDF.pid.unique()
for npid in tqdm(protein_ids):
    # Skip IDs that are already present so an interrupted run can resume
    # without repeating network requests.
    if npid in sequences:
        continue
    sequences[npid] = getSequenceFromNPID(npid)

HBox(children=(FloatProgress(value=0.0, max=9234.0), HTML(value='')))




In [47]:
# Cache the downloaded sequences on disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open("/ssdata/hgmd/sequences.pkl", "wb") as fh:
    pickle.dump(sequences, fh)

In [8]:
import pickle
from easydict import EasyDict

In [19]:
# Reload the cached sequences instead of downloading them again.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a cache that this notebook wrote itself.
with open("/ssdata/hgmd/sequences.pkl", "rb") as fh:
    sequences = pickle.load(fh)

In [150]:
# For every parsed variant, substitute the alternate residue into its protein
# sequence. seqs stays aligned with variantDF row-for-row: rows whose position
# is out of range, or whose reference residue differs from the sequence, get
# NaN and their index label is recorded in errors["match"].
seqs = []
errors = {"match": []}
for idx, row in tqdm(variantDF.iterrows(), total=variantDF.shape[0]):
    s = sequences[row.pid]
    # Use item access: row.loc is the pandas indexer, not the "loc" column.
    loc = row["loc"]
    # 0 <= loc guards against negative positions, which would otherwise index
    # from the end of the sequence and could match by accident.
    if 0 <= loc < len(s) and s[loc] == row.reference:
        seqs.append(s[:loc] + row.missense + s[loc+1:])
    else:
        errors["match"].append(idx)
        seqs.append(np.nan)

HBox(children=(FloatProgress(value=0.0, max=146450.0), HTML(value='')))




In [152]:
# Attach the substituted sequences as a new "seq" column; entries are NaN for
# rows that failed the reference-residue check in the loop above.
variantDF = variantDF.assign(seq=seqs)

In [163]:
# Drop rows whose CLASS is "DM?" and rows whose PROT contains a "*".
# regex=False matches the asterisk literally (no escape sequence needed), and
# na=False makes a missing PROT count as "no asterisk" rather than producing
# NaN, which the ~ operator cannot negate.
variantDF = variantDF[
    (variantDF.CLASS != "DM?")
    & ~variantDF.PROT.str.contains("*", regex=False, na=False)
]

In [168]:
# Inspect the filtered table, now including the "seq" column.
variantDF

Unnamed: 0,CLASS,MUT,GENE,STRAND,DNA,PROT,DB,PHEN,RANKSCORE,pid,variant,reference,missense,loc,seq
6,DM,ALT,AGRN,+,NM_198576.3:c.226G>A,NP_940978.2:p.G76S,rs756623659,"""Congenital_myasthenic_syndrome_with_distal_mu...",0.15,NP_940978.2,G76S,G,S,75,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...
7,DM,ALT,AGRN,+,NM_198576.3:c.314A>T,NP_940978.2:p.N105I,rs879253787,"""Congenital_myasthenic_syndrome_with_distal_mu...",0.91,NP_940978.2,N105I,N,I,104,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...
9,DP,ALT,AGRN,+,NM_198576.3:c.1123G>T,NP_940978.2:p.A375S,rs138031468,"""Ovarian_cancer_epithelial_reduced_risk""",0.12000000000000001,NP_940978.2,A375S,A,S,374,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...
11,DP,ALT,AGRN,+,NM_198576.3:c.1528G>A,NP_940978.2:p.G510S,rs138288952,"""Inflammatory_bowel_disease_association_with""",0.24,NP_940978.2,G510S,G,S,509,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...
12,DM,ALT,AGRN,+,NM_198576.3:c.3527T>C,NP_940978.2:p.L1176P,,"""Congenital_myasthenic_syndrome""",0.72,NP_940978.2,L1176P,L,P,1175,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146440,DM,ALT,SRY,-,NM_003140.2:c.113A>G,NP_003131.1:p.E38G,,"""XY_sex_reversal""",0.24,NP_003131.1,E38G,E,G,37,MQSYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTGSCNSKYQC...
146441,DM,ALT,SRY,-,NM_003140.2:c.89G>T,NP_003131.1:p.R30I,,"""Gonadal_dysgenesis""",0.63,NP_003131.1,R30I,R,I,29,MQSYASAMLSVFNSDDYSPAVQENIPALRISSSFLCTESCNSKYQC...
146443,DM,ALT,SRY,-,NM_003140.2:c.53G>A,NP_003131.1:p.S18N,rs104894971,"""XY_sex_reversal""",0.1,NP_003131.1,S18N,S,N,17,MQSYASAMLSVFNSDDYNPAVQENIPALRRSSSFLCTESCNSKYQC...
146445,DM,ALT,SRY,-,NM_003140.2:c.8C>T,NP_003131.1:p.S3L,,"""Gonadal_dysgenesis""",0.78,NP_003131.1,S3L,S,L,2,MQLYASAMLSVFNSDDYSPAVQENIPALRRSSSFLCTESCNSKYQC...


In [167]:
# Count rows by whether seq is missing: returns the distinct isnull() values
# and how many rows have each (True = no sequence was built for that row).
np.unique(variantDF.seq.isnull(),return_counts=True)

(array([False,  True]), array([88926,   309]))