In [None]:
from Bio import Entrez

import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [None]:
# Contact e-mail configured on Biopython's Entrez module for the NCBI requests below.
# NOTE(review): a later cell reassigns Entrez.email to a different address — confirm which one is intended.
Entrez.email = "zeiberg.d@northeastern.edu"

# Example 1

In [None]:
def getTree(RCVID):
    """Fetch one ClinVar record from NCBI Entrez and parse it as XML.

    Parameters:
        RCVID: accession passed as the ``id`` of an ``efetch`` request against
            the ``clinvar`` database (the notebook passes RCV accessions such
            as ``"RCV000000036"``).

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the ``clinvarset``
        response.

    Raises:
        xml.etree.ElementTree.ParseError: if the response is not well-formed XML.
        Any error raised by ``Entrez.efetch`` propagates unchanged.
    """
    handle = Entrez.efetch(db="clinvar", id=RCVID, rettype="clinvarset", retmode="text")
    try:
        payload = handle.read()
    finally:
        # Always release the connection: this function is called once per
        # accession in a long loop, so leaked handles would accumulate.
        handle.close()

    return ET.fromstring(payload)

In [None]:
def getClinSig(tree):
    """Return the text of the record's clinical-significance description.

    Walks ClinVarSet -> ReferenceClinVarAssertion -> ClinicalSignificance ->
    Description from ``tree``, taking the first matching child at each level.
    If any level is missing, ``find`` yields None and the next step raises
    AttributeError.
    """
    node = tree
    for tag in ("ClinVarSet", "ReferenceClinVarAssertion", "ClinicalSignificance", "Description"):
        node = node.find(tag)
    return node.text

In [None]:
def getVariants(tree):
    """Return the attributes of every RefSeq protein HGVS entry of a record.

    Looks at the ``AttributeSet`` children of the first
    ClinVarSet/ReferenceClinVarAssertion/MeasureSet/Measure element and keeps
    those whose ``Attribute`` child has ``Type == "HGVS, protein, RefSeq"``.

    Parameters:
        tree: root element as returned by ``getTree``.

    Returns:
        A list with the ``attrib`` dict of each matching ``Attribute`` element
        (empty when nothing matches).

    Raises:
        AttributeError: if any level of the path down to ``Measure`` is missing.
    """
    measure = (
        tree.find("ClinVarSet")
        .find("ReferenceClinVarAssertion")
        .find("MeasureSet")
        .find("Measure")
    )
    matches = []
    for attributeSet in measure.findall("AttributeSet"):
        attribute = attributeSet.find("Attribute")
        # Compare against None explicitly: an Element without children is falsy.
        # Sets lacking an Attribute child or a Type are skipped rather than
        # aborting the extraction for the whole record.
        if attribute is not None and attribute.get("Type") == "HGVS, protein, RefSeq":
            matches.append(attribute.attrib)
    return matches

In [None]:
# Example 1: fetch and parse the ClinVar record for accession RCV000000036.
tree1 = getTree("RCV000000036")

In [None]:
# Show the clinical-significance description of the first example record.
getClinSig(tree1)

In [None]:
# Attribute dicts of the RefSeq protein HGVS entries of the first example record.
variants = getVariants(tree1)

In [None]:
# Display the attribute dicts extracted from the first example record.
variants

# Example 2

In [None]:
# Example 2: fetch and parse the ClinVar record for accession RCV000000077.
tree2 = getTree("RCV000000077")

In [None]:
# Show the clinical-significance description of the second example record.
getClinSig(tree2)

In [None]:
# Show the RefSeq protein HGVS attribute dicts of the second example record.
getVariants(tree2)

# Example 3

In [None]:
# Example 3: fetch and parse the ClinVar record for accession RCV000000206.
tree3 = getTree("RCV000000206")

In [None]:
# Show the clinical-significance description of the third example record.
getClinSig(tree3)

In [None]:
# Show the RefSeq protein HGVS attribute dicts of the third example record.
getVariants(tree3)

# Let's get it

In [None]:
# Read the tab-separated ClinVar variant summary file into a DataFrame.
summary = pd.read_csv(
    "/data/projects/processBio/clinvar/clinvar/variant_summary.txt",
    sep="\t",
)

In [None]:
# Keep only GRCh38 single-nucleotide variants whose ReviewStatus is one of the
# four values listed below.
summary = summary.loc[
    summary.Type.eq("single nucleotide variant")
    & summary.Assembly.eq("GRCh38")
    & summary.ReviewStatus.isin(
        [
            "criteria provided, single submitter",
            "criteria provided, multiple submitters, no conflicts",
            "reviewed by expert panel",
            "practice guideline",
        ]
    )
]


In [None]:
# nmid is the part of each variant name that precedes the first "(".
# partition() is used instead of slicing at find("("): for a name with no "(",
# find() returns -1 and the slice would silently drop the last character,
# whereas partition() returns such a name unchanged.
summary = summary.assign(nmid=summary.Name.apply(lambda name: name.partition("(")[0]))

In [None]:
# Display the filtered summary table, now including the nmid column.
summary

In [None]:
# Keep the rows whose ClinicalSignificance is one of the six benign/pathogenic
# labels below; every other value is dropped.
clinvar = summary.loc[
    summary.ClinicalSignificance.isin(
        [
            "Likely benign",
            "Benign",
            "Pathogenic",
            "Likely pathogenic",
            "Benign/Likely benign",
            "Pathogenic/Likely pathogenic",
        ]
    )
]

In [None]:
# Display the rows that survived the clinical-significance filter.
clinvar

In [None]:
def extractVariantFromClinvar(n):
    """Parse the protein substitution out of a ClinVar variant name.

    The name is expected to end with ``(p.XxxNNNYyy)``, e.g.
    ``"... (p.Arg41Gly)"``, where ``Xxx``/``Yyy`` are three-letter residue
    codes and ``NNN`` is the 1-based position.

    Parameters:
        n: the variant name.

    Returns:
        ``(original, position, variant)`` with ``position`` converted to a
        zero-based int, or ``(nan, nan, nan)`` when ``n`` is not a string, has
        no ``(p.`` part, or the part is not of the ``XxxNNNYyy`` form (for
        example synonymous ``p.Arg1234=`` or ``p.Arg141*``).
    """
    unparsed = (np.nan, np.nan, np.nan)
    if not isinstance(n, str):
        return unparsed
    start = n.rfind("(p.")
    if start == -1:
        return unparsed
    # Drop the "(p." prefix and the closing ")".
    variant = n[start + 3 : -1]
    og, loc, var = variant[:3], variant[3:-3], variant[-3:]
    # Fixed-width slicing alone would accept strings such as "Arg141*" and
    # return ("Arg", 0, "41*"); require letters-digits-letters explicitly.
    # A non-empty loc guarantees that og and var are both three characters.
    if not (loc.isdecimal() and og.isalpha() and var.isalpha()):
        return unparsed
    return og, int(loc) - 1, var

In [None]:
# Attach the tuple produced by extractVariantFromClinvar for every variant name.
clinvar = clinvar.assign(variantInfo=clinvar.Name.apply(extractVariantFromClinvar))

In [None]:
# Outliers are the rows whose variantInfo tuple starts with NaN.
outliers = clinvar[clinvar.variantInfo.map(lambda info: pd.isna(info[0]))]

In [None]:
# For each outlier row keep the text before the first "|" of RCVaccession.
accessions = outliers.RCVaccession.apply(lambda joined: joined.partition("|")[0])

In [None]:
# Display the accession selected for each outlier row.
accessions

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Download and parse the ClinVar record of every outlier accession.
# An accession whose request fails with an HTTP error is recorded as NaN.
trees = {}
for accession in tqdm(accessions):
    try:
        record = getTree(accession)
    except Entrez.HTTPError:
        record = np.nan
    trees[accession] = record

In [None]:
# Number of distinct accessions stored in trees (failed fetches included).
len(trees)

In [None]:
import pickle

In [None]:
# Persist the fetched trees to disk. The context manager ensures the file is
# flushed and closed once the dump finishes (or fails).
with open("/data/projects/processBio/clinvar/clinvar/outlierTrees.pkl", "wb") as treeFile:
    pickle.dump(trees, treeFile)

In [None]:
# RefSeq protein HGVS attribute dicts per accession; entries stored as NaN
# are skipped.
treeVariants = {
    accession: getVariants(record)
    for accession, record in tqdm(trees.items())
    if not pd.isna(record)
}

In [None]:
# Clinical-significance description per accession; entries stored as NaN
# are skipped.
treeConsequences = {
    accession: getClinSig(record)
    for accession, record in tqdm(trees.items())
    if not pd.isna(record)
}

In [None]:
# Spot-check the variant attributes stored for one accession.
# NOTE(review): raises KeyError unless RCV000000036 is among the outlier accessions that were fetched — confirm.
treeVariants["RCV000000036"]

In [None]:
# Print each distinct clinical-significance value with its number of records.
sigValues, sigCounts = np.unique(list(treeConsequences.values()), return_counts=True)
for sigValue, sigCount in zip(sigValues, sigCounts):
    print(sigValue, "\t", sigCount)

In [None]:
# NOTE(review): Entrez is already imported in the first cell, and Entrez.email was
# set earlier to a different address; this assignment replaces it — confirm which is intended.
from Bio import Entrez
Entrez.email = 'zeiberg.d2@northeastern.edu'

def getSequenceFromNPID(npid):
    """Return the protein sequence for an NCBI protein accession.

    Fetches the FASTA record for ``npid`` from the Entrez ``protein``
    database, discards the first (header) line and concatenates the remaining
    lines, stripped of surrounding whitespace, into one string.

    Parameters:
        npid: accession passed as the ``id`` of the ``efetch`` request.

    Returns:
        The sequence as a single string.

    Any error raised by ``Entrez.efetch`` propagates unchanged.
    """
    handle = Entrez.efetch(db="protein", id=npid, rettype="fasta", retmode="text")
    try:
        lines = handle.readlines()
    finally:
        # Release the connection even if reading the response fails.
        handle.close()
    return "".join(line.strip() for line in lines[1:])

# Load the cached accession -> sequence mapping; the context manager closes
# the file as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code, so only load this file if it
# comes from a trusted source.
with open("/data/projects/processBio/ncbi/npid2seq.pkl", "rb") as cacheFile:
    npid2seq = pickle.load(cacheFile)

In [None]:
# Three-letter -> one-letter codes for the 20 amino acids listed here.
aaTable = dict(
    Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
    Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
    Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
    Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
)

In [None]:
# Build one record per parsable substitution of every fetched outlier record:
# the protein sequence with the variant residue substituted in, plus a binary
# label (0 = benign group, 1 = pathogenic group).
tableDicts = []
mismatchCount = 0
for rcv in tqdm(set(treeConsequences.keys()).intersection(set(treeVariants.keys()))):
    consequence = treeConsequences[rcv]
    variants = treeVariants[rcv]
    if consequence in ["Benign",
                       "Benign/Likely benign",
                       "Likely benign"]:
        label = 0
    elif consequence in ["Likely pathogenic",
                         "Pathogenic",
                         "Pathogenic/Likely pathogenic"]:
        label = 1
    else:
        print("invalid label")
        continue
    for variant in variants:
        # Validate the change before touching the sequence cache, so that no
        # network request is made for variants that will be skipped anyway.
        if "Change" not in variant:
            print("can't find change")
            continue
        varstr = variant["Change"]
        # "p.Arg41Gly" -> ("Arg", "41", "Gly")
        og, loc, var = varstr[2:5], varstr[5:-3], varstr[-3:]
        try:
            loc = int(loc) - 1  # HGVS positions are 1-based; seq is indexed from 0
        except ValueError:
            print("can't parse location", varstr)
            continue
        if og not in aaTable or var not in aaTable:
            print("invalid AAs", varstr)
            continue
        npid = variant["Accession"]
        if npid in npid2seq:
            seq = npid2seq[npid]
        else:
            seq = getSequenceFromNPID(npid)
            npid2seq[npid] = seq
        # seq uses one-letter codes, so translate through aaTable. The residue
        # found at loc must be the ORIGINAL amino acid of the change; the range
        # check keeps an out-of-bounds or negative position from raising
        # IndexError or silently wrapping around.
        if not (0 <= loc < len(seq)) or seq[loc] != aaTable[og]:
            print("~~~~~~~~ mismatch", varstr)
            mismatchCount += 1
            continue
        tableDicts.append({
            "npid": npid,
            "rcv": rcv,
            "label": label,
            # Substitute the one-letter variant residue at the mutated position.
            "seq": seq[:loc] + aaTable[var] + seq[loc + 1:]
        })



In [None]:
# Number of variants skipped by the mismatch check in the loop above.
mismatchCount

In [None]:
# Display the assembled records.
# NOTE(review): every record carries a full sequence string, so this output can be very large — consider showing only a slice.
tableDicts