In [None]:
import pandas as pd

In [None]:
# Load the tab-separated ClinVar variant summary into a DataFrame
summary = pd.read_csv("data/clinvar/variant_summary.txt", sep="\t")

In [None]:
# Review statuses to keep; matched against the ReviewStatus column
goodStatus = [
    "criteria provided, single submitter",
    "criteria provided, multiple submitters, no conflicts",
    "reviewed by expert panel",
    "practice guideline",
]

Get all variants that have one of the review statuses we are interested in, are SNVs, are on assembly 38, and whose name field is in the form I'm expecting (it contains a protein change `(p.` and no `=`).

In [None]:
# Show the loaded summary table (the notebook renders the last expression)
summary

In [None]:
# Substring tests on the text columns. They are matched literally
# (regex=False) so that the "." in "(p." is not treated as a regex wildcard;
# the variant extraction further down searches for the same literal "(p.".
# na=False makes a missing Name/Assembly count as "no match" instead of
# propagating NaN into the boolean mask (which would break the "~" below).
hasProteinChange = summary.Name.str.contains("(p.", regex=False, na=False)
hasEquals = summary.Name.str.contains("=", regex=False, na=False)
onAssembly38 = summary.Assembly.str.contains("38", regex=False, na=False)

# Keep rows that have an accepted review status, are single nucleotide
# variants, name a protein-level change without "=", and are on assembly 38.
missense = summary[summary.ReviewStatus.isin(goodStatus)
                   & (summary.Type == "single nucleotide variant")
                   & hasProteinChange
                   & ~hasEquals
                   & onAssembly38]

In [None]:
# Show the rows that passed the filter
missense

For each SNV, get the canonical sequence of its gene.

In [None]:
from processBioDBs.utilities import getSequence

In [None]:
from easydict import EasyDict

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Cache of fetched sequences, keyed by gene symbol
sequences = EasyDict()

In [None]:
# Gene symbols whose lookup failed, mapped to the error that was raised, so
# the misses can be inspected afterwards instead of vanishing silently.
failedSymbols = {}

# Fetch the sequence for every distinct gene symbol that is not cached yet.
for symbol in tqdm(missense.GeneSymbol.unique()):
    if symbol in sequences:
        continue
    try:
        sequences[symbol] = getSequence(symbol)
    except Exception as err:
        # Best effort: skip symbols that cannot be resolved. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        failedSymbols[symbol] = repr(err)

Dump to pickle file

In [None]:
import pickle

In [None]:
# Persist the fetched sequences so later runs can skip the lookup loop
with open("data/clinvar/seqs.pkl", "wb") as seqFile:
    pickle.dump(sequences, seqFile)

Reload the pickle (if the sequences weren't just processed above).

In [None]:
# Reload the cached sequences. The context manager closes the file handle,
# which is otherwise left open when open() is passed straight to pickle.load.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open("data/clinvar/seqs.pkl", "rb") as f:
    sequences = pickle.load(f)

3-letter to 1-letter amino acid (AA) conversion table

In [None]:
# Maps each three-letter amino acid code to its one-letter code (20 entries).
# Codes that are absent from this table are treated as invalid further down.
aatable = dict(
    Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
    Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
    Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
    Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
)

Get the variant from the name (expecting the form: \<original amino acid\>\<location\>\<new amino acid\>)

In [None]:
# Text between "(p." and the final character of the name, e.g. "Ala12Val"
missense = missense.assign(
    variant=missense.Name.map(lambda name: name[name.find("(p.") + 3:-1])
)

# Split the variant into its first three characters (original amino acid),
# its last three characters (new amino acid) and whatever lies in between
# (the location). "loc" has to be read back with missense["loc"] because
# DataFrame.loc is already an attribute.
missense = missense.assign(og=missense.variant.str[:3])
missense = missense.assign(loc=missense.variant.str[3:-3])
missense = missense.assign(newAA=missense.variant.str[-3:])

In [None]:
# Show the frame with the added variant, og, loc and newAA columns
missense

Drop rows where the original or the new amino acid isn't valid.

In [None]:
# Positional (0-based) row numbers of the rows to drop, not index labels
badRows = set()

In [None]:
# Mark the position of every row whose original amino acid code is unknown
badRows.update(pos for pos, code in enumerate(missense.og) if code not in aatable)

In [None]:
# Mark the position of every row whose new amino acid code is unknown
badRows.update(pos for pos, code in enumerate(missense.newAA) if code not in aatable)

In [None]:
# Translate the collected row positions into index labels, then drop them
rowsToDrop = missense.index[list(badRows)]
missense = missense.drop(index=rowsToDrop)

Make sure there are no duplicate names.

In [None]:
# Keep only the first row for each distinct Name
missense = missense.drop_duplicates(subset="Name", keep="first")

In [None]:
# Show the frame after removing duplicate names
missense

Find the rows where the canonical sequence I found matches the original amino acid given in the summary table.

In [None]:
from tqdm.notebook import tqdm

In [None]:
import numpy as np

In [None]:
# matches[i] is 1 when the fetched sequence for row i's gene has the row's
# original amino acid at the row's (1-based) location, and 0 otherwise.
matches = np.zeros(missense.shape[0])

# Iterate over just the three needed columns; "loc" is read with [] because
# DataFrame.loc is already an attribute.
rowFields = zip(missense.GeneSymbol, missense["loc"], missense.og)
for i, (symbol, position, originalAA) in tqdm(enumerate(rowFields),
                                              total=missense.shape[0]):
    if symbol not in sequences:
        continue
    s = sequences[symbol]
    try:
        # Convert the 1-based location to a 0-based string index
        loc = int(position) - 1
    except (TypeError, ValueError):
        # A location that is not a plain integer cannot be looked up
        continue
    # Skip out-of-range locations. The lower bound matters: a location of 0
    # would give index -1, which silently wraps to the last residue.
    if not 0 <= loc < len(s):
        continue
    if s[loc] == aatable[originalAA]:
        matches[i] = 1
        

In [None]:
# Attach the 0/1 match flags as a new column; values are assigned by position
missense = missense.assign(match=matches)

In [None]:
# Show the frame with the added match column
missense

In [None]:
# Keep only the rows whose match flag is 1
df = missense.loc[missense["match"] == 1]

In [None]:
# Show the rows whose match flag is 1
df

In [None]:
# Keep only rows whose ClinicalSignificance is exactly one of these values
keptSignificance = [
    'Uncertain significance',
    'Likely benign',
    'Benign',
    'Pathogenic',
    'Likely pathogenic',
    'Benign/Likely benign',
    'Pathogenic/Likely pathogenic',
]
df = df.loc[df['ClinicalSignificance'].isin(keptSignificance)]

In [None]:
# Show the rows kept after filtering on ClinicalSignificance
df

In [None]:
# Boolean label: True when ClinicalSignificance mentions "pathogenic" in
# either capitalisation, False otherwise.
# NOTE(review): "Uncertain significance" rows therefore get label False, the
# same as the benign rows - confirm that this is intended.
significanceText = df.ClinicalSignificance.str
mentionsLower = significanceText.contains("pathogenic")
mentionsUpper = significanceText.contains("Pathogenic")
df = df.assign(label=mentionsLower | mentionsUpper)

In [None]:
# Show the final frame including the boolean label column
df