In [1]:
from CADA.paths import DATA_DIRECTORY
import collections
import os
import pandas as pd
import re
import pickle

with open('../data/raw/ids/hpo_old_new.dict', 'rb') as handle:
    hpo_dict = pickle.load(handle)

with open('../data/raw/ids/hpo_id_name.dict', 'rb') as handle:
    hpo_id_name = pickle.load(handle)

with open('../data/raw/ids/gene_name_id.dict', 'rb') as handle:
    gene_name_id = pickle.load(handle)



## 1. Parse clinvar 'Pathogenic' and 'Likely pathogenic' submissions from submission_summary.txt

https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/

In [87]:
# parse submissions from Clinvar submission summary
submission_summary ='../data/raw/clinvar/submission_summary.txt'
clinvar_summary_data = collections.defaultdict(dict)


# data cleaning, keep submissions with known'Pathogenic' and 'Likely pathogenic' mutations
with open(submission_summary, 'r') as infile:
        content = infile.read().splitlines()[16:]
        content = [x.split('\t') for x in content]
        for line in content:
            # parse case id, submitter, gene
            clinvar_patient =  'Patient:'+line[10].split('.')[0]
            submitter= line[9]
            gene = line[11]
            # parse case significance
            significance = line[1]
                
            # only include case with 'Pathogenic' and 'Likely pathogenic' significance
            if significance in ['Pathogenic', 'Likely pathogenic']:
                if gene != '-' and gene in gene_name_id:
                    clinvar_summary_data[clinvar_patient]['gene_id'] = gene_name_id[gene]
                    clinvar_summary_data[clinvar_patient]['submitter'] = submitter
                                     
                                       
df_submission = pd.DataFrame.from_dict(clinvar_summary_data, orient='index')


In [88]:
df_submission

Unnamed: 0,gene_id,submitter
Patient:SCV000020155,Entrez:9907,OMIM
Patient:SCV000020156,Entrez:9907,OMIM
Patient:SCV000020158,Entrez:55572,OMIM
Patient:SCV000680696,Entrez:55572,GeneDx
Patient:SCV000020159,Entrez:55572,OMIM
...,...,...
Patient:SCV001164575,Entrez:57096,"Broad Institute Rare Disease Group,Broad Insti..."
Patient:SCV001164576,Entrez:57096,"Broad Institute Rare Disease Group,Broad Insti..."
Patient:SCV001164581,Entrez:1756,"Broad Institute Rare Disease Group,Broad Insti..."
Patient:SCV001164584,Entrez:4703,"Broad Institute Rare Disease Group,Broad Insti..."


## 2. Parse HPO terms from ClinVarFullRelease

https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/

In [None]:
# cmd = 'grep '^\s*<ClinVarAccession Acc="SCV\|^\s*<XRef.*ID="HP:' ClinVarFullRelease_2020-06.xml > full_release.txt'
# os.system(cmd)

In [58]:
full_release = 'full_release.txt'
clinvar_data = collections.defaultdict(dict)

out_tsv = 'clinvar_submissions.tsv'


with open(full_release, 'r') as infile:
        content = infile.read().split('ClinVarAccession')
        for i in content[1:]:
            if 'SCV' in i and 'HP:' in i:
                matchacc = re.match(' Acc="(SCV\d+)"', i)
                clinvar_patient = 'Patient:'+matchacc.group(1)
                hpo_ids = []
                hpo_names = []
                matchhpo = re.finditer('(HP:\d+)', i)
                for hpo in matchhpo:
                    hpo_ids.append(hpo_dict.get(hpo.group(), hpo.group()))
                    hpo_ids = list(set(hpo_ids))
                if len(hpo_ids) > 0 :
                    clinvar_data[clinvar_patient]['features'] = ','.join(hpo_ids)


df_full = pd.DataFrame.from_dict(clinvar_data, orient='index')
df_full

Unnamed: 0,features
Patient:SCV000045941,HP:0001627
Patient:SCV000077558,"HP:0000104,HP:0000707,HP:0000717,HP:0001263,HP..."
Patient:SCV000077566,"HP:0002311,HP:0002474,HP:0008619"
Patient:SCV000077569,HP:0001250
Patient:SCV000077571,HP:0001263
...,...
Patient:SCV001430295,"HP:0000107,HP:0005562"
Patient:SCV001430296,"HP:0000107,HP:0005562"
Patient:SCV001430297,"HP:0000107,HP:0005562"
Patient:SCV001430888,"HP:0005584,HP:0000256"


In [83]:
df = pd.merge(df_full, df_submission, how = 'inner', left_index=True, right_index=True)
df

Unnamed: 0,features,gene_id,submitter
Patient:SCV000148377,"HP:0010535,HP:0001319,HP:0001263,HP:0000750,HP...",Entrez:27245,Whole genome laboratory; Baylor College of Med...
Patient:SCV000148378,"HP:0010535,HP:0001319,HP:0001263,HP:0000750,HP...",Entrez:27245,Whole genome laboratory; Baylor College of Med...
Patient:SCV000148379,"HP:0010535,HP:0001319,HP:0001263,HP:0000750,HP...",Entrez:27245,Whole genome laboratory; Baylor College of Med...
Patient:SCV000189113,"HP:0001249,HP:0001319,HP:0001263,HP:0000750,HP...",Entrez:5813,Whole genome laboratory; Baylor College of Med...
Patient:SCV000189114,"HP:0001263,HP:0000750,HP:0001319,HP:0001249",Entrez:5813,Whole genome laboratory; Baylor College of Med...
...,...,...,...
Patient:SCV001162809,"HP:0000252,HP:0009109,HP:0001250,HP:0001263,HP...",Entrez:3028,"Institute of Human Genetics,Klinikum rechts de..."
Patient:SCV001162814,"HP:0001876,HP:0001653,HP:0000952,HP:0001250,HP...",Entrez:8085,"Institute of Human Genetics,Klinikum rechts de..."
Patient:SCV001162819,"HP:0000112,HP:0000100",Entrez:51196,"Institute of Human Genetics,Klinikum rechts de..."
Patient:SCV001162824,"HP:0002460,HP:0002093,HP:0007141",Entrez:79628,"Institute of Human Genetics,Klinikum rechts de..."


## 3. Remove submissions with identical features, gene and submitter

In [84]:
df_filtered_identical = df.drop_duplicates()
df_filtered_identical

Unnamed: 0,features,gene_id,submitter
Patient:SCV000148377,"HP:0010535,HP:0001319,HP:0001263,HP:0000750,HP...",Entrez:27245,Whole genome laboratory; Baylor College of Med...
Patient:SCV000189113,"HP:0001249,HP:0001319,HP:0001263,HP:0000750,HP...",Entrez:5813,Whole genome laboratory; Baylor College of Med...
Patient:SCV000189114,"HP:0001263,HP:0000750,HP:0001319,HP:0001249",Entrez:5813,Whole genome laboratory; Baylor College of Med...
Patient:SCV000192942,HP:0002282,Entrez:1641,"Genetic Services Laboratory, University of Chi..."
Patient:SCV000193153,HP:0000365,Entrez:2706,"Genetic Services Laboratory, University of Chi..."
...,...,...,...
Patient:SCV001162796,"HP:0007183,HP:0012752,HP:0012748,HP:0040168",Entrez:1181,"Institute of Human Genetics,Klinikum rechts de..."
Patient:SCV001162809,"HP:0000252,HP:0009109,HP:0001250,HP:0001263,HP...",Entrez:3028,"Institute of Human Genetics,Klinikum rechts de..."
Patient:SCV001162814,"HP:0001876,HP:0001653,HP:0000952,HP:0001250,HP...",Entrez:8085,"Institute of Human Genetics,Klinikum rechts de..."
Patient:SCV001162819,"HP:0000112,HP:0000100",Entrez:51196,"Institute of Human Genetics,Klinikum rechts de..."
