In [None]:
from CADA.paths import DATA_DIRECTORY
import collections
import os
import pandas as pd
import re
import pickle

def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``.

    pickle can execute arbitrary code while loading, so this must only be
    pointed at trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables used by the parsing cells further down.
# Queried as hpo_dict.get(hpo_id, hpo_id) when HPO ids are collected.
hpo_dict = _load_pickle('../data/raw/ids/hpo_old_new.dict')
# NOTE(review): presumably HPO id -> term name (going by the file name); confirm.
hpo_id_name = _load_pickle('../data/raw/ids/hpo_id_name.dict')
# Queried as gene_name_id[gene] with the gene field of each submission.
gene_name_id = _load_pickle('../data/raw/ids/gene_name_id.dict')



## 1. Parse ClinVar 'Pathogenic' and 'Likely pathogenic' submissions from `submission_summary.txt`

Source of `submission_summary.txt`: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/

In [None]:
# Parse submissions from the ClinVar submission summary (tab-delimited text)
# into {'Patient:<accession>': {'gene_id': ..., 'submitter': ...}}.
submission_summary ='../data/raw/clinvar/submission_summary.txt'
clinvar_summary_data = collections.defaultdict(dict)

# 0-based positions of the fields read from every record.
# NOTE(review): presumably the ClinicalSignificance, Submitter, SCV and
# SubmittedGeneSymbol columns -- confirm against the header of the release used.
significance_col, submitter_col, accession_col, gene_col = 1, 9, 10, 11
kept_significances = ('Pathogenic', 'Likely pathogenic')

# Data cleaning: keep submissions with known 'Pathogenic' and
# 'Likely pathogenic' mutations whose gene symbol has a known gene id.
with open(submission_summary, 'r') as infile:
    # Iterate over the handle instead of read().splitlines(): the file is not
    # materialised as two full lists, and records are only broken at real
    # newlines (splitlines() also breaks on other separator characters).
    for raw_line in infile:
        # NOTE(review): assumes every preamble/header line starts with '#', as
        # in the ClinVar tab-delimited releases. Skipping by prefix replaces
        # the fixed 16-line skip, which breaks when a release has a different
        # number of header lines.
        if raw_line.startswith('#'):
            continue

        line = raw_line.rstrip('\n').split('\t')
        # Blank or truncated records would raise IndexError below; skip them.
        if len(line) <= gene_col:
            continue

        # only include cases with 'Pathogenic' and 'Likely pathogenic' significance
        if line[significance_col] not in kept_significances:
            continue

        # only include cases whose gene symbol can be translated to a gene id
        gene = line[gene_col]
        if gene == '-' or gene not in gene_name_id:
            continue

        # case id: the accession field up to its first '.', prefixed with 'Patient:'
        clinvar_patient = 'Patient:' + line[accession_col].split('.')[0]
        clinvar_summary_data[clinvar_patient]['gene_id'] = gene_name_id[gene]
        clinvar_summary_data[clinvar_patient]['submitter'] = line[submitter_col]


# One row per kept submission, indexed by the 'Patient:<accession>' key.
df_submission = pd.DataFrame.from_dict(clinvar_summary_data, orient='index')


In [None]:
# Display the parsed submissions table (rows are indexed by the 'Patient:<accession>' keys).
df_submission

## 2. Parse HPO terms from ClinVarFullRelease

Source of the ClinVar full-release XML (the cells below use `ClinVarFullRelease_2020-06.xml`): https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/

In [None]:
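# One-off preprocessing step, kept commented out: reduce the full-release XML to the
# lines carrying an SCV accession or an HPO cross-reference. It writes full_release.txt,
# which the next cell parses. (Triple quotes so the shell quoting survives if re-enabled.)
# cmd = r'''grep '^\s*<ClinVarAccession Acc="SCV\|^\s*<XRef.*ID="HP:' ClinVarFullRelease_2020-06.xml > full_release.txt'''
# os.system(cmd)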

In [None]:
# Collect, for every SCV submission in the pre-filtered full release, the HPO
# ids attached to it: {'Patient:<SCV accession>': {'features': 'HP:..,HP:..'}}.
full_release = 'full_release.txt'
clinvar_data = collections.defaultdict(dict)

# Raw strings so that \d reaches the regex engine instead of being treated as
# an (invalid) string escape.
accession_pattern = re.compile(r' Acc="(SCV\d+)"')
hpo_pattern = re.compile(r'(HP:\d+)')

with open(full_release, 'r') as infile:
    # Splitting on the tag name yields one chunk per accession; each chunk
    # holds the text that follows that accession up to the next one. The text
    # before the first accession is dropped.
    chunks = infile.read().split('ClinVarAccession')[1:]

for chunk in chunks:
    if 'SCV' not in chunk or 'HP:' not in chunk:
        continue

    # The accession must sit right at the start of the chunk; chunks that do
    # not begin with one are skipped instead of raising AttributeError.
    matchacc = accession_pattern.match(chunk)
    if matchacc is None:
        continue
    clinvar_patient = 'Patient:' + matchacc.group(1)

    # Translate each id through hpo_dict (ids without an entry are kept as
    # they are) and de-duplicate once. Sorting makes the joined string
    # canonical: two submissions with the same HPO terms always get the same
    # 'features' value, which the later drop_duplicates() step relies on.
    hpo_ids = sorted({hpo_dict.get(hpo.group(), hpo.group())
                      for hpo in hpo_pattern.finditer(chunk)})
    if hpo_ids:
        clinvar_data[clinvar_patient]['features'] = ','.join(hpo_ids)


# One row per submission that has at least one HPO id.
df_full = pd.DataFrame.from_dict(clinvar_data, orient='index')
df_full

In [None]:
# Keep only the submissions present in both tables (inner join on the shared
# 'Patient:<accession>' index): HPO features plus gene id and submitter.
df = df_full.merge(
    df_submission,
    how='inner',
    left_index=True,
    right_index=True,
)
df

## 3. Remove submissions with identical features, gene and submitter

In [None]:
# Rows are compared on all columns of df; of each group of identical rows the
# first occurrence is kept (the pandas defaults, spelled out here).
df_filtered_identical = df.drop_duplicates(subset=None, keep='first')
df_filtered_identical