In [27]:
import functools
import gzip
import json

import pandas as pd
import requests

def parse_data(source_df):
    """
    Flatten the `unique_association_fields` entries of an evidence table.

    The entries of the `unique_association_fields` column (assumed to be
    dict-like records) are expanded into a new DataFrame. `target_id` is
    reduced to the text after its last '/', a `study` column is derived the
    same way from `report_url`, and `report_url` itself is removed.

    Returns the flattened DataFrame.
    """

    def last_segment(value):
        # Text after the final '/'; the whole string when there is no '/'.
        return value.rsplit('/', 1)[-1]

    records = source_df['unique_association_fields'].tolist()
    parsed = pd.DataFrame(records)

    parsed['target_id'] = parsed.target_id.map(last_segment)
    parsed['study'] = parsed.report_url.map(last_segment)

    return parsed.drop(columns='report_url')


def get_first_report(df, title='Clingen'):
    """
    Print basic counts for an evidence table.

    Reports the number of rows, the number of distinct rows and the number
    of distinct (`target_id`, `disease_id`) pairs, under a heading built
    from `title`. Nothing is returned.
    """
    print(f'Processing {title}')

    evidence_count = len(df)
    print(f'\tNumber of evidence: {evidence_count}')

    unique_evidence = df.drop_duplicates()
    print(f'\tNumber of unique evidence: {len(unique_evidence)}')

    # An association is a distinct target/disease pair, whatever the study.
    association_pairs = df[["target_id", "disease_id"]].drop_duplicates()
    print(f'\tNumber of unique associations: {len(association_pairs)}\n')

    
# Evidence files of the two releases being compared:
new_file = '/Users/dsuveges/project_data/ot/evidence_input/21.02/clingen/clingen-2021-01-18.json.gz'
old_file = '/Users/dsuveges/project_data/ot/evidence_input/20.11/clingen/clingen-2020-11-05.json.gz'

# Each file is line-delimited JSON; read it and flatten it with parse_data
# (the new release is read first, then the old one):
new_df, old_df = (
    parse_data(pd.read_json(path, lines=True))
    for path in (new_file, old_file)
)

# Summary counts, older release first:
for release_title, release_df in (
    ('Clingen 20.11 release', old_df),
    ('Clingen 21.02 release', new_df),
):
    get_first_report(release_df, title=release_title)


Processing Clingen 20.11 release
	Number of evidence: 1087
	Number of unique evidence: 1087
	Number of unique associations: 1073

Processing Clingen 21.02 release
	Number of evidence: 1189
	Number of unique evidence: 1189
	Number of unique associations: 1172



In [49]:
def pool_studies(evidence_df, column_name):
    """
    Collect the studies of every target/disease pair into one list.

    Parameters
    ----------
    evidence_df : DataFrame with `target_id`, `disease_id` and `study` columns.
    column_name : name given to the column holding the list of studies.

    Returns
    -------
    DataFrame with one row per (`target_id`, `disease_id`) pair and the
    columns `target_id`, `disease_id` and `column_name`.
    """
    return (
        evidence_df
        # Selecting the `study` column before aggregating keeps the grouping
        # keys out of the aggregated data (applying a function to whole
        # groups that include the keys is deprecated in recent pandas).
        .groupby(['target_id', 'disease_id'])['study']
        .agg(list)
        .rename(column_name)
        .reset_index()
    )


new_pooled = pool_studies(new_df, 'new_studies')
old_pooled = pool_studies(old_df, 'old_studies')



In [51]:
# Outer join of the pooled releases on the target/disease pair. The `_merge`
# indicator column tells where each pair was found: 'left_only' (old release
# only), 'right_only' (new release only) or 'both'.
merged = pd.merge(
    old_pooled,
    new_pooled,
    on=['target_id', 'disease_id'],
    how='outer',
    indicator=True,
)
merged.head()

Unnamed: 0,target_id,disease_id,old_studies,new_studies,_merge
0,ENSG00000004700,MONDO_0016248,[8701],[8701],both
1,ENSG00000004700,MONDO_0016419,[8700],[8700],both
2,ENSG00000004848,MONDO_0016021,[3a5357ce-2161-4a7f-a8e1-a1dba4550e57--2019-06...,[3a5357ce-2161-4a7f-a8e1-a1dba4550e57--2019-06...,both
3,ENSG00000005339,MONDO_0008393,[9554],[9554],both
4,ENSG00000005893,MONDO_0010281,[10060],[10060],both


## Associations that were lost

In [66]:
# Functions to retrieve annotation. Results are cached because the same
# identifier occurs in many rows and each lookup is a network round trip.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks forever


@functools.lru_cache(maxsize=None)
def fetch_symbol(x):
    """
    Return the `display_name` reported by the Ensembl REST lookup for id `x`.

    Raises `requests.HTTPError` when the service answers with an error status.
    """
    response = requests.get(
        f'http://rest.ensembl.org/lookup/id/{x}?content-type=application/json',
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()['display_name']


@functools.lru_cache(maxsize=None)
def fetch_label(x):
    """
    Return the label of the first term the OLS API finds for short form `x`.

    Raises `requests.HTTPError` when the service answers with an error status.
    """
    response = requests.get(
        f'https://www.ebi.ac.uk/ols/api/terms?short_form={x}',
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()['_embedded']['terms'][0]['label']


# Pairs found only in the old release ('left_only') are the lost associations.
# `.assign` builds a new frame, so no columns are set on a slice of `merged`.
lost_assoc = (
    merged
    .loc[merged._merge == 'left_only']
    .assign(
        gene_symbol=lambda df: df.target_id.apply(fetch_symbol),
        disease_label=lambda df: df.disease_id.apply(fetch_label),
    )
)

lost_assoc[['gene_symbol','disease_label']]

Unnamed: 0,gene_symbol,disease_label
253,SLC6A4,complex neurodevelopmental disorder
505,BIN1,"myopathy, centronuclear, 2"
697,SHROOM4,complex neurodevelopmental disorder
985,RELN,complex neurodevelopmental disorder
1014,ZNF81,non-syndromic X-linked intellectual disability


## Associations gained or lost (all changes between the two releases)

In [79]:
# Every pair that is not present in both releases has changed. After the
# filter `_merge` is either 'left_only' (old release only, i.e. lost) or
# 'right_only' (new release only, i.e. gained).
# `.assign` returns a new frame, so no columns are set on a slice of `merged`.
changed_assoc = (
    merged
    .loc[merged._merge != 'both']
    .assign(
        assoc_changed=lambda df: df._merge.apply(
            lambda x: 'Association lost' if x == 'left_only' else 'Association gained'
        ),
        target_symbol=lambda df: df.target_id.apply(fetch_symbol),
        disease_label=lambda df: df.disease_id.apply(fetch_label),
    )
)

# Export the annotated changes; the compression is inferred from the '.gz' suffix.
(
    changed_assoc[['target_symbol', 'target_id',
                   'disease_label', 'disease_id',
                   'old_studies', 'new_studies',
                   'assoc_changed']]
    .to_csv('clingen_changes.tsv.gz', index=False, sep='\t', compression='infer')
)

In [78]:
%%bash

# List the working directory to confirm that the export was written.
ls -lah
# Show the first lines of the compressed export as an aligned table.
# `gzip -dc` replaces `gzcat`, which is not available on every platform.
gzip -dc clingen_changes.tsv.gz | head -n10 | column -ts $'\t'

total 48
drwxrwxr-x   5 dsuveges  384566875   160B 15 Feb 15:33 .
drwxrwxr-x  31 dsuveges  384566875   992B 15 Feb 13:48 ..
drwxrwxr-x   3 dsuveges  384566875    96B 15 Feb 13:49 .ipynb_checkpoints
-rw-rw-r--   1 dsuveges  384566875    14K 15 Feb 15:33 New_clingen_data.ipynb
-rw-rw-r--   1 dsuveges  384566875   5.8K 15 Feb 15:33 clingen_changes.tsv.gz
target_id        disease_id     old_studies                                                    new_studies  _merge              assoc_changed  target_symbol                                   disease_label
ENSG00000108576  MONDO_0100038  ['cc4b0612-0a53-4e36-b737-5c0f49a387a1--2018-05-02T17:00:00']  left_only    Association lost    SLC6A4         complex neurodevelopmental disorder
ENSG00000136717  MONDO_0009709  ['c4b0f110-f8ff-4814-98c7-faa78a7cba16--2020-06-26T14:02:37']  left_only    Association lost    BIN1           myopathy, centronuclear, 2
ENSG00000158352  MONDO_0100038  ['a94d20f8-9909-48b2-9860-d17c6dc73ec3--2020-07-15T20:01:18'

## Checking the changed associations against the 21.02 association data

In [90]:
association_file = '/Users/dsuveges/project_data/ot/output/21.02/21.02_association_data.json.gz'

# One flat record per retained association: the per-datasource evidence
# counts together with the target and disease annotation.
associations = []
with gzip.open(association_file, 'r') as handle:
    for raw_line in handle:
        record = json.loads(raw_line)

        # Only direct associations with a non-zero clingen evidence count are kept.
        if record['is_direct'] and record['evidence_count']['datasources']['clingen'] != 0:
            associations.append({
                **record['evidence_count']['datasources'],
                # Adding more details:
                'target_id': record['target']['id'],
                'target_symbol': record['target']['gene_info']['symbol'],
                'disease_id': record['disease']['id'],
                'disease_label': record['disease']['efo_info']['label'],
            })


associations_df = pd.DataFrame(associations)
associations_df.head()

Unnamed: 0,expression_atlas,europepmc,slapenrich,crispr,progeny,intogen,cancer_gene_census,eva_somatic,sysbio,gwas_catalog,...,phewas_catalog,eva,gene2phenotype,postgap,reactome,uniprot_somatic,target_id,target_symbol,disease_info,disease_label
0,0.0,69740.0,230.0,7.0,11.0,14.0,187.0,105.0,0.0,0.0,...,8.0,58.0,0.0,0.0,557.0,0.0,ENSG00000146648,EGFR,EFO_0000616,neoplasm
1,0.0,561.0,115.0,2.0,0.0,3.0,158.0,27.0,0.0,0.0,...,7.0,1001.0,1.0,0.0,1.0,0.0,ENSG00000105976,MET,EFO_0000616,neoplasm
2,0.0,6665.0,480.0,7.0,11.0,87.0,260.0,715.0,0.0,0.0,...,0.0,220.0,1.0,0.0,43.0,0.0,ENSG00000121879,PIK3CA,EFO_0000616,neoplasm
3,0.0,49079.0,213.0,7.0,11.0,12.0,153.0,85.0,0.0,0.0,...,2.0,56.0,0.0,0.0,557.0,0.0,ENSG00000146648,EGFR,EFO_0000311,cancer
4,0.0,29411.0,184.0,7.0,11.0,11.0,126.0,85.0,0.0,0.0,...,1.0,57.0,0.0,0.0,2.0,0.0,ENSG00000146648,EGFR,EFO_0006858,epithelial neoplasm


In [102]:
# Evidence counts per association, without the annotation columns that
# `changed_assoc` already provides.
evidence_counts = associations_df.drop(columns=['disease_label', 'target_symbol'])

changed_columns = [
    'target_id', 'target_symbol',
    'disease_id', 'disease_label',
    'old_studies', 'new_studies',
    'assoc_changed',
]

# Left join: every changed association is kept; the evidence-count columns
# stay empty where the association data has no matching target/disease pair.
associations_merged = pd.merge(
    changed_assoc[changed_columns],
    evidence_counts,
    how='left',
    on=['disease_id', 'target_id'],
)
associations_merged.head()

Unnamed: 0,target_id,target_symbol,disease_id,disease_label,old_studies,new_studies,assoc_changed,expression_atlas,europepmc,slapenrich,...,ot_genetics_portal,chembl,genomics_england,clingen,phewas_catalog,eva,gene2phenotype,postgap,reactome,uniprot_somatic
0,ENSG00000108576,SLC6A4,MONDO_0100038,complex neurodevelopmental disorder,[cc4b0612-0a53-4e36-b737-5c0f49a387a1--2018-05...,,Association lost,,,,...,,,,,,,,,,
1,ENSG00000136717,BIN1,MONDO_0009709,"myopathy, centronuclear, 2",[c4b0f110-f8ff-4814-98c7-faa78a7cba16--2020-06...,,Association lost,,,,...,,,,,,,,,,
2,ENSG00000158352,SHROOM4,MONDO_0100038,complex neurodevelopmental disorder,[a94d20f8-9909-48b2-9860-d17c6dc73ec3--2020-07...,,Association lost,,,,...,,,,,,,,,,
3,ENSG00000189056,RELN,MONDO_0100038,complex neurodevelopmental disorder,[c654081c-5abd-4f86-a3f7-90d7f117b93d--2019-05...,,Association lost,,,,...,,,,,,,,,,
4,ENSG00000197779,ZNF81,MONDO_0019181,non-syndromic X-linked intellectual disability,[a3ada757-6500-46a1-a89e-42668bbdb934--2018-06...,,Association lost,,,,...,,,,,,,,,,


In [99]:
# Gained associations that have a match in the association data, i.e. whose
# `clingen` evidence count was filled in by the left join.
# The previous trailing `[[]]` selected zero columns, so nothing was shown.
(
    associations_merged
    .loc[
        (associations_merged.assoc_changed == 'Association gained')
        & associations_merged.clingen.notna()
    ]
)

Unnamed: 0,target_id,disease_id,old_studies,new_studies,assoc_changed,expression_atlas,europepmc,slapenrich,crispr,progeny,...,genomics_england,clingen,phewas_catalog,eva,gene2phenotype,postgap,reactome,uniprot_somatic,target_symbol,disease_label
5,ENSG00000004848,MONDO_0100038,,[ccf3ab49-ee8f-4980-aa04-ca13c21c124a--2020-12...,Association gained,0.0,20.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,ARX,complex neurodevelopmental disorder
9,ENSG00000036257,MONDO_0100038,,[fb529ace-86b5-4209-9974-9c07a1c32956--2021-01...,Association gained,0.0,2.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,CUL3,complex neurodevelopmental disorder
10,ENSG00000050555,MONDO_0100038,,[0ff89702-c97e-4ef9-91ce-2e37b66efd84--2020-09...,Association gained,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,LAMC3,complex neurodevelopmental disorder
15,ENSG00000072195,MONDO_0014418,,[3a8f91f1-da1e-489d-84e1-46aa543b3df9--2020-01...,Association gained,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,SPEG,"myopathy, centronuclear, 5"
17,ENSG00000083093,MONDO_0012565,,[3ebabbf1-1a43-4a57-b0b8-2b29bb57ade1--2019-08...,Association gained,0.0,0.0,0.0,0.0,0.0,...,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,PALB2,Fanconi anemia complementation group N
18,ENSG00000083123,MONDO_0023692,,[0b0d314c-7355-441c-a357-72ba3e566c57--2019-02...,Association gained,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,BCKDHB,maple syrup urine disease type 1B
20,ENSG00000100285,MONDO_0014836,,[8e1390ea-fd6c-4fa4-9b43-0eccff9e2829--2020-10...,Association gained,0.0,3.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,NEFH,Charcot-Marie-Tooth disease axonal type 2CC
24,ENSG00000108176,MONDO_0044304,,[500ad0d5-2871-4b25-a96f-0c0816524a4b--2021-01...,Association gained,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,DNAJC12,hyperphenylalaninemia due to DNAJC12 deficiency
26,ENSG00000108946,MONDO_0008057,,[fc16b2a9-c8dc-4a04-a029-2f6af049404d--2018-12...,Association gained,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,PRKAR1A,"Carney complex, type 1"
27,ENSG00000110436,MONDO_0014916,,[ebb83674-206f-461b-b98e-e12420208deb--2020-10...,Association gained,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,SLC1A2,"developmental and epileptic encephalopathy, 41"


In [103]:
# Save the changed associations with their evidence counts as a
# tab-separated file, without the row index.
associations_merged.to_csv(
    'associations_merged.tsv',
    sep='\t',
    index=False,
)