In [1]:
%%bash


gsutil cp -r gs://otar000-evidence_input/Gene2Phenotype/json/gene2phenotype-06-11-2020.json.gz \
    /Users/dsuveges/project_data/ot/evidence_input/21.02/gene2phenotype
    
    
gsutil cp -r gs://otar000-evidence_input/Gene2Phenotype/json/gene2phenotype-22-01-2021.json.gz \
    /Users/dsuveges/project_data/ot/evidence_input/20.11/gene2phenotype

Copying gs://otar000-evidence_input/Gene2Phenotype/json/gene2phenotype-06-11-2020.json.gz...
/ [0 files][    0.0 B/287.8 KiB]                                                / [1 files][287.8 KiB/287.8 KiB]                                                
Operation completed over 1 objects/287.8 KiB.                                    
Copying gs://otar000-evidence_input/Gene2Phenotype/json/gene2phenotype-22-01-2021.json.gz...
/ [0 files][    0.0 B/269.8 KiB]                                                / [1 files][269.8 KiB/269.8 KiB]                                                
Operation completed over 1 objects/269.8 KiB.                                    


In [26]:
import gzip
import pandas as pd
import json 
import requests

def parse_data(source_df):
    """
    Extracting and parsing target/disease and study information
    """
    
    df = pd.DataFrame(source_df['unique_association_fields'].tolist())
    df.target_id = df.target_id.apply(lambda x: x.split('/')[-1])
    df.disease_id = df.disease_id.apply(lambda x: x.split('/')[-1])
    df['study'] = df.apply(lambda row: f"{row['allelic_requirement']}_{row['gene_panel']}", axis=1)
    df.drop(['original_disease_label','mutation_consequence','allelic_requirement', 'gene_panel'], inplace=True, axis=1)
    
    return(df)


def get_first_report(df, title='Gene2Phenotype'):
    print(f'Processing {title}')
    print(f'\tNumber of evidence: {len(df)}')
    print(f'\tNumber of unique evidence: {len(df.drop_duplicates())}')
    print(f'\tNumber of unique associations: {len(df[["target_id","disease_id"]].drop_duplicates())}\n')

    
    
# These are the files to be read:
new_file = '/Users/dsuveges/project_data/ot/evidence_input/21.02/gene2phenotype/gene2phenotype-22-01-2021.json.gz'
old_file = '/Users/dsuveges/project_data/ot/evidence_input/20.11/gene2phenotype/gene2phenotype-06-11-2020.json.gz'

# Reading files:
new_df = parse_data(pd.read_json(new_file, lines=True))
old_df = parse_data(pd.read_json(old_file, lines=True))

# Generate report:
get_first_report(old_df, title='Gene2Phenotype 20.11 release')
get_first_report(new_df, title='Gene2Phenotype 21.02 release')

Processing Gene2Phenotype 20.11 release
	Number of evidence: 2880
	Number of unique evidence: 2857
	Number of unique associations: 2284

Processing Gene2Phenotype 21.02 release
	Number of evidence: 2605
	Number of unique evidence: 2600
	Number of unique associations: 2110



In [27]:
# Pooling evidence into associations:
new_pooled = new_df.groupby(['target_id','disease_id']).apply(lambda df: df.study.tolist()).rename("new_studies")
old_pooled = old_df.groupby(['target_id','disease_id']).apply(lambda df: df.study.tolist()).rename("old_studies")

old_pooled = old_pooled.to_frame().reset_index(level=1).reset_index(level=0)
new_pooled = new_pooled.to_frame().reset_index(level=1).reset_index(level=0)

# Merging associations:
merged = old_pooled.merge(new_pooled, how='outer', on=['target_id','disease_id'], indicator=True)
merged.head()

Unnamed: 0,target_id,disease_id,old_studies,new_studies,_merge
0,ENSG00000000419,Orphanet_137,"[biallelic_DD, biallelic_Skin]","[biallelic_DD, biallelic_Skin]",both
1,ENSG00000001497,HP_0001249,[hemizygous_DD],[hemizygous_DD],both
2,ENSG00000001630,Orphanet_91492,[biallelic_Eye],[biallelic_Eye],both
3,ENSG00000003400,Orphanet_3261,[monoallelic_Skin],[monoallelic_Skin],both
4,ENSG00000004399,Orphanet_570,"[monoallelic_DD, monoallelic_Eye]","[monoallelic_DD, monoallelic_Eye]",both


In [28]:
merged._merge.value_counts()

both          2071
left_only      213
right_only      39
Name: _merge, dtype: int64

### Summary of the changes:

* There are 39 new associations in the 21.02 release
* 213 associations were lost comparing to the 20.11 release.

## Checking with associations

In [13]:
association_file = '/Users/dsuveges/project_data/ot/output/21.02/21.02_association_data.json.gz'

associations = []
with gzip.open(association_file, 'r') as f:
    for line in f:
        data = json.loads(line)
        
        # We only care about direct interactions:
        if not data['is_direct']: 
            continue
            
        parsed = data['evidence_count']['datasources']
        
        # Adding more details:
        parsed.update({
            'target_id': data['target']['id'],
            'target_symbol': data['target']['gene_info']['symbol'],
            'disease_id': data['disease']['id'],
            'disease_label': data['disease']['efo_info']['label']
        }) 

        associations.append(parsed)
        
        
associations_df = pd.DataFrame(associations)
print(f"Number of direct interactions: {len(associations_df)}")
associations_df.head()

Number of direct interactions: 2009499


Unnamed: 0,expression_atlas,europepmc,slapenrich,crispr,progeny,intogen,cancer_gene_census,eva_somatic,sysbio,gwas_catalog,...,phewas_catalog,eva,gene2phenotype,postgap,reactome,uniprot_somatic,target_id,target_symbol,disease_id,disease_label
0,0.0,69740.0,230.0,7.0,11.0,14.0,187.0,105.0,0.0,0.0,...,8.0,58.0,0.0,0.0,557.0,0.0,ENSG00000146648,EGFR,EFO_0000616,neoplasm
1,0.0,561.0,115.0,2.0,0.0,3.0,158.0,27.0,0.0,0.0,...,7.0,1001.0,1.0,0.0,1.0,0.0,ENSG00000105976,MET,EFO_0000616,neoplasm
2,0.0,6665.0,480.0,7.0,11.0,87.0,260.0,715.0,0.0,0.0,...,0.0,220.0,1.0,0.0,43.0,0.0,ENSG00000121879,PIK3CA,EFO_0000616,neoplasm
3,0.0,49079.0,213.0,7.0,11.0,12.0,153.0,85.0,0.0,0.0,...,2.0,56.0,0.0,0.0,557.0,0.0,ENSG00000146648,EGFR,EFO_0000311,cancer
4,0.0,29411.0,184.0,7.0,11.0,11.0,126.0,85.0,0.0,0.0,...,1.0,57.0,0.0,0.0,2.0,0.0,ENSG00000146648,EGFR,EFO_0006858,epithelial neoplasm


In [29]:
# Function to retrieve annotation:
fetch_symbol = lambda x: requests.get(f'http://rest.ensembl.org/lookup/id/{x}?content-type=application/json').json()['display_name']
fetch_label = lambda x: requests.get(f'https://www.ebi.ac.uk/ols/api/terms?short_form={x}').json()['_embedded']['terms'][0]['label']


# Get all associations that changed from release:
changed_assoc = merged.loc[merged._merge != 'both']
changed_assoc['assoc_changed'] = merged._merge.apply(lambda x: 'Gained' if x == 'right_only' else "Lost")

# Dropping indicator column:
changed_assoc.drop(['_merge'], inplace=True, axis=1)

# Mapping labels/symbols;
changed_assoc['disease_label'] = changed_assoc.disease_id.apply(fetch_label)

changed_assoc['gene_symbol'] = changed_assoc.target_id.apply(fetch_symbol)

changed_assoc.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v

Unnamed: 0,target_id,disease_id,old_studies,new_studies,assoc_changed,disease_label,gene_symbol
6,ENSG00000004838,Orphanet_244,[biallelic_DD],,Lost,Primary ciliary dyskinesia,ZMYND10
11,ENSG00000005339,EFO_0010252,[monoallelic_DD],,Lost,Menke-Hennekam syndrome 1,CREBBP
19,ENSG00000006530,Orphanet_91492,[biallelic_Eye],,Lost,Early-onset non-syndromic cataract,AGK
22,ENSG00000007062,Orphanet_1872,[biallelic_Eye],,Lost,Cone rod dystrophy,PROM1
25,ENSG00000007168,Orphanet_99796,[monoallelic_DD],,Lost,Subcortical band heterotopia,PAFAH1B1


In [30]:
associations_merged =(
    changed_assoc
    .merge(associations_df.drop(['target_symbol','disease_label'], axis=1), on=['disease_id','target_id'], how='left')
)

print(f"Number of rows: {len(associations_merged)}")
associations_merged.head()


Number of rows: 252


Unnamed: 0,target_id,disease_id,old_studies,new_studies,assoc_changed,disease_label,gene_symbol,expression_atlas,europepmc,slapenrich,...,ot_genetics_portal,chembl,genomics_england,clingen,phewas_catalog,eva,gene2phenotype,postgap,reactome,uniprot_somatic
0,ENSG00000004838,Orphanet_244,[biallelic_DD],,Lost,Primary ciliary dyskinesia,ZMYND10,0.0,6.0,0.0,...,0.0,0.0,5.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0
1,ENSG00000005339,EFO_0010252,[monoallelic_DD],,Lost,Menke-Hennekam syndrome 1,CREBBP,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000006530,Orphanet_91492,[biallelic_Eye],,Lost,Early-onset non-syndromic cataract,AGK,0.0,2.0,0.0,...,0.0,0.0,1.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0
3,ENSG00000007062,Orphanet_1872,[biallelic_Eye],,Lost,Cone rod dystrophy,PROM1,0.0,10.0,0.0,...,0.0,0.0,1.0,0.0,0.0,124.0,0.0,0.0,0.0,0.0
4,ENSG00000007168,Orphanet_99796,[monoallelic_DD],,Lost,Subcortical band heterotopia,PAFAH1B1,0.0,20.0,0.0,...,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0


In [31]:
associations_merged.to_csv('G2P_associations_merged.tsv', index=False, sep='\t')