In [60]:
import pandas as pd

# Directory holding the CRISPR input tables. The location is machine-specific, so it
# is kept in a single place: change this one value when running elsewhere.
RESOURCE_DIR = '/Users/dsuveges/repositories/evidence_datasource_parsers/resources'

# Input files (all read below as tab-separated tables):
evidence = f'{RESOURCE_DIR}/crispr_evidence.tsv'
description = f'{RESOURCE_DIR}/crispr_descriptions.tsv'
cell_lines = f'{RESOURCE_DIR}/crispr_cell_lines.tsv'

# Load each table into its own dataframe:
evidence_df = pd.read_csv(evidence, sep='\t')
description_df = pd.read_csv(description, sep='\t')
cell_lines_df = pd.read_csv(cell_lines, sep='\t')


# Attach cell line names to the description rows. The label stored in
# `tissue_or_cancer_type` is looked up twice in the cell line table: first against
# the `Tissue` column, then against the `Cancer Type` column.
tissue_desc, cell_desc = (
    description_df.merge(
        cell_lines_df[['Name', label_column]],
        how='inner',
        left_on='tissue_or_cancer_type',
        right_on=label_column,
    )
    for label_column in ('Tissue', 'Cancer Type')
)

# Stack both sets of matches into one table:
merged_annotation = pd.concat([tissue_desc, cell_desc], ignore_index=True)

# Collapse to one row per (efo_id, tissue_or_cancer_type, method) holding the list of
# matched cell line names. `method` only serves as a grouping key and is dropped
# afterwards; the remaining columns are renamed.
pooled_annotation = (
    merged_annotation
    .groupby(['efo_id', 'tissue_or_cancer_type', 'method'])
    .agg({'Name': list})
    .reset_index()
    .drop(columns='method')
    .rename(columns={
        'efo_id': 'diseaseFromSourceMappedId',
        'Name': 'diseaseCellLines',
        'tissue_or_cancer_type': 'diseaseFromSource',
    })
)

##
## Parsing crispr evidence
##

# NOTE(review): this cell reads `CRISPR_SYMBOL_MAPPING` and `outputFile`, neither of
# which is assigned in this cell - both must already exist in the kernel before it runs.

# Drop the columns that are not needed, rename the remaining ones, swap every target
# identifier that is a key of CRISPR_SYMBOL_MAPPING for its mapped value (all other
# identifiers pass through unchanged) and divide the score by 100:
evidence_df = (
    evidence_df
    .drop(['pmid', 'gene_set_name', 'disease_name'], axis=1)
    .rename(columns={
        'target_id': 'targetFromSourceId',
        'disease_id': 'diseaseFromSourceMappedId',
        'score': 'resourceScore',
    })
    .assign(
        targetFromSourceId=lambda df: df.targetFromSourceId.map(
            lambda target: CRISPR_SYMBOL_MAPPING.get(target, target)
        ),
        resourceScore=lambda df: df.resourceScore / 100,
    )
)

# Merging evidence and annotation. The outer join keeps rows that have no partner on
# the other side; the `_merge` indicator column records which side each row came from.
annotated_evidence = evidence_df.merge(
    pooled_annotation, on='diseaseFromSourceMappedId', how='outer', indicator=True
)

# Checking if all disease terms got matched. A bare expression in the middle of a cell
# is never displayed, so the check is enforced explicitly: stop here rather than
# export rows that are missing either the evidence or the annotation half.
unmatched = annotated_evidence.loc[annotated_evidence['_merge'] != 'both']
assert unmatched.empty, (
    f'{len(unmatched)} rows were not matched between evidence and annotation:\n'
    f'{unmatched.head()}'
)
annotated_evidence = annotated_evidence.drop(columns='_merge')

# Update efo identifier: keep only the text after the last '/' of the identifier.
annotated_evidence['diseaseFromSourceMappedId'] = (
    annotated_evidence['diseaseFromSourceMappedId']
    .str.extract('/([^/]+?)$', expand=False)
)

# Adding new columns with constant values:
annotated_evidence['datasourceId'] = 'crispr'
annotated_evidence['datatypeId'] = 'affected_pathway'

# Export as gzip-compressed JSON lines, one record per row:
annotated_evidence.to_json(outputFile, compression='gzip', orient='records', lines=True)

Unnamed: 0,targetFromSourceId,diseaseFromSourceMappedId,resourceScore,diseaseFromSource,diseaseCellLines,datasourceId,datatypeId
0,ENSG00000110092,EFO_0000174,0.797813,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway
1,ENSG00000130725,EFO_0000174,0.585,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway
2,ENSG00000111142,EFO_0000174,0.579375,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway
3,ENSG00000152234,EFO_0000174,0.54125,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway
4,ENSG00000165501,EFO_0000174,0.525,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway


In [67]:

# Preview the first rows as JSON lines (one record per line). With `path_or_buf=None`
# the JSON comes back as a plain string; `compression` only applies when writing to a
# file, so it is not passed here.
print(annotated_evidence.head().to_json(path_or_buf=None, orient='records', lines=True))

{"targetFromSourceId":"ENSG00000110092","diseaseFromSourceMappedId":"EFO_0000174","resourceScore":0.797813,"diseaseFromSource":"Bone","diseaseCellLines":["CAL-72","ES4","ES5","ES8","EW-1","EW-16","EW-22","EW-7","MC-IXC","MHH-ES-1","SJSA-1","TC-71"],"datasourceId":"crispr","datatypeId":"affected_pathway"}
{"targetFromSourceId":"ENSG00000130725","diseaseFromSourceMappedId":"EFO_0000174","resourceScore":0.585,"diseaseFromSource":"Bone","diseaseCellLines":["CAL-72","ES4","ES5","ES8","EW-1","EW-16","EW-22","EW-7","MC-IXC","MHH-ES-1","SJSA-1","TC-71"],"datasourceId":"crispr","datatypeId":"affected_pathway"}
{"targetFromSourceId":"ENSG00000111142","diseaseFromSourceMappedId":"EFO_0000174","resourceScore":0.579375,"diseaseFromSource":"Bone","diseaseCellLines":["CAL-72","ES4","ES5","ES8","EW-1","EW-16","EW-22","EW-7","MC-IXC","MHH-ES-1","SJSA-1","TC-71"],"datasourceId":"crispr","datatypeId":"affected_pathway"}
{"targetFromSourceId":"ENSG00000152234","diseaseFromSourceMappedId":"EFO_0000174","re

In [46]:
# Lookup table for target identifiers that need replacing: a value found among the
# keys is swapped for the mapped ENSG identifier.
# NOTE(review): the keys look like gene symbols, except `ENSG00000163660`, which maps
# to itself and therefore changes nothing - confirm whether that entry is intentional.
CRISPR_SYMBOL_MAPPING = dict(
    CASC5='ENSG00000137812',
    CIRH1A='ENSG00000141076',
    EFTUD1='ENSG00000140598',
    ENSG00000163660='ENSG00000163660',
    KIAA0947='ENSG00000164151',
    KIAA1432='ENSG00000107036',
    NDNL2='ENSG00000185115',
    SRPR='ENSG00000182934',
    ZNF259='ENSG00000109917',
)

# Display the `target_id` column with every value that is a key of
# CRISPR_SYMBOL_MAPPING replaced by its mapped identifier; other values are shown as-is.
# Nothing is assigned, so `evidence_df` itself is left unchanged.
evidence_df['target_id'].map(lambda target_id: CRISPR_SYMBOL_MAPPING.get(target_id, target_id))


0       ENSG00000110092
1       ENSG00000130725
2       ENSG00000111142
3       ENSG00000152234
4       ENSG00000165501
             ...       
1841    ENSG00000108107
1842    ENSG00000163659
1843    ENSG00000072274
1844    ENSG00000133112
1845    ENSG00000175063
Name: target_id, Length: 1846, dtype: object

In [53]:
# Outer-join the evidence with the pooled annotation on the disease identifier
# (`disease_id` on the evidence side, `efo_id` on the annotation side). The added
# `_merge` column records whether a row came from both tables or only one of them.
annotated_evidence = pd.merge(
    evidence_df,
    pooled_annotation,
    how='outer',
    left_on='disease_id',
    right_on='efo_id',
    indicator=True,
)

# Display the rows that found no partner on the other side:
annotated_evidence[annotated_evidence['_merge'] != 'both']

Unnamed: 0,target_id,disease_id,disease_name,score,efo_id,tissue_or_cancer_type,method,Name,_merge


In [57]:
# Display the first five rows of the pooled annotation table.
pooled_annotation.iloc[:5]

Unnamed: 0,efo_id,tissue_or_cancer_type,method,Name
0,http://purl.obolibrary.org/obo/MONDO_0002038,Head and Neck Carcinoma,64 targets were prioritised based on CRISPR sc...,"[A253, BB30-HNC, Detroit562, FADU, JHU-011, LB..."
1,http://purl.obolibrary.org/obo/MONDO_0021637,Central Nervous System,72 targets were prioritised based on CRISPR sc...,"[SF126, U251, 8-MG-BA, A172, AM-38, Becker, CA..."
2,http://purl.obolibrary.org/obo/MONDO_0044925,Oral Cavity Carcinoma,49 targets were prioritised based on CRISPR sc...,"[BHY, BICR10, BICR22, BICR78, CAL-27, CAL-33, ..."
3,http://www.ebi.ac.uk/efo/EFO_0000174,Bone,113 targets were prioritised based on CRISPR s...,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW..."
4,http://www.ebi.ac.uk/efo/EFO_0000178,Gastric Carcinoma,168 targets were prioritised based on CRISPR s...,"[23132-87, HGC-27, HSC-39, KATOIII, MKN28, NCI..."


In [72]:
import numpy as np
# Fill the `_merge` column with randomly drawn indicator labels, one per row.
# NOTE(review): presumably synthetic data for trying out the `_merge` filter - confirm
# this cell is not meant to be part of the real pipeline.
# The generator is seeded so that the same labels are drawn on every run.
merge_label_rng = np.random.default_rng(42)
annotated_evidence['_merge'] = merge_label_rng.choice(
    ['both', 'left_only', 'right_only'], size=len(annotated_evidence), replace=True
)
annotated_evidence.head()

Unnamed: 0,targetFromSourceId,diseaseFromSourceMappedId,resourceScore,diseaseFromSource,diseaseCellLines,datasourceId,datatypeId,_merge
0,ENSG00000110092,EFO_0000174,0.797813,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway,right_only
1,ENSG00000130725,EFO_0000174,0.585,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway,both
2,ENSG00000111142,EFO_0000174,0.579375,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway,right_only
3,ENSG00000152234,EFO_0000174,0.54125,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway,right_only
4,ENSG00000165501,EFO_0000174,0.525,Bone,"[CAL-72, ES4, ES5, ES8, EW-1, EW-16, EW-22, EW...",crispr,affected_pathway,both


In [74]:
# Keep only the rows whose `_merge` label is anything other than 'both'.
annotated_evidence = annotated_evidence.loc[lambda frame: frame['_merge'] != 'both']