In [59]:
import pandas as pd
import json
import gzip
import math

# Directory holding the SysBio input tables. The location is machine-specific,
# so it is defined once here; edit this single value to run the notebook elsewhere.
RESOURCE_DIR = '/Users/dsuveges/repositories/evidence_datasource_parsers/resources'

# Evidence data (tab-separated):
evidence_file = f'{RESOURCE_DIR}/sysbio_evidence-31-01-2019.tsv'
evidence_df = pd.read_csv(evidence_file, sep='\t')

# Publication info (tab-separated):
publication_file = f'{RESOURCE_DIR}/sysbio_publication_info_nov2018.tsv'
publication_df = pd.read_csv(publication_file, sep='\t')

In [2]:
# Preview the first five evidence rows.
evidence_df.head(n=5)

Unnamed: 0,pmid,gene_set_name,target_id,disease_id,disease_name,score
0,28892060,Intestine Key Driver Genes,CD53,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.08337
1,28892060,Intestine Key Driver Genes,RHOH,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.116733
2,28892060,Intestine Key Driver Genes,DOCK2,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.122212
3,28892060,Intestine Key Driver Genes,FGR,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.132903
4,28892060,Intestine Key Driver Genes,NCKAP1L,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.130997


In [12]:
# Attach the publication metadata to every evidence row via the gene set name.
# The 'pmid' column of the publication table is left out of the join
# (presumably to avoid a duplicate of the evidence table's own 'pmid' column).
publication_info = publication_df.drop(columns='pmid')
merged = evidence_df.merge(publication_info, how='outer', on='gene_set_name', indicator=True)

# Sanity check: an outer join keeps rows found in only one of the two tables;
# the indicator column marks them with something other than 'both'.
unmatched = merged.loc[merged._merge != 'both']
if len(unmatched) > 0:
    print(f'{len(unmatched)} rows could not be joined.')
    print(unmatched)

In [7]:
# Number of rows in the publication table.
publication_df.shape[0]

6

In [57]:
# Distinct score types present in the publication table (missing values included).
publication_df['score_type'].unique()

array(['p-value', 'rank', nan], dtype=object)

array(['Intestine Key Driver Genes',
       'Macrophage-specific Key Driver Genes',
       'Key causal regulators of LOAD-related microglia modules in all brain regions',
       'Key causal regulators of top 20 LOAD-related modules in the prefrontal cortex',
       'Key Drivers of CHD Causal Module',
       'Genes prioritised from module m109'], dtype=object)

In [78]:
def renormalize(n, start_range, new_range=(0.5, 1)):
    """
    Linearly rescale a value from a given range to a new range, then clamp it.

    Applies f(n) to n using an old (start_range) and a new range, where
    f(n) = (dNewRange / dOldRange * (n - old_range_lower_bound)) + new_lower

    Parameters:
        n: the value to rescale.
        start_range: two-item sequence [lower, upper] describing the range n comes from.
            The bounds may be given in reversed order to invert the direction of the scale.
        new_range: two-item sequence [lower, upper] describing the target range.

    Returns:
        The rescaled value rounded to 4 decimals. Values falling outside of the
        new range are replaced by the nearest boundary of the new range.
        Special cases:
          * start_range has zero width -> the first element of new_range is used;
          * both ranges have zero width -> n itself is used (and then clamped).
    """
    old_lower, old_upper = start_range[0], start_range[1]
    new_lower, new_upper = new_range[0], new_range[1]

    old_span = old_upper - old_lower
    new_span = new_upper - new_lower

    if old_span == 0 and new_span == 0:
        # Both ranges are degenerate: there is nothing to scale.
        normalized = n
    elif old_span == 0:
        # A zero-width source range cannot be scaled. This is checked explicitly
        # rather than by catching ZeroDivisionError, because NumPy scalars do not
        # raise on division by zero (they silently produce nan/inf).
        normalized = new_lower
    else:
        normalized = (new_span * (n - old_lower) / old_span) + new_lower

    upper_bound = max(new_range)
    lower_bound = min(new_range)

    # The formula can yield values slightly outside of the new range: clamp them.
    if normalized > upper_bound:
        return upper_bound

    if normalized < lower_bound:
        return lower_bound

    return round(normalized, 4)


def generate_score(row):
    """
    Compute the normalised score of one merged evidence row.

    The row must provide 'score_type', 'score', 'min_score' and 'max_score'.
      * 'p-value': the score and its bounds are log10-transformed and the bounds
        are passed in reversed order (max first), so the smallest p-value ends
        up at the top of the output range;
      * 'rank': the score is rescaled between the min and max score;
      * anything else (including a missing score type): a fixed 0.75 is returned.

    Rescaling is delegated to renormalize() with its default target range.
    """
    kind = row['score_type']
    raw_value = row['score']
    lowest = row['min_score']
    highest = row['max_score']

    if kind == 'p-value':
        log_value = math.log10(raw_value)
        log_bounds = [math.log10(highest), math.log10(lowest)]
        return renormalize(log_value, log_bounds)

    if kind == 'rank':
        return renormalize(raw_value, [lowest, highest])

    # No usable score type: fall back to a constant score.
    return 0.75


In [67]:
def generate_score(row):
    """
    Turn the raw score of a merged evidence row into a value between 0.5 and 1.

    Reads 'score_type', 'score', 'min_score' and 'max_score' from the row:
      * 'p-value' -> log10 of the score, rescaled with the log10 bounds given in
        reversed order (max first, min second);
      * 'rank'    -> the score rescaled between min_score and max_score;
      * otherwise -> the constant 0.75.

    NOTE(review): the notebook defines generate_score twice; this definition
    spells out the [0.5, 1] target range explicitly.
    """
    target_range = [0.5, 1]

    score_type = row['score_type']
    value = row['score']
    lower = row['min_score']
    upper = row['max_score']

    if score_type == 'p-value':
        log_value = math.log10(value)
        source_range = [math.log10(upper), math.log10(lower)]
        result = renormalize(log_value, source_range, target_range)
    elif score_type == 'rank':
        source_range = [lower, upper]
        result = renormalize(value, source_range, target_range)
    else:
        # Unknown or missing score type: constant fallback.
        result = 0.75

    return result
                                                    



# Score every row (generate_score receives one row at a time) and keep the
# result in a new 'resourceScore' column; then preview the first five rows.
merged['resourceScore'] = merged.apply(generate_score, axis='columns')
merged.head(n=5)

Unnamed: 0,pmid,gene_set_name,target_id,disease_id,disease_name,score,method,score_type,min_score,max_score,_merge,resourceScore
0,28892060,Intestine Key Driver Genes,CD53,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.08337,"Genomic (GWAS, eQTL and cis-regulatory element...",rank,0.0834,0.981,both,0.5
1,28892060,Intestine Key Driver Genes,RHOH,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.116733,"Genomic (GWAS, eQTL and cis-regulatory element...",rank,0.0834,0.981,both,0.5186
2,28892060,Intestine Key Driver Genes,DOCK2,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.122212,"Genomic (GWAS, eQTL and cis-regulatory element...",rank,0.0834,0.981,both,0.5216
3,28892060,Intestine Key Driver Genes,FGR,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.132903,"Genomic (GWAS, eQTL and cis-regulatory element...",rank,0.0834,0.981,both,0.5276
4,28892060,Intestine Key Driver Genes,NCKAP1L,http://www.ebi.ac.uk/efo/EFO_0003767,Inflammatory bowel disease,0.130997,"Genomic (GWAS, eQTL and cis-regulatory element...",rank,0.0834,0.981,both,0.5265


In [77]:
out_file = 'cicaful.json.gz'

# Source column -> field name used in the exported records.
renamed_columns = {
    'target_id': 'targetFromSourceId',
    'disease_name': 'diseaseFromSource',
    'method': 'studyOverview',
}

# Helper and source columns that are not written to the output.
discarded_columns = [
    '_merge', 'max_score', 'min_score', 'score_type',
    'score', 'disease_id', 'pmid', 'gene_set_name',
]

# Build the export frame; `merged` itself is not modified.
evidence_records = merged.assign(
    # Last path segment of the disease URI ('.../efo/EFO_0003767' -> 'EFO_0003767').
    diseaseFromSourceMappedId=merged.disease_id.apply(lambda uri: uri.split('/')[-1]),
    datasourceId='sysbio',
    datatypeId='affected_pathway',
    # Each pmid wrapped in a single-element list.
    literature=merged.pmid.apply(lambda pmid: [pmid]),
    # Each gene set name wrapped as a one-item list of {'name': ...} objects.
    pathways=merged.gene_set_name.apply(lambda gene_set: [{'name': gene_set}]),
)
evidence_records = evidence_records.rename(columns=renamed_columns)
evidence_records = evidence_records.drop(columns=discarded_columns)

# One JSON object per line, gzip-compressed.
evidence_records.to_json(out_file, orient='records', lines=True, compression='gzip')

In [72]:
# Split every disease URI on '/' to inspect its components.
merged['disease_id'].str.split('/')

['http:', '', 'www.ebi.ac.uk', 'efo', 'EFO_0003767']

Unnamed: 0,gene_set_name,target_id,disease_name,method,resourceScore
0,Intestine Key Driver Genes,CD53,Inflammatory bowel disease,"Genomic (GWAS, eQTL and cis-regulatory element...",0.5000
1,Intestine Key Driver Genes,RHOH,Inflammatory bowel disease,"Genomic (GWAS, eQTL and cis-regulatory element...",0.5186
2,Intestine Key Driver Genes,DOCK2,Inflammatory bowel disease,"Genomic (GWAS, eQTL and cis-regulatory element...",0.5216
3,Intestine Key Driver Genes,FGR,Inflammatory bowel disease,"Genomic (GWAS, eQTL and cis-regulatory element...",0.5276
4,Intestine Key Driver Genes,NCKAP1L,Inflammatory bowel disease,"Genomic (GWAS, eQTL and cis-regulatory element...",0.5265
...,...,...,...,...,...
406,Genes prioritised from module m109,SKI,Cognitive decline,Data from 478 participants of Religious Order ...,0.7500
407,Genes prioritised from module m109,SMAD7,Cognitive decline,Data from 478 participants of Religious Order ...,0.7500
408,Genes prioritised from module m109,TCF7L1,Cognitive decline,Data from 478 participants of Religious Order ...,0.7500
409,Genes prioritised from module m109,TMEM184B,Cognitive decline,Data from 478 participants of Religious Order ...,0.7500
