In [1]:
import pandas as pd
import requests

def fetch_from_graphql(target_id: str) -> pd.DataFrame:
    """Fetching associations from OT graphql API based on ensembl gene Id"""

    # Checking if the target is indeed an ensembl gene id:
    assert target_id.startswith('ENSG'), "Target Id must be an ensembl gene id"

    # QueryURL:
    query_url = 'https://api.platform.opentargets.org/api/v4/graphql'

    # Qery string:
    query = {
        "operationName": "TargetAssociationsQuery",
        "variables": {
            "ensemblId": target_id,
            "index": 0,
            "size": 10000,
            "sortBy": "score",
            "filter": "",
            "aggregationFilters": []
        },
        "query": """
            query TargetAssociationsQuery($ensemblId: String!, $index: Int!, $size: Int!, $filter: String, $sortBy: String!, $aggregationFilters: [AggregationFilter!]) {
                target(ensemblId: $ensemblId) {
                    id
                    approvedSymbol
                    approvedName
                    associatedDiseases(page: {index: $index, size: $size}, orderByScore: $sortBy, BFilter: $filter, aggregationFilters: $aggregationFilters) {
                        count
                        rows {
                            disease {
                                id
                                name
                                ancestors
                            }
                            score
                            datatypeScores {
                                componentId: id
                                score
                            }
                        }
                    }
                }
            }
        """
    }

    data = requests.post(query_url, json=query).json()  # This part is not partcularly rubust.
    return data


# This is a linearized version of our ontology:
## It is important to note this file accounts for only parent terms!
efos = 'https://platform.opentargets.org/data/ontology/efo_json/diseases_efo.jsonl'

# These strings will be looked for in the parent names:
relevant_disease_pattern = ['immuno', 'hemato', 'hemo']

# Fetching disease ontology:
diseases = (
    pd.read_json(efos, lines=True)
    .drop('parentIds', axis=1)
    .rename(columns={'id': 'diseaseId', 'name': 'diseaseName'})
    
    # Filtering for diseases for interest:
    .loc[lambda df: df.diseaseName.str.lower().str.contains('|'.join(relevant_disease_pattern))]
)


print(f'Number of diseases associated with hematology/immununology: {len(diseases)}')


Number of diseases associated with hematology/immununology: 539


In [7]:
target = 'ENSG00000115977'

# Fetching associations for a given target:
data = fetch_from_graphql(target)


In [8]:
raw_associations = (
    pd.DataFrame(data['data']['target']['associatedDiseases']['rows'])
    .assign(
        # Adding target columns:
        targetSymbol=data['data']['target']['approvedSymbol'],
        targetName=data['data']['target']['approvedName'],
        targetId=data['data']['target']['id'],
        # Parsing disease id:
        diseaseId=lambda df: df.disease.apply(lambda x: x['id']),
        # Parsing disease name:
        diseaseName=lambda df: df.disease.apply(lambda x: x['name']),
        # Parsing ancestors:
        diseaseAncestors=lambda df: df.apply(lambda row: row['disease']['ancestors'] + [row['diseaseId']], axis=1)
    )
)
print(f'Number of associated diseases for {target}: {len(raw_associations)}')


# We select only those associated diseases, where at least any of the ancestors is in our disease of interest list:
associated_diseases = (
    raw_associations[['diseaseId', 'diseaseAncestors']]
    .explode('diseaseAncestors')
    .merge(diseases.rename(columns={'diseaseId': 'diseaseAncestors', 'diseaseName': 'diseaseAncestorName'}), on='diseaseAncestors')
    .groupby('diseaseId')
    .agg({
        'diseaseAncestorName': lambda diseasenames: list(set(diseasenames)),
    })
    .reset_index()
)

##
## Parsing data type scores expecting all datatypes 
##

# List of all data types:
data_types = [
    'affected_pathway',
    'literature',
    'rna_expression',
    'animal_model',
    'somatic_mutation',
    'known_drug',
    'genetic_association'
]

def getword(data_types: list, data_type: str) -> float:
    """Returning the data_type score for a given data_type"""
    for x in data_types:
        if x['componentId'] == data_type:
            return x['score']
        
    return None

# Expressions to generate column for each datatype with the value of the datatype score:
kwarg = {data_type: eval(f'lambda df: df.datatypeScores.apply(getword, args=("{data_type}", ))') for data_type in data_types}

# Applying expression:
datatypes = (
    raw_associations
    .assign(**kwarg)
    .fillna(0)
    .drop(['disease', 'datatypeScores', 'diseaseName', 'diseaseAncestors', 'score', 'targetName', 'targetId', 'targetSymbol'], axis=1)
)

filtered_associations = (
    raw_associations
    # Filtering associations by disease of interest:
    .merge(associated_diseases, on='diseaseId', how='inner')
    
    # Joining with datatype scores:
    .merge(datatypes, on='diseaseId', how='left')

    # Select and order columns:
    .drop(['disease', 'datatypeScores', 'diseaseAncestors', 'disease'], axis=1)
)


filtered_associations.head()


Number of associated diseases for ENSG00000115977: 80


Unnamed: 0,score,targetSymbol,targetName,targetId,diseaseId,diseaseName,diseaseAncestorName,affected_pathway,literature,rna_expression,animal_model,somatic_mutation,known_drug,genetic_association
0,0.392863,AAK1,AP2 associated kinase 1,ENSG00000115977,EFO_0004509,hemoglobin measurement,"[hematological measurement, hemoglobin measure...",0.0,0.0,0.0,0,0,0,0.64623
1,0.388156,AAK1,AP2 associated kinase 1,ENSG00000115977,EFO_0004348,hematocrit,"[hematological measurement, hematocrit]",0.0,0.0,0.0,0,0,0,0.638488
2,0.23856,AAK1,AP2 associated kinase 1,ENSG00000115977,EFO_0004305,erythrocyte count,[hematological measurement],0.0,0.0,0.0,0,0,0,0.392413
3,0.205843,AAK1,AP2 associated kinase 1,ENSG00000115977,EFO_0007978,red blood cell density measurement,[hematological measurement],0.0,0.0,0.0,0,0,0,0.338596
4,0.001478,AAK1,AP2 associated kinase 1,ENSG00000115977,EFO_0000403,diffuse large B-cell lymphoma,"[hematopoietic and lymphoid system neoplasm, t...",0.0,0.012159,0.0,0,0,0,0.0


In [9]:
# List of all data types:
data_types = [
    'affected_pathway',
    'literature',
    'rna_expression',
    'animal_model',
    'somatic_mutation',
    'known_drug',
    'genetic_association'
]


In [10]:
# Expressions to generate column for each datatype with the value of the datatype score:
kwarg = {data_type: eval(f'lambda df: df.datatypeScores.apply(getword, args=("{data_type}", ))') for data_type in data_types}


In [11]:

kwarg

{'affected_pathway': <function __main__.<lambda>(df)>,
 'literature': <function __main__.<lambda>(df)>,
 'rna_expression': <function __main__.<lambda>(df)>,
 'animal_model': <function __main__.<lambda>(df)>,
 'somatic_mutation': <function __main__.<lambda>(df)>,
 'known_drug': <function __main__.<lambda>(df)>,
 'genetic_association': <function __main__.<lambda>(df)>}

In [12]:
(
    pd.read_json('ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.06/output/etl/json/associationByDatatypeDirect/part-00000-81370132-3091-40ee-90c1-5d8e641953c3-c000.json', lines=True)
    .head()
)

Unnamed: 0,diseaseId,targetId,datatypeId,score,evidenceCount
0,DOID_0050890,ENSG00000006128,literature,0.018238,1
1,DOID_0050890,ENSG00000006210,literature,0.018238,1
2,DOID_0050890,ENSG00000007952,literature,0.028743,14
3,DOID_0050890,ENSG00000010256,literature,0.030397,1
4,DOID_0050890,ENSG00000010610,literature,0.044075,2


In [13]:
from functools import reduce

dataframes = [spark.read.csv(f).withcolumn('file', f.lit(f)).persist() for f in file_list]
concatenated = reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dataframes)

NameError: name 'file_list' is not defined

In [14]:
l = [10, 20, 6, 4323, 12, 32, 12, 342]

bp = [2, 4, 7]


[l[b[0]:b[1]] for b in zip([0] + bp, bp + [len(l)])]

[[10, 20], [6, 4323], [12, 32, 12], [342]]

In [24]:
import re

def make_trait_reported_string(s_raw):
    '''Takes the raw trait name and outputs transformed name'''

    # Replace any double spaces with single
    s_raw = re.sub(r' +', r' ', s_raw)

    # Assert no "|" in trait name
    assert "|" not in s_raw, f"Reported trait ({s_raw}), contains invalid character."

    # Split prefix
    parts = s_raw.split(': ', 1)

    # Move prefix to end if exists
    if len(parts) == 2:
        trait = " | ".join([parts[1], parts[0]])
    else:
        trait = s_raw

    # Capitalise the first letter
    trait = trait.capitalize()

    return trait


make_trait_reported_string('POCOK | kutya')

AssertionError: Reported trait (POCOK | kutya), contains invalid character.

In [26]:
 '{out_dir}/{data_type}/{exp_type}/{source}/{version}/data.parquet'.format(
    out_dir='out_dir',
    data_type='interval',
    exp_type='fantom5',
    source='andersson2014',
    version=12,
    cell_type='aggregate')

'out_dir/interval/fantom5/andersson2014/12/data.parquet'

In [27]:
spark.version

NameError: name 'spark' is not defined

In [28]:
(
    pd.read_csv('https://www.ebi.ac.uk/gwas/api/search/downloads/studies_alternative', sep='\t', row)
    .head()
)

Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,PLATFORM [SNPS PASSING QC],ASSOCIATION COUNT,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY
0,2017-07-10,28416818,Christophersen IE,2017-04-17,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/28416818,Large-scale analyses of common and rare varian...,Atrial fibrillation,"15,979 European ancestry cases, 102,776 Europe...","8,180 Japanese ancestry cases, 28,612 Japanese...","Affymetrix, Illumina [11795432] (imputed)",10,atrial fibrillation,http://www.ebi.ac.uk/efo/EFO_0000275,GCST004295,Genome-wide genotyping array
1,2017-07-10,28416818,Christophersen IE,2017-04-17,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/28416818,Large-scale analyses of common and rare varian...,Atrial fibrillation,"15,979 European ancestry cases, 102,776 Europe...","3,666 cases, 139,852 controls","Affymetrix, Illumina [11795432] (imputed)",8,atrial fibrillation,http://www.ebi.ac.uk/efo/EFO_0000275,GCST004296,Genome-wide genotyping array
2,2017-07-10,28416818,Christophersen IE,2017-04-17,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/28416818,Large-scale analyses of common and rare varian...,Atrial fibrillation,"15,979 European ancestry cases, 102,776 Europe...",,"Affymetrix, Illumina [11795432] (imputed)",20,atrial fibrillation,http://www.ebi.ac.uk/efo/EFO_0000275,GCST004297,Genome-wide genotyping array
3,2017-07-10,28416818,Christophersen IE,2017-04-17,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/28416818,Large-scale analyses of common and rare varian...,Atrial fibrillation,"837 Japanese ancestry cases, 3,293 Japanese co...","8,180 Japanese ancestry cases, 28,612 Japanese...","Affymetrix, Illumina [11795432] (imputed)",1,atrial fibrillation,http://www.ebi.ac.uk/efo/EFO_0000275,GCST004298,Genome-wide genotyping array
4,2017-07-10,28416818,Christophersen IE,2017-04-17,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/28416818,Large-scale analyses of common and rare varian...,Atrial fibrillation,"641 African American cases, 5,234 African Amer...","447 African American cases, 442 African Americ...","Affymetrix, Illumina [11795432] (imputed)",1,atrial fibrillation,http://www.ebi.ac.uk/efo/EFO_0000275,GCST004299,Genome-wide genotyping array
