In [115]:
import pandas as pd
import requests

def fetch_from_graphql(target_id: str, timeout: float = 60.0) -> dict:
    """Fetch the disease associations of a target from the Open Targets GraphQL API.

    Only the first page (index 0) of up to 10000 associations is requested,
    ordered by score.

    Args:
        target_id: Ensembl gene identifier; must start with ``ENSG``.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        The decoded JSON response as a dictionary (not a DataFrame). The query asks
        for the association rows under ``data -> target -> associatedDiseases -> rows``.

    Raises:
        AssertionError: If ``target_id`` does not start with ``ENSG``.
        requests.HTTPError: If the server answers with an HTTP error status.
        RuntimeError: If the GraphQL response contains an ``errors`` entry.
    """

    # Checking if the target is indeed an ensembl gene id:
    assert target_id.startswith('ENSG'), "Target Id must be an ensembl gene id"

    # Query URL:
    query_url = 'https://api.platform.opentargets.org/api/v4/graphql'

    # Query string:
    query = {
        "operationName": "TargetAssociationsQuery",
        "variables": {
            "ensemblId": target_id,
            "index": 0,
            "size": 10000,
            "sortBy": "score",
            "filter": "",
            "aggregationFilters": []
        },
        "query": """
            query TargetAssociationsQuery($ensemblId: String!, $index: Int!, $size: Int!, $filter: String, $sortBy: String!, $aggregationFilters: [AggregationFilter!]) {
                target(ensemblId: $ensemblId) {
                    id
                    approvedSymbol
                    approvedName
                    associatedDiseases(page: {index: $index, size: $size}, orderByScore: $sortBy, BFilter: $filter, aggregationFilters: $aggregationFilters) {
                        count
                        rows {
                            disease {
                                id
                                name
                                ancestors
                            }
                            score
                            datatypeScores {
                                componentId: id
                                score
                            }
                        }
                    }
                }
            }
        """
    }

    # Without a timeout the call could block forever on an unresponsive server:
    response = requests.post(query_url, json=query, timeout=timeout)

    # Fail early on HTTP-level problems instead of trying to parse an error page:
    response.raise_for_status()
    payload = response.json()

    # GraphQL reports query problems inside the body; surface them here rather than
    # letting callers hit an opaque KeyError/TypeError when they index the result:
    errors = payload.get('errors') if isinstance(payload, dict) else None
    if errors:
        messages = '; '.join(
            str(error.get('message', error)) if isinstance(error, dict) else str(error)
            for error in errors
        )
        raise RuntimeError(f'GraphQL query for {target_id} failed: {messages}')

    return payload


# Linearized version of the Open Targets disease ontology (one JSON record per line).
## NOTE (original author): it is important that this file accounts for only parent terms!
efos = 'https://platform.opentargets.org/data/ontology/efo_json/diseases_efo.jsonl'

# These substrings are looked for (case-insensitively) in the disease names:
relevant_disease_pattern = ['immuno', 'hemato', 'hemo']

# Fetching the disease ontology and keeping only the diseases of interest:
diseases = (
    pd.read_json(efos, lines=True)
    .drop(columns='parentIds')
    .rename(columns={'id': 'diseaseId', 'name': 'diseaseName'})

    # The substrings are joined into one regular expression. na=False treats records
    # without a name as "no match"; otherwise the NaN in the mask makes .loc fail:
    .loc[lambda df: df.diseaseName.str.lower().str.contains('|'.join(relevant_disease_pattern), na=False)]
)


print(f'Number of diseases associated with hematology/immununology: {len(diseases)}')


Number of diseases associated with hematology/immununology: 433


In [111]:
# Retrieve the raw association payload for a single target (Ensembl gene id):
data = fetch_from_graphql(target_id='ENSG00000122729')


In [148]:
# Flatten the GraphQL payload into a frame with one row per associated disease.
raw_associations = (
    # Naming the columns explicitly (in the order the query requests them) keeps them
    # present even when the target has no associated diseases, so the column
    # accessors below keep working on an empty result:
    pd.DataFrame(
        data['data']['target']['associatedDiseases']['rows'],
        columns=['disease', 'score', 'datatypeScores'],
    )
    .assign(
        # Adding target columns (same value on every row):
        targetSymbol=data['data']['target']['approvedSymbol'],
        targetName=data['data']['target']['approvedName'],
        targetId=data['data']['target']['id'],
        # Parsing disease id:
        diseaseId=lambda df: df.disease.apply(lambda x: x['id']),
        # Parsing disease name:
        diseaseName=lambda df: df.disease.apply(lambda x: x['name']),
        # Parsing ancestors; the disease's own id is appended so that a disease of
        # interest also matches itself. Working on the `disease` column directly
        # avoids a slow row-wise DataFrame.apply(axis=1):
        diseaseAncestors=lambda df: df.disease.apply(lambda x: x['ancestors'] + [x['id']]),
    )
)

# Keep only the associated diseases for which at least one entry of `diseaseAncestors`
# is found in our diseases-of-interest table. The result lists, per disease, the
# distinct names of the matching terms:
associated_diseases = (
    raw_associations
    .loc[:, ['diseaseId', 'diseaseAncestors']]
    # One row per (disease, ancestor) pair:
    .explode('diseaseAncestors')
    # Inner join (the default) drops every ancestor that is not a term of interest:
    .merge(
        diseases.rename(columns={'diseaseId': 'diseaseAncestors', 'diseaseName': 'diseaseAncestorName'}),
        on='diseaseAncestors',
    )
    .groupby('diseaseId')['diseaseAncestorName']
    .agg(lambda names: list(set(names)))
    .reset_index()
)

# Optional step: spread the datatype scores into one column per datatype.
datatypes = (
    # Only these two columns are needed to build the pivot table:
    raw_associations[['diseaseId', 'datatypeScores']]
    # One row per (disease, datatype score) pair:
    .explode('datatypeScores')
    .assign(
        datatype=lambda df: df['datatypeScores'].map(lambda entry: entry['componentId']),
        datatypeScore=lambda df: df['datatypeScores'].map(lambda entry: entry['score']),
    )
    .pivot(index='diseaseId', columns='datatype', values='datatypeScore')
    .reset_index()
    # A datatype that is absent for a disease gets a score of zero:
    .fillna(0)
)


filtered_associations = (
    raw_associations
    # The inner join keeps only the associations whose disease passed the filter:
    .merge(associated_diseases, how='inner', on='diseaseId')

    # Attaching the per-datatype score columns:
    .merge(datatypes, how='left', on='diseaseId')

    # Removing the helper columns that are not wanted in the final table:
    .drop(columns=['disease', 'datatypeScores', 'diseaseAncestorName'])
)


filtered_associations.head()


Unnamed: 0,score,targetSymbol,targetName,targetId,diseaseId,diseaseName,diseaseAncestors,animal_model,genetic_association,literature,rna_expression
0,0.697641,ACO1,aconitase 1,ENSG00000122729,EFO_0004509,hemoglobin measurement,"[EFO_0004747, EFO_0004503, EFO_0001444, EFO_00...",0.0,0.918053,0.0,0.0
1,0.696392,ACO1,aconitase 1,ENSG00000122729,EFO_0004348,hematocrit,"[EFO_0004503, EFO_0001444, EFO_0004348]",0.0,0.916409,0.0,0.0
2,0.685656,ACO1,aconitase 1,ENSG00000122729,EFO_0004305,erythrocyte count,"[EFO_0004306, EFO_0004503, EFO_0001444, EFO_00...",0.0,0.902282,0.0,0.0
3,0.345268,ACO1,aconitase 1,ENSG00000122729,EFO_0007978,red blood cell density measurement,"[EFO_0004503, EFO_0001444, EFO_0005047, EFO_00...",0.0,0.567939,0.0,0.0
4,0.305731,ACO1,aconitase 1,ENSG00000122729,EFO_0004528,mean corpuscular hemoglobin concentration,"[EFO_0004747, EFO_0004306, EFO_0004503, EFO_00...",0.0,0.502905,0.0,0.0


In [149]:
# Number of associations left after filtering:
filtered_associations.shape[0]

19