# Filter OpenTargets L2G Query Results

The purpose of this notebook is to show how to clean up the OpenTargets query results to get a more refined, high-quality set of GWAS phenotypes to use for downstream program enrichment analysis.

This outputs `gwas_data/OpenTargets_L2G_Filtered.csv`, which we use to perform GWAS-related benchmarking on programs based on MuData input objects. This notebook does not need to be re-run unless changes to the GWAS filtering steps are desired.

In [1]:
# Basic Imports
import ast
import json

import numpy as np
import pandas as pd

In [2]:
# Load the raw (pre-QC) OpenTargets L2G query results from the gzipped CSV
# and preview the first rows.
l2g_csv_path = "./OpenTargets_L2G_noQC.csv.gz"
l2g = pd.read_csv(l2g_csv_path, compression='gzip')
l2g.head()

Unnamed: 0,study_id,chrom,pos,ref,alt,study_id_1,ancestry_initial,ancestry_replication,n_cases,n_initial,...,has_sumstats,num_assoc_loci,source,trait_reported,trait_efos,trait_category,gene_name,y_proba_full_model,pval,rn
0,GCST000392,10,88263276,T,C,GCST000392,{'list': array([{'element': 'European=16559'}]...,{'list': array([{'element': 'European=13279'}]...,7514,16559,...,False,31,GCST,Type 1 diabetes,"{'list': array([{'element': 'MONDO_0005147'}],...",pancreas disease,RNLS,0.679374,1.0000000000000001e-28,1
1,GCST000879,6,90263440,C,A,GCST000879,{'list': array([{'element': 'European=21389'}]...,{'list': array([{'element': 'European=30962'}]...,6333,21389,...,False,70,GCST,Crohn's disease,"{'list': array([{'element': 'EFO_0000384'}], d...",gastrointestinal disease,BACH2,0.753644,5e-09,1
2,GCST000879,17,34266646,A,G,GCST000879,{'list': array([{'element': 'European=21389'}]...,{'list': array([{'element': 'European=30962'}]...,6333,21389,...,False,70,GCST,Crohn's disease,"{'list': array([{'element': 'EFO_0000384'}], d...",gastrointestinal disease,CCL2,0.697196,2e-13,1
3,GCST000879,1,7819003,C,T,GCST000879,{'list': array([{'element': 'European=21389'}]...,{'list': array([{'element': 'European=30962'}]...,6333,21389,...,False,70,GCST,Crohn's disease,"{'list': array([{'element': 'EFO_0000384'}], d...",gastrointestinal disease,PER3,0.718292,7e-09,1
4,GCST000998,1,109279544,G,A,GCST000998,{'list': array([{'element': 'European=86995'}]...,{'list': array([{'element': 'European=56682'}]...,22233,86995,...,True,18,GCST,Coronary heart disease,"{'list': array([{'element': 'EFO_0001645'}], d...",cardiovascular disease,CELSR2,0.485513,2.89e-10,1


In [3]:
def process_json_format_l2g_columns(row, column_name):
    """
    Extracts a sorted, comma-separated list of IDs from a string representation of a list of dictionaries in a DataFrame row.

    The cell value is expected to look like
    ``{'list': array([{'element': 'EFO_0000384'}, ...], dtype=object)}``.
    Only the ``[{...}]`` portion is parsed, and it is parsed as a Python literal
    (``ast.literal_eval``), so arbitrary expressions in the input are never executed.

    Parameters:
        row (pandas.Series): A row of a pandas DataFrame (anything indexable by column name works).
        column_name (str): The name of the column containing the string representation.

    Returns:
        str: The 'element' values, sorted and joined with ', '.
             Returns None if the value is not a string (e.g. NaN), if no ``[{ ... }]`` list is present,
             or if an error occurs during extraction.
    """
    try:
        elements_str = row[column_name]

        # Missing values (NaN / None) carry no IDs.
        if not isinstance(elements_str, str):
            return None

        start_index = elements_str.find("[{")  # Start of the list of dictionaries
        if start_index == -1:
            return None

        # Look for the closing "}]" only after the opening "[{" and test for -1
        # *before* adding the offset, so a missing terminator is actually detected.
        closing_index = elements_str.find("}]", start_index)
        if closing_index == -1:
            return None

        elements_list_str = elements_str[start_index:closing_index + 2]

        # Remove newline characters (present in multi-line array reprs) and parse
        # the literal list of dictionaries without evaluating code.
        elements_list = ast.literal_eval(elements_list_str.replace('\n', ''))

        ids = sorted(elem['element'] for elem in elements_list)
        return ', '.join(ids)
    except Exception as e:
        # Best-effort parsing: report the offending row and fall back to None.
        print(f"Error: {e}, Row: {row}")
        return None

In [4]:
# Flatten trait_efos from its json-like string form into a plain, sorted,
# comma-separated string of EFO IDs (row-wise; the column name is forwarded via `args`).
l2g['trait_efos'] = l2g.apply(
    process_json_format_l2g_columns,
    axis=1,
    args=("trait_efos",),
)

In [5]:
# Remove GWAS with lots of EFO IDs: keep rows whose flattened EFO string contains
# at most two commas (i.e. at most three IDs). Missing EFO values count as zero commas.
# .copy() gives an independent frame so the column assignment below does not
# operate on a slice of `l2g` (avoids SettingWithCopyWarning).
filtered_l2g = l2g[l2g['trait_efos'].fillna('').apply(lambda x: x.count(',')) <= 2].copy()

# remove double quotes present in some trait_reported rows, but not others
filtered_l2g['trait_reported'] = filtered_l2g['trait_reported'].str.replace('"', '', regex=False)

# Remove unusual traits, such as
# - trait names that contain "conditional", " x " or "pleiotropy" (case-insensitive),
# - trait names that contain " or " / " and " (case-insensitive) to remove composite traits,
# - trait names that contain the case-sensitive substring "EA"
#   (NOTE(review): presumably an ancestry/cohort tag -- confirm the intent),
# - traits with 'Uncategorised' trait category
# - traits with 'phenotype' trait category
# na=False: a missing trait name is treated as "no match" instead of propagating NaN
# into the boolean filter.
trait_names = filtered_l2g['trait_reported']
has_unusual_name = trait_names.str.contains(
    ' or |conditional| and | x |pleiotropy', case=False, na=False
)
has_ea_substring = trait_names.str.contains('EA', case=True, na=False)
has_excluded_category = filtered_l2g['trait_category'].isin(['Uncategorised', 'phenotype'])
filtered_l2g = filtered_l2g[~(has_unusual_name | has_ea_substring | has_excluded_category)]

# Retain the GWAS with the largest sample size by number of cases by EFO group.
# idxmax returns the row label of the max n_cases per EFO group; every row belonging
# to the selected studies is then kept.
best_gwas = filtered_l2g.loc[filtered_l2g.groupby('trait_efos')['n_cases'].idxmax(), 'study_id']
filtered_l2g = filtered_l2g[filtered_l2g['study_id'].isin(best_gwas)]

# rename the y_proba_full_model to L2G (non-inplace: the frame above is a filtered slice)
filtered_l2g = filtered_l2g.rename(columns={'y_proba_full_model': 'L2G'})

# remove rows in the MHC region, as these are often mapped incorrectly
# NOTE(review): coordinates presumably refer to the extended MHC on GRCh38 -- confirm build.
mhc_start = 28510120
mhc_end = 33480577

# Filter positions outside the extended MHC region.
# `chrom` is compared as a string: read_csv may infer an integer dtype (e.g. when no
# X/Y rows are present), in which case `chrom == '6'` would match nothing and the
# MHC filter would silently be a no-op.
in_mhc = (
    (filtered_l2g['chrom'].astype(str) == '6')
    & (filtered_l2g['pos'] >= mhc_start)
    & (filtered_l2g['pos'] <= mhc_end)
)
filtered_l2g = filtered_l2g[~in_mhc]

# sort by trait category, GWAS study and P-value (ascending), then L2G score (descending)
filtered_l2g = filtered_l2g.sort_values(by=['trait_category', 'study_id', 'pval', 'L2G'],
                                        ascending=[True, True, True, False])

In [6]:
# Keep only the columns needed downstream, in a reader-friendly order.
desired_columns = [
    # trait annotation
    'trait_category', 'trait_efos', 'trait_reported',
    # gene assignment
    'gene_name', 'L2G',
    # variant and association
    'chrom', 'pos', 'ref', 'alt', 'pval',
    # study provenance
    'source', 'study_id', 'pmid',
    'pub_date', 'num_assoc_loci',
]
filtered_l2g = filtered_l2g.loc[:, desired_columns]

In [7]:
# Display the filtered table as the cell output to inspect the final result before saving.
filtered_l2g

Unnamed: 0,trait_category,trait_efos,trait_reported,gene_name,L2G,chrom,pos,ref,alt,pval,source,study_id,pmid,pub_date,num_assoc_loci
37034,biological process,GO_0036273,Statin medication,LDLR,0.533313,19,11087826,T,C,3.920000e-188,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
40060,biological process,GO_0036273,Statin medication,PCSK9,0.763768,1,55039974,G,T,7.940000e-166,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
40599,biological process,GO_0036273,Statin medication,BCAM,0.745231,19,44816374,G,A,8.610000e-148,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
40985,biological process,GO_0036273,Statin medication,USP24,0.547708,1,55293465,A,G,2.290000e-131,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
36445,biological process,GO_0036273,Statin medication,ANKRD17,0.880390,4,73167847,G,C,7.700000e-108,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34474,urinary system disease,MONDO_0024647,Urolithiasis,HCRTR2,0.714569,6,55168400,T,C,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33
35014,urinary system disease,MONDO_0024647,Urolithiasis,MYOCD,0.425137,17,12363785,C,T,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33
35361,urinary system disease,MONDO_0024647,Urolithiasis,MAP2K4,0.389578,17,12363785,C,T,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33
34612,urinary system disease,MONDO_0024647,Urolithiasis,ZNF18,0.289373,17,12363785,C,T,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33


In [8]:
# Save the filtered table as CSV, leaving out the DataFrame index.
filtered_csv_path = 'gwas_data/OpenTargets_L2G_Filtered.csv'
filtered_l2g.to_csv(filtered_csv_path, index=False)