# Filter OpenTargets L2G Query Results

The purpose of this file is to show how to cleanup the OpenTarget query results to get a more refined, high-quality set of GWAS phenotypes to use for downstream program enrichment analysis.

This outputs `OpenTargets_L2G_Filtered.csv`, which is what we use to perform GWAS-related benchmarking on programs based on MuData input objects.  This does not need to be re-run unless changes to the GWAS filtering steps are desired.

In [1]:
# Basic Imports
import pandas as pd
import numpy as np

In [2]:
# Read in the query results
l2g = pd.read_csv("/home/robertg1/gene_program_evaluation/smk/resources/OpenTargets_L2G_noQC.csv.gz", compression='gzip')
l2g.head()

  l2g = pd.read_csv("/home/robertg1/gene_program_evaluation/smk/resources/OpenTargets_L2G_noQC.csv.gz", compression='gzip')


Unnamed: 0,study_id,chrom,pos,ref,alt,study_id_1,ancestry_initial,ancestry_replication,n_cases,n_initial,...,has_sumstats,num_assoc_loci,source,trait_reported,trait_efos,trait_category,gene_name,y_proba_full_model,pval,rn
0,FINNGEN_R6_ASTHMA_HOSPITAL_MAIN,15,67150258,C,T,FINNGEN_R6_ASTHMA_HOSPITAL_MAIN,{'list': array([{'element': 'European=260405'}...,"{'list': array([], dtype=object)}",21351.0,260405,...,True,17,FINNGEN,"""Asthma, hospital admissions , main diagnosis ...","{'list': array([{'element': 'EFO_0000270'}], d...",Uncategorised,SMAD3,0.863717,4e-15,1
1,FINNGEN_R6_ATOPIC_STRICT,1,160515662,C,T,FINNGEN_R6_ATOPIC_STRICT,{'list': array([{'element': 'European=260078'}...,"{'list': array([], dtype=object)}",6412.0,260078,...,True,20,FINNGEN,"""Atopic dermatitis, strict definition""","{'list': array([{'element': 'EFO_0000274'}], d...",integumentary system disease,SLAMF1,0.204347,4.15e-08,1
2,FINNGEN_R6_AUTOIMMUNE,10,6049054,T,A,FINNGEN_R6_AUTOIMMUNE,{'list': array([{'element': 'European=260405'}...,"{'list': array([], dtype=object)}",52657.0,260405,...,True,68,FINNGEN,Autoimmune diseases,"{'list': array([{'element': 'EFO_0005140'}], d...",immune system disease,IL2RA,0.86339,2.75e-16,1
3,FINNGEN_R6_AUTOIMMUNE,9,124267351,A,G,FINNGEN_R6_AUTOIMMUNE,{'list': array([{'element': 'European=260405'}...,"{'list': array([], dtype=object)}",52657.0,260405,...,True,68,FINNGEN,Autoimmune diseases,"{'list': array([{'element': 'EFO_0005140'}], d...",immune system disease,NEK6,0.625163,1.84e-16,1
4,FINNGEN_R6_AUTOIMMUNE,6,137678425,GA,G,FINNGEN_R6_AUTOIMMUNE,{'list': array([{'element': 'European=260405'}...,"{'list': array([], dtype=object)}",52657.0,260405,...,True,68,FINNGEN,Autoimmune diseases,"{'list': array([{'element': 'EFO_0005140'}], d...",immune system disease,OLIG3,0.663875,4.05e-10,1


In [3]:
def process_json_format_l2g_columns(row, column_name):
    """
    Extracts a comma-separated list of IDs from a string representation of a list of dictionaries in a DataFrame row.

    Parameters:
        row (pandas.Series): A row of a pandas DataFrame.
        column_name (str): The name of the column containing the string representation.

    Returns:
        str: A comma-separated list of IDs extracted from the string representation.
             Returns None if the string representation is not properly formatted or if an error occurs during extraction.
    """
    try:
        elements_str = row[column_name]
        start_index = elements_str.find("[{")  # Find the starting index of the list
        end_index = elements_str.find("}]") + 2  # Find the ending index of the list and include the closing bracket
        
        if start_index != -1 and end_index != -1:  # Ensure both start and end indices are found
            elements_list_str = elements_str[start_index:end_index]
            
            # Remove newline characters and convert to a list of dictionaries
            elements_list = eval(elements_list_str.replace('\n', '').replace('array', 'list'))

            ids = [elem['element'] for elem in elements_list]
            ids.sort()
            return ', '.join(ids)
        else:
            return None  # Return None if start or end index not found
    except Exception as e:
        print(f"Error: {e}, Row: {row}")
        return None

In [4]:
# convert the trait_efos into just a flat list of EFO IDs rather than json formatted
l2g['trait_efos'] = l2g.apply(lambda row: process_json_format_l2g_columns(row, "trait_efos"), axis=1)

In [5]:
l2g['trait_efos']

0         EFO_0000270
1         EFO_0000274
2         EFO_0005140
3         EFO_0005140
4         EFO_0005140
             ...     
266799     HP_0000023
266800    EFO_1001460
266801           None
266802    EFO_0004799
266803    EFO_0002506
Name: trait_efos, Length: 266804, dtype: object

In [31]:

def process_json_format_l2g_columns(row, column_name):
    """
    Extracts a comma-separated list of IDs from a string representation of a list of dictionaries in a DataFrame row.

    Parameters:
        row (pandas.Series): A row of a pandas DataFrame.
        column_name (str): The name of the column containing the string representation.

    Returns:
        str: A comma-separated list of IDs extracted from the string representation.
             Returns None if the string representation is not properly formatted or if an error occurs during extraction.
    """
    try:
        elements_str = row[column_name]
        start_index = elements_str.find("[{")  # Find the starting index of the list
        end_index = elements_str.find("}]") + 2  # Find the ending index of the list and include the closing bracket
        
        if start_index != -1 and end_index != -1:  # Ensure both start and end indices are found
            elements_list_str = elements_str[start_index:end_index]
            
            # Remove newline characters and convert to a list of dictionaries
            elements_list = eval(elements_list_str.replace('\n', '').replace('array', 'list'))

            ids = [elem['element'] for elem in elements_list]
            ids.sort()
            return ', '.join(ids)
        else:
            return None  # Return None if start or end index not found
    except Exception as e:
        print(f"Error: {e}, Row: {row}")
        return None

l2g = pd.read_csv("/home/robertg1/gene_program_evaluation/smk/resources/OpenTargets_L2G_noQC.csv.gz", compression='gzip')

# convert the trait_efos into just a flat list of EFO IDs rather than json formatted
l2g['trait_efos'] = l2g.apply(lambda row: process_json_format_l2g_columns(row, "trait_efos"), axis=1)

# Remove GWAS with lots of EFO IDs
filtered_l2g = l2g[l2g['trait_efos'].fillna('').apply(lambda x: x.count(',')) <= 2]

#remove double quotes present in some trait_reported rows, but not others
filtered_l2g.loc[:, 'trait_reported'] = filtered_l2g['trait_reported'].str.replace('"', '')

# Remove unusual traits, such as
# - trait names that contain "conditional" or
# - trait names that contain " or " to remove composite traits,
# - traits with 'Uncategorized' trait category
# - traits with 'Phenotype' trait category
filtered_l2g = filtered_l2g.query(
    "not trait_reported.str.contains(' or |conditional| and | x |pleiotropy', case=False) "
    "and not trait_reported.str.contains('EA', case=True)"
    "and trait_category != 'Uncategorised'"
    "and trait_category != 'phenotype'"
)

# Retain the GWAS with the largest sample size by number of cases by EFO group
filtered_l2g['n_cases'] = filtered_l2g['n_cases'].fillna(filtered_l2g['n_initial'])
filtered_l2g = (filtered_l2g.assign(rank=filtered_l2g.groupby(["trait_efos"])["n_cases"].rank(method="min", ascending=False))
             .query("rank == 1")
             .drop(columns=["rank"])
             .reset_index(drop=True))

#rename the y_proba_full_model to L2G
filtered_l2g.rename(columns={'y_proba_full_model': 'L2G'}, inplace=True)

# Filter positions outside the extended MHC region
mhc_start = 28510120
mhc_end = 33480577
filtered_l2g = filtered_l2g[~((filtered_l2g['chrom'] == '6') & 
                                      (filtered_l2g['pos'] >= mhc_start) & 
                                      (filtered_l2g['pos'] <= mhc_end))]

#filter down the columns to make this df a bit easier to work with
desired_columns = ['trait_category', 'trait_efos', 'trait_reported',
                   'gene_name', 'L2G',
                   'chrom', 'pos', 'ref', 'alt', 'pval',
                   'source', 'study_id', 'pmid',
                   'pub_date', 'num_assoc_loci']
filtered_l2g = filtered_l2g[desired_columns]

filtered_l2g = filtered_l2g.sort_values(by=['trait_category', 'study_id', 'pval', 'L2G'],
                                        ascending=[True, True, True, False])

filtered_l2g.to_csv('gwas_data/OpenTargets_L2G_Filtered.csv', index=False)

Unnamed: 0,trait_category,trait_efos,trait_reported,gene_name,L2G,chrom,pos,ref,alt,pval,source,study_id,pmid,pub_date,num_assoc_loci
69242,biological process,GO_0036273,Statin medication,LDLR,0.533313,19,11087826,T,C,3.920000e-188,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
42101,biological process,GO_0036273,Statin medication,PCSK9,0.763768,1,55039974,G,T,7.940000e-166,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
34208,biological process,GO_0036273,Statin medication,BCAM,0.745231,19,44816374,G,A,8.610000e-148,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
26922,biological process,GO_0036273,Statin medication,USP24,0.547708,1,55293465,A,G,2.290000e-131,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
20583,biological process,GO_0036273,Statin medication,ANKRD17,0.880390,4,73167847,G,C,7.700000e-108,FINNGEN,FINNGEN_R6_RX_STATIN,,2022-01-24,77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77690,urinary system disease,MONDO_0024647,Urolithiasis,HCRTR2,0.714569,6,55168400,T,C,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33
62562,urinary system disease,MONDO_0024647,Urolithiasis,MYOCD,0.425137,17,12363785,C,T,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33
50611,urinary system disease,MONDO_0024647,Urolithiasis,MAP2K4,0.389578,17,12363785,C,T,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33
73820,urinary system disease,MONDO_0024647,Urolithiasis,ZNF18,0.289373,17,12363785,C,T,2.000000e-08,GCST,GCST90018935,PMID:34594039,2021-09-30,33


In [None]:
# Retain the GWAS with the largest sample size by number of cases by EFO group

best_gwas = filtered_l2g.loc[filtered_l2g.groupby('trait_efos')['n_cases'].idxmax()]["study_id"]
filtered_l2g = filtered_l2g[filtered_l2g['study_id'].isin(best_gwas)]

#rename the y_proba_full_model to L2G
filtered_l2g.rename(columns={'y_proba_full_model': 'L2G'}, inplace=True)

#remove rows in the MHC region, as these are often mapped incorrectly
mhc_start = 28510120
mhc_end = 33480577

# Filter positions outside the extended MHC region
filtered_l2g = filtered_l2g[~((filtered_l2g['chrom'] == '6') & 
                                      (filtered_l2g['pos'] >= mhc_start) & 
                                      (filtered_l2g['pos'] <= mhc_end))]

#sort by GWAS and P-value and L2G score
filtered_l2g = filtered_l2g.sort_values(by=['trait_category', 'study_id', 'pval', 'L2G'],
                                        ascending=[True, True, True, False]

In [None]:
#filter down the columns to make this df a bit easier to work with
desired_columns = ['trait_category', 'trait_efos', 'trait_reported',
                   'gene_name', 'L2G',
                   'chrom', 'pos', 'ref', 'alt', 'pval',
                   'source', 'study_id', 'pmid',
                   'pub_date', 'num_assoc_loci']
filtered_l2g = filtered_l2g[desired_columns]

In [32]:
import pandas as pd

def process_json_format_l2g_columns(row, column_name):
    """
    Extracts a comma-separated list of IDs from a string representation of a list of dictionaries in a DataFrame row.

    Parameters:
        row (pandas.Series): A row of a pandas DataFrame.
        column_name (str): The name of the column containing the string representation.

    Returns:
        str: A comma-separated list of IDs extracted from the string representation.
             Returns None if the string representation is not properly formatted or if an error occurs during extraction.
    """
    try:
        elements_str = row[column_name]
        start_index = elements_str.find("[{")  # Find the starting index of the list
        end_index = elements_str.find("}]") + 2  # Find the ending index of the list and include the closing bracket
        
        if start_index != -1 and end_index != -1:  # Ensure both start and end indices are found
            elements_list_str = elements_str[start_index:end_index]
            
            # Remove newline characters and convert to a list of dictionaries
            elements_list = eval(elements_list_str.replace('\n', '').replace('array', 'list'))

            ids = [elem['element'] for elem in elements_list]
            ids.sort()
            return ', '.join(ids)
        else:
            return None  # Return None if start or end index not found
    except Exception as e:
        print(f"Error: {e}, Row: {row}")
        return None

def filter_open_targets_gwas_query(input_file, output_file, min_l2g_score=None, remove_mhc_region=True):
    l2g = pd.read_csv(input_file, compression='gzip')
    
    # Convert the trait_efos into just a flat list of EFO IDs rather than json formatted
    l2g['trait_efos'] = l2g.apply(lambda row: process_json_format_l2g_columns(row, "trait_efos"), axis=1)

    # Remove GWAS with lots of EFO IDs
    filtered_l2g = l2g[l2g['trait_efos'].fillna('').apply(lambda x: x.count(',')) <= 2]

    # Remove double quotes present in some trait_reported rows, but not others
    filtered_l2g.loc[:, 'trait_reported'] = filtered_l2g['trait_reported'].str.replace('"', '')

    # Remove unusual traits
    filtered_l2g = filtered_l2g.query(
        "not trait_reported.str.contains(' or |conditional| and | x |pleiotropy', case=False) "
        "and not trait_reported.str.contains('EA', case=True)"
        "and trait_category != 'Uncategorised'"
        "and trait_category != 'phenotype'"
    )

    # Retain the GWAS with the largest sample size by number of cases by EFO group
    filtered_l2g['n_cases'] = filtered_l2g['n_cases'].fillna(filtered_l2g['n_initial'])
    filtered_l2g = (filtered_l2g.assign(rank=filtered_l2g.groupby(["trait_efos"])["n_cases"].rank(method="min", ascending=False))
                 .query("rank == 1")
                 .drop(columns=["rank"])
                 .reset_index(drop=True))

    # Rename the y_proba_full_model to L2G
    filtered_l2g.rename(columns={'y_proba_full_model': 'L2G'}, inplace=True)

    # Filter positions outside the extended MHC region
    if remove_mhc_region:
        mhc_start = 28510120
        mhc_end = 33480577
        filtered_l2g = filtered_l2g[~((filtered_l2g['chrom'] == '6') & 
                                              (filtered_l2g['pos'] >= mhc_start) & 
                                              (filtered_l2g['pos'] <= mhc_end))]

    # Filter rows based on min L2G score
    if min_l2g_score is not None:
        filtered_l2g = filtered_l2g[filtered_l2g['L2G'] >= min_l2g_score]

    # Filter down the columns to make this DataFrame a bit easier to work with
    desired_columns = ['trait_category', 'trait_efos', 'trait_reported',
                       'gene_name', 'L2G',
                       'chrom', 'pos', 'ref', 'alt', 'pval',
                       'source', 'study_id', 'pmid',
                       'pub_date', 'num_assoc_loci']
    filtered_l2g = filtered_l2g[desired_columns]

    # Sort the DataFrame
    filtered_l2g = filtered_l2g.sort_values(by=['trait_category', 'study_id', 'pval', 'L2G'],
                                            ascending=[True, True, True, False])

    # Save the filtered DataFrame to a CSV file
    filtered_l2g.to_csv(output_file, index=False)

# Example usage:
filter_open_targets_gwas_query(
    input_file="/home/robertg1/gene_program_evaluation/smk/resources/OpenTargets_L2G_noQC.csv.gz",
    output_file='gwas_data/OpenTargets_L2G_Filtered.csv',
    min_l2g_score=None,
    remove_mhc_region=True  
)


  l2g = pd.read_csv(input_file, compression='gzip')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_l2g['n_cases'] = filtered_l2g['n_cases'].fillna(filtered_l2g['n_initial'])


In [None]:
filtered_l2g.to_csv('gwas_data/OpenTargets_L2G_Filtered.csv', index=False)