## Notebook to scan the meta-analysis eQTL results for PD risk and colocalization

In [1]:
!date

Mon Sep 23 17:14:19 EDT 2024


#### import libraries

In [2]:
from pandas import read_csv, DataFrame, Series, merge, concat, read_parquet
from numpy import around
import colocalization as clc
from multiprocessing import Process

#### set notebook variables

In [3]:
# naming
# template for a result set name, filled in with each entry of `modalities`
set_name_frmt = 'foundin_daNA_{modality}'
# disease/trait label, used in the names of the output files
dx = 'PD'

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
# location of the GWAS input files
public_dir = f'{wrk_dir}/public'
# location of the meta-analysis eQTL results that get loaded
meta_dir = f'{wrk_dir}/meta'
# location where the colocalization outputs are written
results_dir = f'{wrk_dir}/results'

# input files
# with agreement in place use full summary stats
# gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'
index_variants_file = f'{public_dir}/nalls_pd_gwas/index_variants.list'  

# variables
# when True the intermediate tables and lists are displayed/printed
DEBUG = False
# max QTL p-value at an index variant for a trait to be considered
max_nominal = 0.01
# max bh_fdr of a trait's top QTL result for the trait to be kept
alpha = 0.05
# result sets to process, one worker process is started for each
modalities = ['Bulk-meta', 'DAn-meta']
# min H4 value for the combined per-variant posteriors to be computed and saved
min_h4 = 0.5

#### functions

In [4]:
def load_meta_eqtl_results(in_dir: str, name: str) -> DataFrame:
    """ Load the full meta eQTL results for a result set, keeping only the
        traits whose top result passes the FDR threshold
    Args:
        in_dir (string) directory holding the meta eQTL result files
        name (string) result set name, used as the file name prefix
    Returns:
        (pandas.DataFrame) full QTL results for the traits with bh_fdr <= alpha
            in the tops file, with 'variant' and 'P-value' renamed to
            'variant_id' and 'p_value'
    """
    all_results = read_parquet(f'{in_dir}/{name}_metal_eqtl.parquet')
    print(f'shape of {name} results {all_results.shape}')
    # use the same variant and p-value column names as the gwas summary stats
    column_map = {'variant': 'variant_id', 'P-value': 'p_value'}
    all_results = all_results.rename(columns=column_map)
    # the tops file carries the bh_fdr values that decide which traits are kept
    top_hits = read_csv(f'{in_dir}/{name}_metal_eqtl_top.csv')
    detected_traits = top_hits.loc[top_hits.bh_fdr <= alpha].trait
    kept_results = all_results.loc[all_results.trait.isin(detected_traits)]
    print(f'full qtl for {kept_results.trait.nunique()} features for {name}')
    if DEBUG:
        display(kept_results.head())
    return kept_results

def find_intersecing_traits(df: DataFrame, variants: list, name: str) -> list:
    """ Find the traits with a nominally significant result at any of the
        variants of interest
    Args:
        df (pandas.DataFrame) QTL results with variant_id, p_value and trait columns
        variants (list) variant IDs of interest
        name (string) result set name, only used in the printed messages
    Returns:
        (list) unique traits having p_value <= max_nominal at a variant of interest
    """
    at_variant_oi = df.variant_id.isin(variants)
    is_nominal = df.p_value <= max_nominal
    hits = df.loc[at_variant_oi & is_nominal]
    print(f'{name} intersect shape {hits.shape}')
    found_traits = [*hits.trait.unique()]
    print(f'{name} number of traits {len(found_traits)}')
    if DEBUG:
        print(name, found_traits)
    return found_traits

def process_qtl(trait_df: DataFrame, other_stats: DataFrame) -> DataFrame:
    """ Prepare the QTL (trait2) results of one feature for colocalization.
        Computes the Wakefield Approximate Bayes Factor and posterior
        probability per variant, identifies the credible sets, and then keeps
        only the variants that are also present in the other (trait1) stats
    Args:
        trait_df (pandas.DataFrame) QTL results for a feature
        other_stats (pandas.DataFrame) trait1 (or risk) stats that will be
            colocalized with these QTL stats
    Returns:
        (pandas.DataFrame) QTL results with logABF, PP, and credible sets
            computed, limited to variants found in other_stats
    """
    def qtl_log_abf(row):
        # the QTL ABF uses the result's p-value, MAF, and meta-analysis Weight as n
        return clc.calc_abf(pval=row.p_value, maf=clc.freq_to_maf(row.maf),
                            n=int(row.Weight))

    # a variant can have more than one result for a feature, typically
    # multi-allelic variants like indels, only the first result is kept
    unique_df = trait_df.drop_duplicates(subset=['variant_id'], keep='first')
    unique_df["logABF"] = unique_df.apply(qtl_log_abf, axis=1)
    ranked_df = unique_df.sort_values("logABF", ascending=False)
    # posterior probability for each variant
    ranked_df['PP'] = clc.compute_pp(ranked_df.logABF)
    # flag the 95% and 99% credible sets from the posterior probabilities
    # NOTE(review): relies on clc.credible_sets updating the frame it is given - confirm
    clc.credible_sets(ranked_df)
    # only the variants also present in the GWAS stats are returned
    in_other = ranked_df.variant_id.isin(other_stats.variant_id)
    return ranked_df.loc[in_other]

def process_gwas(trait_stats: DataFrame, other_stats: DataFrame) -> DataFrame:
    """ Prepare the risk (trait1) results for colocalization.
        Subsets the risk variants to those present in the other (trait2/QTL)
        stats, then computes the Wakefield Approximate Bayes Factor and
        posterior probability per variant and identifies the credible sets
    Args:
        trait_stats (pandas.DataFrame) trait1 (or risk) summary stats
        other_stats (pandas.DataFrame) trait2 (or QTL) stats that will be
            colocalized with these risk stats
    Returns:
        (pandas.DataFrame) risk results with logABF, PP, and credible sets computed
    """
    def risk_log_abf(row):
        # the risk ABF also takes the proportion of cases
        return clc.calc_abf(pval=row.p_value,
                            maf=clc.freq_to_maf(row.effect_allele_frequency),
                            n=row.n_total,
                            prop_cases=row.case_prop)

    # work on a copy of just the risk variants the feature's QTL also tested
    in_other = trait_stats.variant_id.isin(other_stats.variant_id)
    shared_df = trait_stats.loc[in_other].copy()
    shared_df['logABF'] = shared_df.apply(risk_log_abf, axis=1)
    ranked_df = shared_df.sort_values('logABF', ascending=False)
    # posterior probability for each variant
    ranked_df['PP'] = clc.compute_pp(ranked_df.logABF)
    # flag the 95% and 99% credible sets from the posterior probabilities
    # NOTE(review): relies on clc.credible_sets updating the frame it is given - confirm
    clc.credible_sets(ranked_df)
    return ranked_df

def ensure_matched_indices(df1: DataFrame, df2: DataFrame) -> tuple:
    """ Align two datasets on their shared variants so that their rows are
        in the same order. The inputs are not modified, new frames indexed
        by variant_id are returned
    Args:
        df1 (pandas.DataFrame) risk or trait1 data, with a variant_id column
        df2 (pandas.DataFrame) qtl or trait2 data, with a variant_id column
    Returns:
        (pandas.DataFrame, pandas.DataFrame) df1 and df2 indexed by variant_id,
            restricted to the variants present in both, with one row per
            variant and identical row order
    """
    def index_by_variant(df: DataFrame) -> DataFrame:
        # a repeated variant_id would make the label lookup below return extra
        # rows and leave the two frames misaligned, so keep the first row only
        indexed = df.set_index('variant_id')
        return indexed.loc[~indexed.index.duplicated(keep='first')]

    df1 = index_by_variant(df1)
    df2 = index_by_variant(df2)
    # selecting both frames with the same labels gives them the same row order
    shared_indices = df1.index.intersection(df2.index)
    return df1.loc[shared_indices], df2.loc[shared_indices]

def colocalize(t1_abfs, t2_abfs, feature: str) -> Series:
    """ Run the colocalization between the ABFs of trait1 and trait2
    Args:
        t1_abfs (array_like) trait1's ABFs
        t2_abfs (array_like) trait2's ABFs
        feature (string) trait2's name or ID
    Returns:
        (pandas.Series) colocalization posterior probabilities rounded to 3
            decimals and named H0 to H4, plus the feature name under 'feature'
    """
    hypothesis_probs = around(clc.combine_abf(t1_abfs, t2_abfs), decimals=3)
    result = Series(data=hypothesis_probs, index=['H0', 'H1', 'H2', 'H3', 'H4'])
    # tag the result with the feature so the rows can be told apart later
    result['feature'] = feature
    return result

def compute_combined_pp(t1_df: DataFrame, t2_df: DataFrame) -> DataFrame:
    """ Compute the posterior probabilities and credible sets from the
        combined (summed) trait1 and trait2 logABFs
    Args:
        t1_df (pandas.DataFrame) risk or trait1's data
        t2_df (pandas.DataFrame) qtl or trait2's data
    Returns:
        (pandas.DataFrame) t1_df joined with t2_df on variant_id, columns
            present in both suffixed '_risk' and '_qtl', with the combined
            posterior probability in 'h4_pp' and the credible sets ID'd
    """
    merged = merge(t1_df, t2_df, how='inner', on='variant_id',
                   suffixes=('_risk', '_qtl'))
    # posterior probability per variant from the summed log ABFs
    combined_log_abf = merged.logABF_risk + merged.logABF_qtl
    merged['PP'] = clc.compute_pp(combined_log_abf)
    # flag the 95% and 99% credible sets from the posterior probabilities
    # NOTE(review): relies on clc.credible_sets updating the frame it is given - confirm
    clc.credible_sets(merged)
    return merged.rename(columns={'PP': 'h4_pp'})

def process_modality(name: str, in_dir: str, variants: list, 
                     out_dir: str, gwas_stats_df: DataFrame, dx: str):
    """ Run the colocalization of the risk stats against every QTL feature
        that intersects the variants of interest, and save the results
    Args:
        name (string) result set name, prefix of the input and output files
        in_dir (string) directory holding the meta eQTL result files
        variants (list) risk index variant IDs used to select the features
        out_dir (string) directory where the result files are written
        gwas_stats_df (pandas.DataFrame) risk (trait1) summary stats
        dx (string) disease/trait label used in the output file names
    Returns:
        None, writes '{out_dir}/{name}_{dx}.coloc.pp.csv' with the scores of
        every tested feature and, when at least one feature reaches
        H4 >= min_h4, '{out_dir}/{name}_{dx}.casuals.pp.parquet' with the
        combined per-variant posteriors of those features
    """
    qtl_full_df = load_meta_eqtl_results(in_dir, name)
    traits_oi = find_intersecing_traits(qtl_full_df, variants, name)
    coloc_scores = []
    # per-feature frames are collected and concatenated once after the loop,
    # concatenating inside the loop re-copies the accumulated rows each time
    h4_frames = []
    # process the data one feature (trait) at a time
    for feature in traits_oi:
        feature_df = qtl_full_df.loc[qtl_full_df.trait == feature].copy()
        feature_df = process_qtl(feature_df, gwas_stats_df)
        # prep the GWAS results
        risk_df = process_gwas(gwas_stats_df, feature_df)
        # ensure that the risk and feature variants ABF's are ordered the same
        risk_df, feature_df = ensure_matched_indices(risk_df, feature_df)
        # perform the colocalization
        cl_result = colocalize(risk_df.logABF, feature_df.logABF, feature)
        cl_result['h4_supported'] = clc.h4_supported(cl_result)
        # if H4 reaches the threshold compute the H4 PP and H4 credible sets
        if cl_result.H4 >= min_h4:
            h4_frames.append(compute_combined_pp(risk_df, feature_df))
        # add these scores to the rest
        coloc_scores.append(cl_result)
    # create a dataframe from the list of coloc scores
    coloc_scores_df = DataFrame(coloc_scores)
    ### save the result files for this result set
    coloc_scores_files = f'{out_dir}/{name}_{dx}.coloc.pp.csv'
    coloc_casuals_files = f'{out_dir}/{name}_{dx}.casuals.pp.parquet'
    coloc_scores_df.to_csv(coloc_scores_files, index=False)
    if h4_frames:
        concat(h4_frames).to_parquet(coloc_casuals_files)
        print(f'{name} found H4 support at {len(h4_frames)} traits')
    else:
        print(f'{name} no H4 supported so no coloc_h4_pps')

### load the input data

#### load the risk variants of interest

In [5]:
%%time
# read the list of risk index variants of interest
variants_oi_df = read_csv(index_variants_file)
print(variants_oi_df.shape)
# unique variant IDs, later used to select which QTL traits get colocalized
index_variants = list(variants_oi_df.variant.unique())
if DEBUG:
    display(variants_oi_df.head())
    print(index_variants)

(91, 1)
CPU times: user 5.38 ms, sys: 54 µs, total: 5.44 ms
Wall time: 4.9 ms


#### load the full gwas summary stats

In [6]:
%%time
# the GWAS summary stats file is tab-separated
gwas_stats_df = read_csv(gwas_sum_stats_file, sep='\t')
print(gwas_stats_df.shape)
if DEBUG:
    # show a random handful of rows rather than the full table
    display(gwas_stats_df.sample(5))

(7769022, 12)
CPU times: user 10.8 s, sys: 796 ms, total: 11.6 s
Wall time: 11.6 s


#### set case proportion for GWAS summary stats

In [7]:
# total sample size per variant, used as n when computing the risk ABFs
gwas_stats_df['n_total'] = gwas_stats_df.n_cases + gwas_stats_df.n_controls
    
# fraction of the samples that are cases, passed as prop_cases for the risk ABFs
gwas_stats_df['case_prop'] = gwas_stats_df.n_cases / gwas_stats_df.n_total

#### subset index variant stats

In [8]:
# GWAS summary stats rows for the index variants; the printed shape shows how
# many of the index variants are present in the summary stats
# NOTE(review): index_stats_df is not referenced again in the visible cells - confirm it is still needed
index_stats_df = gwas_stats_df.loc[gwas_stats_df.variant_id.isin(index_variants)]
print(index_stats_df.shape)
if DEBUG:
    display(index_stats_df.head())

(88, 13)


### load the meta eQTL results, run the colocalization, and save the result files

#### meta eQTL result

#### load and analyze meta eQTL results

In [9]:
%%time
# start one worker process per modality so the modalities are analyzed in
# parallel; each worker writes its own result files
jobs = {}
for modality in modalities:
    set_name = set_name_frmt.format(modality=modality)
    print(f'#### {modality, set_name} ####')
    p = Process(target=process_modality, 
                args=(set_name, meta_dir, index_variants, results_dir,
                      gwas_stats_df, dx))
    p.start()
    # keep the process keyed by modality to keep track of it
    jobs[modality] = p
# wait for all processes to finish
for key, p in jobs.items():
    p.join()
# a worker that ended on an uncaught exception has a non-zero exit code; report
# it here since otherwise the cell finishes the same way as a successful run
failed_modalities = [key for key, p in jobs.items() if p.exitcode != 0]
if failed_modalities:
    print(f'WARNING: processing failed for modalities {failed_modalities}')

#### ('Bulk-meta', 'foundin_daNA_Bulk-meta') ####
#### ('DAn-meta', 'foundin_daNA_DAn-meta') ####
shape of foundin_daNA_Bulk-meta results (17569095, 12)
shape of foundin_daNA_DAn-meta results (17598904, 12)
full qtl for 4467 features for foundin_daNA_DAn-meta
foundin_daNA_DAn-meta intersect shape (33, 12)
foundin_daNA_DAn-meta number of traits 32
full qtl for 5753 features for foundin_daNA_Bulk-meta
foundin_daNA_Bulk-meta intersect shape (43, 12)
foundin_daNA_Bulk-meta number of traits 42
foundin_daNA_DAn-meta found H4 support at 17 traits
foundin_daNA_Bulk-meta found H4 support at 24 traits
CPU times: user 22.5 ms, sys: 39.2 ms, total: 61.7 ms
Wall time: 4min 5s


In [10]:
!date

Mon Sep 23 17:18:38 EDT 2024
