## Notebook to run the colocalization analyses between cell-type differentiation based QTL and disease risk loci

### From the [coloc docs](https://chr1swallace.github.io/coloc/articles/a03_enumeration.html): under the single causal variant assumption, posterior probabilities are computed for each of the following hypotheses about how the two traits share association in the region
𝐻0: neither trait has a genetic association in the region<br>
𝐻1: only trait 1 has a genetic association in the region<br>
𝐻2: only trait 2 has a genetic association in the region<br>
𝐻3: both traits are associated, but with different causal variants<br>
𝐻4: both traits are associated and share a single causal variant<br>

In [None]:
# shell escape: print the current date and time (start of the notebook run)
!date

#### import libraries

In [None]:
import colocalization as clc
from pandas import read_csv, DataFrame, read_parquet, Series, merge, concat
from numpy import around

#### set notebook variables

In [None]:
# parameters
# NOTE(review): these are empty placeholders; presumably they are overridden
# per run by whatever tool executes this notebook — confirm how they are injected
day = ''  # used to build set_name for both file types
modality = ''  # used to build set_name only when file_type is 'tensorqtl'
num_qtl_samples = 0  # passed to process_qtl for tensorqtl results; 0 means use the per-variant num_samples column
file_type = '' # tensorqtl or metal+

In [None]:
# naming
cohort = 'foundin'
dx = 'PD'
# set_name is the prefix shared by the input and output file names below
if file_type == 'tensorqtl':
    set_name = f'{cohort}_{day}_{modality}'
elif file_type == 'metal+':
    set_name = f'{day}'
else:
    # fail here with a clear message rather than a NameError on set_name below
    raise ValueError(f"unsupported file_type '{file_type}', "
                     "expected 'tensorqtl' or 'metal+'")

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
results_dir = f'{wrk_dir}/results'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
meta_dir = f'{wrk_dir}/meta'
public_dir = f'{wrk_dir}/public'

# input files
shared_prelim_file = f'{results_dir}/{set_name}_{dx}.prelim_shared.cis.csv'
# if agreement in place use summary stats that include 23andMe data
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'
# gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'
index_variants_file = f'{public_dir}/nalls_pd_gwas/index_variants.list'

# output files
# per-feature colocalization hypothesis scores (written as csv)
coloc_scores_files = f'{results_dir}/{set_name}_{dx}.coloc.pp.csv'
# per-variant combined posterior probabilities for features passing min_h4 (written as parquet)
coloc_casuals_files = f'{results_dir}/{set_name}_{dx}.casuals.pp.parquet'

# constant values
DEBUG = False  # when True, extra preview tables are displayed
min_h4 = 0.5  # features with H4 above this get combined per-variant posteriors computed

#### utility functions

In [None]:
def load_qtl_results(chrom: str, in_dir: str, name: str, feature: str, 
                     verbose: bool=False) -> DataFrame:
    """ Read one chromosome's tensorQTL cis nominal pairs parquet file and
        return only the rows belonging to the requested feature
    Args:
        chrom (string) chromosome label, inserted after 'chr' in the file name
        in_dir (string) directory that holds the tensorQTL parquet files
        name (string) analysis set name used as the parquet file name prefix
        feature (string) feature (gene, peak, etc) whose results are wanted
        verbose (bool) print the shape and show the head of the result, default=False
    Returns:
        (pandas.DataFrame) the feature's rows, with phenotype_id renamed to
            trait and rows lacking a nominal p-value removed
    """
    pairs_path = f'{in_dir}/{name}.cis_qtl_pairs.chr{chrom}.parquet'
    pairs = read_parquet(pairs_path).rename(columns={'phenotype_id': 'trait'})
    # one mask: the row is for this feature and it has a usable nominal p-value
    wanted = (pairs.trait == feature) & pairs.pval_nominal.notna()
    feature_qtl_df = pairs.loc[wanted]
    if verbose:
        print(f'{feature} has {feature_qtl_df.shape} results')
        display(feature_qtl_df.head())
    return feature_qtl_df
    
def process_qtl(trait_df: DataFrame, other_stats: DataFrame, num_samples: int) -> DataFrame:
    """ Prepare a feature's QTL (trait2) results for colocalization: compute
        the per-variant log approximate Bayes factor, the posterior
        probabilities and the credible sets, then keep only the variants that
        are also present in the other (risk) dataset
    Args:
        trait_df (pandas.DataFrame) QTL results for one feature; the
            variant_id, pvalue and af columns are read, plus num_samples
            when num_samples is 0
        other_stats (pandas.DataFrame) risk (trait1) stats; only its
            variant_id column is used, to subset the returned variants
        num_samples (int) number of samples in the QTL analysis; 0 means the
            per-variant num_samples column is used instead
    Returns:
        (pandas.DataFrame) QTL results sorted by descending logABF with the
            logABF and PP columns added and clc.credible_sets applied
    """
    # a variant can appear more than once (typically multi-allelic variants
    # like indels); keep only its first row
    qtl = trait_df.drop_duplicates(subset=['variant_id'], keep='first').copy()

    def _row_log_abf(row):
        # a fixed sample size applies unless it was given as 0, in which case
        # the row carries its own count
        return clc.calc_abf(
            pval=row.pvalue,
            maf=clc.freq_to_maf(row.af),
            n=int(row.num_samples) if num_samples == 0 else num_samples)

    qtl['logABF'] = qtl.apply(_row_log_abf, axis=1)
    qtl = qtl.sort_values('logABF', ascending=False)
    # posterior probability of each variant given the ABFs
    qtl['PP'] = clc.compute_pp(qtl.logABF)
    # flag the 95% and 99% credible sets; the return value is not used here
    clc.credible_sets(qtl)
    # the subset to GWAS variants happens after PP and credible sets are computed
    present_in_gwas = qtl.variant_id.isin(other_stats.variant_id)
    return qtl.loc[present_in_gwas]

def process_gwas(trait_stats: DataFrame, other_stats: DataFrame) -> DataFrame:
    """ Prepare the risk (trait1) results for colocalization: subset to the
        variants present in the other (trait2/QTL) dataset, then compute the
        per-variant log approximate Bayes factor, the posterior probabilities
        and the credible sets
    Args:
        trait_stats (pandas.DataFrame) risk (trait1) summary stats; the
            variant_id, p_value, effect_allele_frequency, n_total and
            case_prop columns are read
        other_stats (pandas.DataFrame) QTL (trait2) stats; only its
            variant_id column is used, to subset the risk variants
    Returns:
        (pandas.DataFrame) risk results sorted by descending logABF with the
            logABF and PP columns added and clc.credible_sets applied
    """
    # keep only the risk variants the feature's QTL results also contain
    shared = trait_stats.variant_id.isin(other_stats.variant_id)
    risk = trait_stats.loc[shared].copy()

    def _row_log_abf(row):
        # case/control study, so the case proportion is passed along as well
        return clc.calc_abf(pval=row.p_value,
                            maf=clc.freq_to_maf(row.effect_allele_frequency),
                            n=row.n_total,
                            prop_cases=row.case_prop)

    risk['logABF'] = risk.apply(_row_log_abf, axis=1)
    risk = risk.sort_values('logABF', ascending=False)
    # posterior probability of each variant given the ABFs
    risk['PP'] = clc.compute_pp(risk.logABF)
    # flag the 95% and 99% credible sets; the return value is not used here
    clc.credible_sets(risk)
    return risk

def ensure_matched_indices(df1: DataFrame, df2: DataFrame) -> 'tuple[DataFrame, DataFrame]':
    """ Index both datasets by variant_id and restrict them to their shared
        variants, in the same order, so values line up position by position.
        The inputs are not modified; re-indexed frames are returned
    Args:
        df1 (pandas.DataFrame) risk or trait1 data, must have a variant_id column
        df2 (pandas.DataFrame) qtl or trait2 data, must have a variant_id column
    Returns:
        (tuple) df1 and df2, each indexed by variant_id and limited to the
            shared variants in identical order
    """
    # set_index returns new frames, so the caller's objects stay untouched
    df1 = df1.set_index('variant_id')
    df2 = df2.set_index('variant_id')
    print('reindexing')
    # selecting both frames with the same labels gives them the same row order
    shared_indices = df1.index.intersection(df2.index)
    df1 = df1.loc[shared_indices]
    df2 = df2.loc[shared_indices]
    # sanity check shown to the user: counts of positions where the two
    # indices agree (True) or disagree (False)
    indices_agree = df1.index.values == df2.index.values
    display(Series(indices_agree).value_counts())
    return df1, df2
        
def colocalize(t1_abfs, t2_abfs, feature: str) -> Series:
    """ Combine the two traits' ABFs into the colocalization hypothesis
        posterior probabilities and label the result with the feature
    Args:
        t1_abfs (array_like) trait1's ABFs
        t2_abfs (array_like) trait2's ABFs, expected in the same order as t1_abfs
        feature (string) trait2's name or ID
    Returns:
        (pandas.Series) entries H0 through H4 rounded to 3 decimals, plus a
            'feature' entry holding the feature name
    """
    hypothesis_labels = ['H' + str(number) for number in range(5)]
    rounded_probs = around(clc.combine_abf(t1_abfs, t2_abfs), decimals=3)
    cl_result = Series(data=rounded_probs, index=hypothesis_labels)
    # tag the scores so rows stay identifiable once collected into a table
    cl_result['feature'] = feature
    return cl_result

def compute_combined_pp(t1_df: DataFrame, t2_df: DataFrame) -> DataFrame:
    """ Join the risk and QTL data per variant and compute the posterior
        probabilities and credible sets from their summed logABFs
    Args:
        t1_df (pandas.DataFrame) risk or trait1's data, with a logABF column
        t2_df (pandas.DataFrame) qtl or trait2's data, with a logABF column
    Returns:
        (pandas.DataFrame) inner join of t1_df and t2_df on variant_id where
            overlapping columns get the _risk and _qtl suffixes, the combined
            posterior is stored as h4_pp and clc.credible_sets has been applied
    """
    joined = merge(t1_df, t2_df, how='inner', on='variant_id',
                   suffixes=('_risk', '_qtl'))
    # per-variant evidence for both traits is the sum of the two log ABFs
    joint_log_abf = joined.logABF_risk + joined.logABF_qtl
    joined['PP'] = clc.compute_pp(joint_log_abf)
    # credible sets are identified while the column is still named PP
    clc.credible_sets(joined)
    return joined.rename(columns={'PP': 'h4_pp'})

### load the input data

#### load the preliminary check of signal sharing

In [None]:
%%time
# tensorQTL results name the feature column phenotype_id, the rest of the
# notebook expects trait; for other file types nothing is renamed
prelim_column_renames = {'phenotype_id': 'trait'} if file_type == 'tensorqtl' else {}

# read the preliminary risk/QTL signal sharing results
shared_prelim = read_csv(shared_prelim_file)
print(f'shape of peliminary sharing results {shared_prelim.shape}')
shared_prelim = shared_prelim.rename(columns=prelim_column_renames)

# summarize which features will be tested for colocalization
prelim_traits = shared_prelim['trait']
print(f'number of unique features {prelim_traits.nunique()}')
print(f'features {prelim_traits.unique()}')

# optional preview of the table
if DEBUG:
    display(shared_prelim.head())

#### load the full gwas summary stats

In [None]:
%%time
# read the complete tab-separated GWAS summary statistics table
gwas_stats_df = read_csv(gwas_sum_stats_file, delimiter='\t')
print(gwas_stats_df.shape)

# optional peek at a handful of random rows
if DEBUG:
    display(gwas_stats_df.sample(n=5))

#### some summary stats may have multiple results per variant, so these need to be reduced or removed
these are typically the result of multi-allelic variants, such as indels

In [None]:
# keep only the first row seen for each variant_id
gwas_stats_df = gwas_stats_df.drop_duplicates('variant_id')
print(gwas_stats_df.shape)

# optional peek at a handful of random rows
if DEBUG:
    display(gwas_stats_df.sample(n=5))

#### set case proportion for GWAS summary stats

In [None]:
# total sample size per variant is cases plus controls
case_counts = gwas_stats_df['n_cases']
control_counts = gwas_stats_df['n_controls']
gwas_stats_df['n_total'] = case_counts + control_counts

# fraction of the total samples that are cases, per variant
gwas_stats_df['case_prop'] = case_counts / gwas_stats_df['n_total']

### for each of the features detected in the risk and QTL sharing check, run the colocalization

#### if using Metal+ meta-analysis results load full QTL now

since the Metal+ meta-analysis results are in a single parquet file, load them once here instead of repeating the load inside the loop

In [None]:
%%time
if file_type == 'metal+':
    # all features' meta-analysis results live in a single parquet file
    qtl_full_file = f'{meta_dir}/{set_name}_meta_eqtl.parquet'
    qtl_full_df = read_parquet(qtl_full_file)
    # allele freqs were annotated from AMP-PD onto the meta results, so drop
    # variants whose annotated frequency is below 1%
    # NOTE(review): only the low end is filtered (af close to 1 is kept) — confirm intended
    common_enough = qtl_full_df['af'].ge(0.01)
    qtl_full_df = qtl_full_df.loc[common_enough]
    print(qtl_full_df.shape)
    number_genes = qtl_full_df['trait'].nunique()
    print(f'full qtl for {number_genes} features')
    # optional preview of the table
    if DEBUG:
        display(qtl_full_df.head())

In [None]:
%%time
# run the colocalization for every feature flagged in the preliminary sharing
# check, collecting one row of hypothesis scores per feature and, for features
# whose H4 exceeds min_h4, the per-variant combined posteriors
coloc_scores = []
h4_pp_frames = []
for feature in shared_prelim.trait.unique():
    print(feature, end=', ')
    # prep the QTL results, qtl pulled differ by type
    if file_type == 'tensorqtl':
        # get the chromosome the feature is found on
        chrom = shared_prelim.loc[shared_prelim.trait == feature]['chromosome'].unique()[0]
        # load the feature's QTL results
        feature_df = load_qtl_results(str(chrom), tensorqtl_dir, set_name, feature)
        # make necessary keys consistent between file_types
        feature_df = feature_df.rename(columns={'pval_nominal': 'pvalue'})
        feature_df = process_qtl(feature_df, gwas_stats_df, num_qtl_samples)
    elif file_type == 'metal+':
        feature_df = qtl_full_df.loc[qtl_full_df.trait == feature].copy()
        # make necessary keys consistent between file_types
        feature_df = feature_df.rename(columns={'P-value': 'pvalue', 'Weight':'num_samples', 
                                                'variant': 'variant_id'})
        # 0 tells process_qtl to take the sample count from each row's num_samples
        feature_df = process_qtl(feature_df, gwas_stats_df, num_samples=0)
    else:
        # without this, feature_df would be unbound or left over from a prior run
        raise ValueError(f"unsupported file_type '{file_type}', "
                         "expected 'tensorqtl' or 'metal+'")
    # prep the GWAS results
    risk_df = process_gwas(gwas_stats_df, feature_df)
    # ensure that the risk and feature variants ABF's are ordered the same
    risk_df, feature_df = ensure_matched_indices(risk_df, feature_df)
    # perform the colocalization
    cl_result = colocalize(risk_df.logABF, feature_df.logABF, feature)
    cl_result['h4_supported'] = clc.h4_supported(cl_result)
    # the combined per-variant posteriors are gated on the min_h4 threshold,
    # not on the h4_supported flag computed above
    if cl_result.H4 > min_h4:
        h4_pp_frames.append(compute_combined_pp(risk_df, feature_df))
    # add these scores to the rest
    coloc_scores.append(cl_result)
# create a dataframe from the list of coloc scores
coloc_scores_df = DataFrame(coloc_scores)
# concatenate once rather than re-copying the growing frame on every
# iteration; stays None when no feature passed the threshold
coloc_h4_pps = concat(h4_pp_frames) if h4_pp_frames else None

### save the result files

In [None]:
# the per-feature hypothesis scores are always written
coloc_scores_df.to_csv(coloc_scores_files, index=False)
# the per-variant combined posteriors only exist when at least one feature
# exceeded the H4 threshold
if coloc_h4_pps is not None:
    coloc_h4_pps.to_parquet(coloc_casuals_files)
else:
    print('no H4 supported so no coloc_h4_pps')

### which features appear to have colocalization support

In [None]:
# largest table that is displayed in full
view_max_rows = 20
if coloc_scores_df.shape[0] <= view_max_rows:
    # few enough features that every result can be shown
    banner, to_show = 'showing all results', coloc_scores_df
else:
    # too many features, narrow down to the H4 supported results
    temp = coloc_scores_df.loc[coloc_scores_df.h4_supported == True]
    if temp.shape[0] <= view_max_rows:
        # few enough supported results that all of them can be shown
        banner, to_show = 'showing all H4 supported results', temp
    else:
        # still too many, show only the head of the results ranked by H4
        banner = 'showing top H4 supported results'
        to_show = temp.sort_values('H4', ascending=False).head()
print(banner)
display(to_show)

In [None]:
# per trait, count how many variants fall inside/outside the 95% and 99%
# credible sets; skipped when no feature exceeded the H4 threshold
if coloc_h4_pps is not None:
    display(coloc_h4_pps.groupby('trait').is95_credset.value_counts())
    display(coloc_h4_pps.groupby('trait').is99_credset.value_counts())

In [None]:
# shell escape: print the current date and time (end of the notebook run)
!date