## Notebook to scan the public Bryois et al. brain cell-type eQTL results

- [Bryois et al.](https://pubmed.ncbi.nlm.nih.gov/35915177/)
    - Bryois J, Calini D, Macnair W et al. Cell-type-specific cis-eQTLs in eight human brain cell types identify novel risk genes for psychiatric and neurological disorders. Nat Neurosci 2022;25:1104–12.
    - 192 individuals, snRNA, 8 CNS cell-types, multiple cohorts, mostly DLPFC, but also temporal cortex and deep white matter
    - downloaded files are full fastQTL results per cell-type per autosome
    - provided snp_pos file that includes allele info
    - fastQTL only outputs p-values and betas (coefficients)
    
Scan the Bryois results for any nominally significant eQTL at a PD risk index variant, then run colocalization for each feature that intersects.

In [1]:
!date

Tue Jan 17 18:14:47 UTC 2023


#### import libraries

In [2]:
from pandas import read_csv, DataFrame, Series, merge, concat
from threading import Thread
from numpy import around
import colocalization as clc
from threading import Thread

#### set notebook variables

In [3]:
# naming
# template for the per cell-type result set name; {cell_abbrv} is filled in
# with the cell-type abbreviation when the results are saved
set_name_frmt = 'foundin_daNA_Bryois-{cell_abbrv}'
# trait label that is appended to the result file names
dx = 'PD'

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
public_dir = f'{wrk_dir}/public'
results_dir = f'{wrk_dir}/results'

# input files
# with agreement in place use full summary stats (the 23andMe inclusive file);
# the commented out line below is the alternative file without 23andMe
# gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'
index_variants_file = f'{public_dir}/nalls_pd_gwas/index_variants.list'  

# variables
# when True the cells and functions print/display extra intermediate output
DEBUG = False
# largest nominal QTL p-value, at an index variant, for a feature to be tested
max_nominal = 0.01
# number of samples passed to the QTL ABF calculation
num_qtl_samples = 192
# cell-type name as used in the result file names -> abbreviation used in the
# output set name
cell_names_dict = {'Astrocytes': 'Astro', 'Endothelial.cells': 'Endo', 
                   'Excitatory.neurons': 'ExN', 'Inhibitory.neurons': 'InN', 
                   'Microglia': 'Micro', 'OPCs...COPs': 'OPC', 
                   'Oligodendrocytes': 'Oligo', 'Pericytes': 'Peri'}
# minimum H4 score for the combined per-variant posteriors to be computed and kept
min_h4 = 0.5

#### functions

In [4]:
def load_chrom_result(chrom, in_file, variants: list, verbose: bool=False):
    """ Load one file of QTL results and keep the features that have a nominal
        hit at any of the variants of interest.
        A feature is kept when at least one of its results is for a variant in
        variants and has a nominal p-value <= the notebook-level max_nominal;
        all of the results for the kept features are then returned
    Args:
        chrom chromosome the file belongs to, not used by the function itself,
            kept so existing callers do not need to change
        in_file path of the whitespace delimited results without a header line,
            the five columns are read as gene info, variant ID, TSS distance,
            nominal p-value, and slope
        variants (list) variant IDs of interest
        verbose (bool) if True print the frame shapes and show the kept results,
            uses the notebook's display()
    Returns:
        (list, pandas.DataFrame) the phenotype IDs of the kept features, and a
            copy of all the result rows that belong to those features
    """
    # raw string so that \s is a regex token and not an invalid string escape
    chrom_qtl_df = read_csv(in_file, sep=r'\s+', header=None)
    chrom_qtl_df.columns = ['gene_info', 'variant_id', 'tss_distance', 'pval_nominal', 'slope']
    if verbose:
        print(f'read {chrom_qtl_df.shape}')
    # the gene info is split at its first underscore and the leading part
    # (the gene name) is the phenotype ID, the remainder is not needed
    chrom_qtl_df['phenotype_id'] = chrom_qtl_df['gene_info'].str.split('_', n=1).str[0]
    if verbose:
        print(f'after splitting gene info {chrom_qtl_df.shape}')
    out_columns = ['phenotype_id', 'variant_id', 'tss_distance', 'pval_nominal', 'slope']
    chrom_qtl_df = chrom_qtl_df[out_columns]
    # find traits tested against variants of interest with sufficient p-value
    is_hit = (chrom_qtl_df['variant_id'].isin(variants) &
              (chrom_qtl_df['pval_nominal'] <= max_nominal))
    phenos_oi = list(chrom_qtl_df.loc[is_hit, 'phenotype_id'].unique())
    # second pass to keep every result that belongs to those phenos
    possible_results_oi = chrom_qtl_df.loc[chrom_qtl_df['phenotype_id'].isin(phenos_oi)].copy()
    if verbose:
        display(possible_results_oi.head())
        print(phenos_oi)
    return phenos_oi, possible_results_oi

def process_qtl(trait_df: DataFrame, other_stats: DataFrame, num_samples: int) -> DataFrame:
    """ Prep the QTL (or trait2) results for use in colocalization. 
        Where prep performs Wakefield Approx Bayes Factor, posterior probability,
        and credible sets calculations and identification.
        The ABFs and posterior probabilities are computed over all of the
        de-duplicated variants, the subset to the variants present in
        other_stats is only applied afterwards
    Args:
        trait_df (pandas.DataFrame) QTL results for a feature, needs the
            variant_id, pval_nominal, and allele_frequency columns, and a
            num_samples column when num_samples is 0; it is not modified
        other_stats (pandas.DataFrame) trait1 (or risk) stats to be used with these
            qtl (or trait2) stats for colocalization
        num_samples (int) number of samples used for the qtl analysis, if set to 0 then
            the number of samples is read from the results per variant
    Returns:
        (pandas.DataFrame) qtl results with ABF, PP, and credible sets computed
    """
    # some feature QTL stats may also have multiple results per variant
    # so need to reduce or remove these
    # these are typically a result of variants that are multi-allelic like indels
    # the explicit copy keeps the column assignments below from writing into,
    # or warning about, a slice of the caller's frame
    trait_df = trait_df.drop_duplicates(subset=['variant_id'], keep='first').copy()
    # calculate the ABF's for the feature's QTL results
    trait_df['logABF'] = trait_df.apply(
        lambda result: clc.calc_abf(pval=result.pval_nominal,
                                    maf=clc.freq_to_maf(result.allele_frequency),
                                    n=num_samples if num_samples != 0 else int(result.num_samples)),
        axis=1)
    trait_df = trait_df.sort_values('logABF', ascending=False)
    # calculate the posterior probability for each variant
    trait_df['PP'] = clc.compute_pp(trait_df.logABF)
    # identify the credible set(s), 95% and 99%, from the posterior probabilities
    clc.credible_sets(trait_df)
    # subset the feature QTL variants to just those present in the GWAS
    trait_df = trait_df.loc[trait_df.variant_id.isin(other_stats.variant_id)]
    return trait_df

def process_gwas(trait_stats: DataFrame, other_stats: DataFrame) -> DataFrame:
    """ Prep the risk (or trait1) results for use in colocalization.
        The risk stats are first reduced to the variants that are present in
        the other (trait2/qtl) stats, then the Approx Bayes Factors, posterior
        probabilities, and credible sets are computed on that subset
    Args:
        trait_stats (pandas.DataFrame) trait1 (or risk) stats, needs the
            variant_id, p_value, effect_allele_frequency, n_total, and
            case_prop columns; it is not modified
        other_stats (pandas.DataFrame) trait2 (or qtl) stats that the risk
            stats will be colocalized with, only its variant_id is used
    Returns:
        (pandas.DataFrame) risk results with ABF, PP, and credible sets computed
    """
    def _variant_log_abf(variant):
        # ABF for a single risk variant, as a row of the subset frame
        return clc.calc_abf(pval=variant.p_value,
                            maf=clc.freq_to_maf(variant.effect_allele_frequency),
                            n=variant.n_total,
                            prop_cases=variant.case_prop)

    # keep only the risk variants that were also tested for the feature
    in_other = trait_stats.variant_id.isin(other_stats.variant_id)
    risk_df = trait_stats.loc[in_other].copy()
    # ABF per variant, then order from the strongest to the weakest support
    risk_df['logABF'] = risk_df.apply(_variant_log_abf, axis=1)
    risk_df = risk_df.sort_values('logABF', ascending=False)
    # posterior probability for each variant
    risk_df['PP'] = clc.compute_pp(risk_df.logABF)
    # flag the credible set(s), 95% and 99%, from the posterior probabilities
    clc.credible_sets(risk_df)
    return risk_df

def ensure_matched_indices(df1: DataFrame, df2: DataFrame) -> "tuple[DataFrame, DataFrame]":
    """ Make sure the two datasets hold the same variants in the same order.
        Both df1 and df2 are modified in place to be indexed by variant_id
        (skipped when a frame is already indexed by variant_id); the aligned
        subsets are returned as new frames
    Args:
        df1 (pandas.DataFrame) risk or trait1 data
        df2 (pandas.DataFrame) qtl or trait2 data
    Returns:
        (pandas.DataFrame, pandas.DataFrame) df1 and df2 reduced to their
            shared variants, one row per variant, in identical order
    """
    deduped = []
    for frame in (df1, df2):
        # only index once, so the function can be called again on its own output
        if frame.index.name != 'variant_id':
            frame.set_index('variant_id', inplace=True)
        # a repeated variant would make .loc below return extra rows and
        # leave the two frames with different lengths, keep the first only
        deduped.append(frame.loc[~frame.index.duplicated(keep='first')])
    first, second = deduped
    # ensure that the risk and feature variants ABF's are ordered the same
    shared_indices = first.index.intersection(second.index)
    return first.loc[shared_indices], second.loc[shared_indices]

def colocalize(t1_abfs, t2_abfs, feature: str) -> Series:
    """ Colocalize trait1 with trait2 from their per-variant ABFs
    Args:
        t1_abfs (array_like) trait1's ABFs
        t2_abfs (array_like) trait2's ABFs, expected in the same variant
            order as t1_abfs
        feature (string) trait2's name or ID
    Returns:
        (pandas.Series) the combined probabilities rounded to 3 decimals and
            labelled H0 through H4, plus the feature name under 'feature'
    """
    hypothesis_labels = [f'H{x}' for x in range(5)]
    rounded_probs = around(clc.combine_abf(t1_abfs, t2_abfs), decimals=3)
    coloc_pp = Series(data=rounded_probs, index=hypothesis_labels)
    # tag the scores with the feature they belong to
    coloc_pp['feature'] = feature
    return coloc_pp

def compute_combined_pp(t1_df: DataFrame, t2_df: DataFrame) -> DataFrame:
    """ Join the risk and qtl results per variant and compute the posterior
        probabilities and credible sets from the summed ABFs
    Args:
        t1_df (pandas.DataFrame) risk or trait1's data, with a logABF column
        t2_df (pandas.DataFrame) qtl or trait2's data, with a logABF column
    Returns:
        (pandas.DataFrame) t1_df inner joined with t2_df on variant_id, columns
            present in both are suffixed _risk and _qtl, with the combined
            posterior probability in h4_pp and the credible sets ID'd
    """
    combined = t1_df.merge(t2_df, how='inner', on='variant_id',
                           suffixes=('_risk', '_qtl'))
    # posterior probability for each variant from the sum of the two log ABFs
    summed_log_abf = combined.logABF_risk + combined.logABF_qtl
    combined['PP'] = clc.compute_pp(summed_log_abf)
    # flag the credible set(s), 95% and 99%, while the column is still named PP
    clc.credible_sets(combined)
    return combined.rename(columns={'PP': 'h4_pp'})

def process_cell_type(cell_type: str, cell_abbrv: str):
    """ Colocalize the risk GWAS with one cell-type's QTL results and save the results.
        For every chromosome in risk_chroms the cell-type's results are loaded,
        the features with a nominal hit at an index variant are colocalized
        with the GWAS, and for the features with H4 >= min_h4 the combined
        per-variant posterior probabilities are kept as well.
        Uses the notebook-level risk_chroms, index_variants, gwas_stats_df,
        public_dir, results_dir, set_name_frmt, dx, num_qtl_samples, min_h4,
        and DEBUG
    Args:
        cell_type (string) cell-type name as used in the result file names
        cell_abbrv (string) cell-type abbreviation used in the output set name
    Returns:
        None, writes the colocalization scores as csv and, when any feature
            reached min_h4, the combined per-variant results as parquet
    """
    coloc_scores = []
    # per-feature frames, concatenated once at the end
    coloc_h4_pps = []
    # process data by chromosome
    for chrom in risk_chroms:
        bryois_file = f'{public_dir}/bryois_brain_eqtl/{cell_type}.{chrom}.gz'
        features_oi, results_to_test = load_chrom_result(chrom, bryois_file, index_variants)
        if DEBUG:
            print(f'chr{chrom}', end='.')            
            print(features_oi)
            print(results_to_test.shape)
        if not features_oi:
            continue
        # subset the GWAS to this chromosome once, instead of scanning the
        # full summary stats again for each feature
        chrom_gwas_df = gwas_stats_df.loc[gwas_stats_df.chromosome == chrom]
        # the allele freq's are not available in the fastQTL summary stats,
        # here assume same population(s) as risk and use GWAS allele freqs
        results_to_test = results_to_test.merge(
            chrom_gwas_df[['variant_id', 'effect_allele_frequency']],
            how='left', on='variant_id')
        # rename the allele freq column
        results_to_test = results_to_test.rename(
            columns={'effect_allele_frequency': 'allele_frequency'})
        # drop any with missing freq after merge
        results_to_test = results_to_test.loc[results_to_test.allele_frequency.notna()]
        for feature in features_oi:
            feature_df = results_to_test.loc[results_to_test.phenotype_id == feature]
            if feature_df.empty:
                # every variant of the feature was dropped for a missing
                # allele freq, so there is nothing to colocalize
                print(f'{cell_type} {feature} has no variants with GWAS allele freqs, skipped')
                continue
            feature_df = process_qtl(feature_df, chrom_gwas_df, num_qtl_samples)
            # prep the GWAS results
            risk_df = process_gwas(chrom_gwas_df, feature_df)
            # ensure that the risk and feature variants ABF's are ordered the same
            risk_df, feature_df = ensure_matched_indices(risk_df, feature_df)
            # perform the colocalization
            cl_result = colocalize(risk_df.logABF, feature_df.logABF, feature)
            # if H4 is supported then compute H4.PP and the H4 credible sets
            cl_result['h4_supported'] = clc.h4_supported(cl_result)
            if cl_result.H4 >= min_h4:
                coloc_h4_pps.append(compute_combined_pp(risk_df, feature_df))
            # add these scores to the rest
            coloc_scores.append(cl_result)
    # create a dataframe from the list of coloc scores    
    coloc_scores_df = DataFrame(coloc_scores)
    ### save the result files for this cell-type
    set_name = set_name_frmt.format(cell_abbrv=cell_abbrv)
    coloc_scores_file = f'{results_dir}/{set_name}_{dx}.coloc.pp.csv'
    coloc_causals_file = f'{results_dir}/{set_name}_{dx}.casuals.pp.parquet'
    coloc_scores_df.to_csv(coloc_scores_file, index=False)
    if coloc_h4_pps:
        concat(coloc_h4_pps).to_parquet(coloc_causals_file)
    else:
        print(f'{cell_type} no H4 supported so no coloc_h4_pps')    

### load the input data

#### load the risk variants of interest

In [5]:
%%time
# read the risk index variants, the file is expected to have a 'variant' column
variants_oi_df = read_csv(index_variants_file)
print(variants_oi_df.shape)
# de-duplicated variant IDs, in order of first appearance
index_variants = list(variants_oi_df['variant'].unique())
if DEBUG:
    display(variants_oi_df.head())
    print(index_variants)

(91, 1)
CPU times: user 3.02 ms, sys: 0 ns, total: 3.02 ms
Wall time: 2.62 ms


#### load the full gwas summary stats

In [6]:
%%time
# the full GWAS summary stats are tab delimited with a header line
gwas_stats_df = read_csv(filepath_or_buffer=gwas_sum_stats_file, sep='\t')
print(gwas_stats_df.shape)
if DEBUG:
    # show a few random rows as a sanity check
    display(gwas_stats_df.sample(5))

(7769022, 12)
CPU times: user 10.2 s, sys: 885 ms, total: 11.1 s
Wall time: 11.1 s


#### set case proportion for GWAS summary stats

In [7]:
# total sample size per variant is the cases plus the controls
gwas_stats_df['n_total'] = gwas_stats_df['n_cases'].add(gwas_stats_df['n_controls'])
# proportion of the samples that are cases, used for the risk ABF calculation
gwas_stats_df['case_prop'] = gwas_stats_df['n_cases'].div(gwas_stats_df['n_total'])

#### subset index variant stats

In [8]:
# GWAS summary stats rows for just the risk index variants
is_index_variant = gwas_stats_df['variant_id'].isin(index_variants)
index_stats_df = gwas_stats_df.loc[is_index_variant]
print(index_stats_df.shape)
if DEBUG:
    display(index_stats_df.head())

(88, 13)


### load data, format data, and save new input files

#### Bryois et al result

#### load and analyze the Bryois results

In [9]:
# only the chromosomes that carry a risk index variant need to be scanned
risk_chroms = list(index_stats_df['chromosome'].unique())
print(risk_chroms)

[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1, 20, 21, 2, 3, 4, 5, 6, 7, 8, 9]


In [10]:
%%time
# run each cell-type in its own thread, then wait for all of them to finish
job_threads = []
for cell_type, cell_abbrv in cell_names_dict.items():
    print(f'#### {cell_type} ####')
    worker = Thread(target=process_cell_type, args=(cell_type, cell_abbrv))
    job_threads.append(worker)
    worker.start()
for worker in job_threads:
    worker.join()

#### Astrocytes ####
#### Endothelial.cells ####
#### Excitatory.neurons ####
#### Inhibitory.neurons ####
#### Microglia ####
#### OPCs...COPs ####
#### Oligodendrocytes ####
#### Pericytes ####
CPU times: user 35min 13s, sys: 3min 10s, total: 38min 24s
Wall time: 30min 43s


In [11]:
!date

Tue Jan 17 18:45:43 UTC 2023
