## Notebook to scan PD risk and QTL results for intersection

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame, read_parquet, concat
import numpy as np
from os import sched_getaffinity
import statsmodels.api as sm
import statsmodels.formula.api as smf
import concurrent.futures
from seaborn import lmplot, relplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from random import choice

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# Both values are interpolated into set_name, which prefixes every
# input and output file name used by this notebook.
# NOTE(review): the empty strings look like placeholders that are presumably
# overridden per run by a notebook parameterization tool — confirm
day = ''
modality = ''

In [None]:
# naming
cohort = 'foundin'
# prefix shared by the QTL input files and the output file
set_name = f'{cohort}_{day}_{modality}'
dx = 'PD'

# directories
wrk_dir = '/home/jupyter/foundin_qtl'
results_dir = f'{wrk_dir}/results'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
public_dir = f'{wrk_dir}/public'

# input files
qtl_results_file = f'{tensorqtl_dir}/{set_name}.cis.map.csv'
# with agreement in place use full summary stats
# gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'
gwas_sum_stats_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_23andme_buildGRCh38.tsv.gz'
index_variants_file = f'{public_dir}/nalls_pd_gwas/index_variants.list'

# output files
shared_out_file = f'{results_dir}/{set_name}_{dx}.prelim_shared.cis.csv'

# constant values
# threshold applied to the FDR, permutation and regression p-values below
alpha_value = 0.05
# number of CPUs the current process is allowed to run on
# NOTE(review): not referenced anywhere else in the visible notebook
cpu_count = len(sched_getaffinity(0))
# when True, the loading cells also display previews of what they loaded
DEBUG = False
# when True, run the GWAS~QTL regression check per chromosome and the
# plotting cells at the end; when False, only collect the QTL results for
# features that have a signal at an index variant
linear_check = False

#### analysis functions

In [None]:
# functions to run the linear regression
def reg_model(y, x):
    """Fit an ordinary least squares regression of y on x with an intercept.

    Returns the fitted statsmodels results object.
    """
    # add_constant supplies the intercept column that sm.OLS does not add itself
    design = sm.add_constant(x)
    return sm.OLS(y, design).fit()

def regress_scores_keep(this_df, verbose=False):
    """Regress the absolute GWAS score on the absolute QTL score.

    Fits a plain OLS model (formula API) of `gwas_score_abs ~ qtl_score_abs`
    on `this_df`, which must contain both columns.

    Args:
        this_df: data frame holding `gwas_score_abs` and `qtl_score_abs`.
        verbose: when True, print the full regression summary.

    Returns:
        A list `[coef, stderr, r2adj, term_cnt, p-value]` where coef, stderr
        and p-value belong to the `qtl_score_abs` term, r2adj is the adjusted
        R-squared and term_cnt is the number of fitted parameters.
    """
    term = 'qtl_score_abs'
    this_formula = 'gwas_score_abs ~ qtl_score_abs'
    result = smf.ols(this_formula, data=this_df).fit()
    if verbose:
        print(result.summary())
    return [result.params[term], result.bse[term], result.rsquared_adj,
            result.params.shape[0], result.pvalues[term]]

def create_merged_df_to_regress(qtl_df, pheno_id, gwas_df):
    """Join one feature's QTL results with the GWAS stats and add score columns.

    Keeps the rows of `qtl_df` whose `phenotype_id` equals `pheno_id`,
    inner-joins them to `gwas_df` on `variant_id`, and adds two columns:
    `gwas_score_abs` = |beta / standard_error| and
    `qtl_score_abs` = |slope / slope_se|.

    The regression is tested on these absolute scores rather than on
    p-values. No significance filter is applied; every merged variant is
    returned.
    """
    is_pheno = qtl_df['phenotype_id'] == pheno_id
    merged_df = qtl_df.loc[is_pheno].merge(gwas_df, how='inner', on='variant_id')

    gwas_score = merged_df['beta'] / merged_df['standard_error']
    qtl_score = merged_df['slope'] / merged_df['slope_se']
    merged_df['gwas_score_abs'] = gwas_score.abs()
    merged_df['qtl_score_abs'] = qtl_score.abs()
    return merged_df

def load_chrom_result(chrom, in_dir, name, variants: list):
    """Load one chromosome's cis-QTL pairs for features tied to given variants.

    Reads `{in_dir}/{name}.cis_qtl_pairs.chr{chrom}.parquet` and works in two
    passes: first find the features (`phenotype_id`) that have a result with
    `pval_nominal` below the module-level `max_pvalue` at one of `variants`;
    then keep every result row that belongs to those features.

    Note: `max_pvalue` is read from the notebook's global scope and must be
    defined before this function is called.

    Args:
        chrom: chromosome label used in the file name.
        in_dir: directory holding the per-chromosome parquet files.
        name: file name prefix.
        variants: variant IDs of interest.

    Returns:
        A tuple `(phenos_oi, df_to_return)`: the list of feature IDs found
        and a copy of all result rows for those features.
    """
    chrom_file = f'{in_dir}/{name}.cis_qtl_pairs.chr{chrom}.parquet'
    chrom_qtl_df = read_parquet(chrom_file)

    # first pass: features with a suggestive signal at a variant of interest
    oi_chrom_qtl_df = chrom_qtl_df.loc[chrom_qtl_df['pval_nominal'] < max_pvalue]
    oi_results = oi_chrom_qtl_df.loc[oi_chrom_qtl_df['variant_id'].isin(variants)]
    phenos_oi = list(oi_results['phenotype_id'].unique())

    # second pass: keep all results that belong to those features; the copy
    # lets callers add columns without touching the loaded frame
    df_to_return = chrom_qtl_df.loc[chrom_qtl_df['phenotype_id'].isin(phenos_oi)].copy()
    return phenos_oi, df_to_return

def load_chrom_index_results(chrom: str, in_dir: str, name: str, variants: list):
    """Return one chromosome's results of interest, tagged with the chromosome.

    Calls load_chrom_result, discards its feature list, and adds a
    `chromosome` column set to `chrom` on the returned data frame.
    """
    ret_df = load_chrom_result(chrom, in_dir, name, variants)[1]
    ret_df['chromosome'] = chrom
    return ret_df

def process_regression_check(chrom: str, in_dir: str, name: str,
                             gwas_df: DataFrame,
                             variants: list) -> DataFrame:
    """Test GWAS~QTL score sharing for each feature of interest on a chromosome.

    Loads the chromosome's results with load_chrom_result, then for every
    feature of interest merges its QTL results with `gwas_df` and regresses
    the absolute GWAS score on the absolute QTL score.

    Args:
        chrom: chromosome label used in the results file name.
        in_dir: directory holding the per-chromosome parquet files.
        name: file name prefix.
        gwas_df: GWAS summary statistics to merge on `variant_id`.
        variants: index variant IDs.

    Returns:
        The concatenation of, per feature, either all merged rows (positive
        coefficient and p-value below `alpha_value`) or only the merged rows
        for the index variants. Returns None when nothing was collected,
        as the original implementation did.
    """
    phenos_oi, results_to_test = load_chrom_result(chrom, in_dir, name, variants)
    print(f'chr {chrom} shape {results_to_test.shape}')

    # collect the per-feature frames and concat once at the end instead of
    # re-copying the accumulated frame on every iteration
    kept_frames = []
    # test the GWAS~QTL regression for possible significance
    for phenotype_id in phenos_oi:
        alpha_df = create_merged_df_to_regress(results_to_test, phenotype_id, gwas_df)
        # no variant overlap with the GWAS stats: nothing to fit or to keep
        if alpha_df.empty:
            continue

        # ['coef', 'stderr', 'r2adj', 'term_cnt', 'p-value']
        coef, _, _, _, p_value = regress_scores_keep(alpha_df)
        # must have positive coefficient and nominally significant p-value
        if coef > 0 and p_value < alpha_value:
            kept_frames.append(alpha_df)
        else:
            # pull just the merged signals for the index variants
            kept_frames.append(alpha_df.loc[alpha_df.variant_id.isin(variants)])

    return concat(kept_frames) if kept_frames else None

### load the input data

#### load the risk variants of interest

In [None]:
%%time
# load the index variant table; the distinct values of its `variant` column
# are the risk variants looked up in the QTL results
variants_oi_df = read_csv(index_variants_file)
print(variants_oi_df.shape)
index_variants = list(variants_oi_df.variant.unique())
if DEBUG:
    display(variants_oi_df.head())
    print(index_variants)
 

#### load the full gwas summary stats

In [None]:
%%time
# read the tab-separated GWAS summary statistics (file name ends in .tsv.gz)
gwas_stats_df = read_csv(gwas_sum_stats_file, sep='\t')
print(gwas_stats_df.shape)
if DEBUG:
    display(gwas_stats_df.sample(5))

#### subset index variant stats

In [None]:
# summary-stat rows for the index variants only; their `chromosome` values
# later decide which per-chromosome QTL files get scanned
index_stats_df = gwas_stats_df.loc[gwas_stats_df.variant_id.isin(index_variants)]
print(index_stats_df.shape)
if DEBUG:
    display(index_stats_df.head())

#### load the QTL results

In [None]:
# load the cis-QTL results table for this set and report how many distinct
# features (phenotype_id values) it covers
qtl_df = read_csv(qtl_results_file)
print(qtl_df.shape)
number_genes = qtl_df['phenotype_id'].nunique()
print(f'qtl for {number_genes} features')
if DEBUG:
    display(qtl_df.head())

### grab the max pval_nominal among the significant QTL (then override it with a hard-coded threshold)

In [None]:
# QTL results that pass the FDR threshold
sig_df = qtl_df.loc[qtl_df['bh_fdr'] < alpha_value]
feature_cnt = len(sig_df['phenotype_id'].unique())
print(f'sig shape is {sig_df.shape} for {feature_cnt} features')

# largest nominal p-value among the significant results; printed for
# reference only, because it is overwritten immediately below
max_pvalue = sig_df['pval_nominal'].max()
print(f'max p-value: {max_pvalue}')
# the value actually used by load_chrom_result is this hard-coded threshold
max_pvalue = 0.01
print(f'hardcoded max p-value: {max_pvalue}')

### see if any of the top significant results happen to be top QTL variants

In [None]:
def check_qtls_for_variants(qtl_df: DataFrame, variants: list) -> DataFrame:
    """Return the QTL rows whose variant is one of `variants`, with a report.

    Prints the shape of the matching rows, then the count and values of the
    distinct matching variants, then the count and values of the distinct
    features (`phenotype_id`) they belong to.
    """
    hits = qtl_df[qtl_df['variant_id'].isin(variants)]
    print(hits.shape)

    found_variants = hits['variant_id'].unique()
    this_cnt = len(found_variants)
    print(f'variants {this_cnt}')
    print(found_variants)

    oi_features = hits['phenotype_id'].unique()
    this_cnt = len(oi_features)
    print(f'features {this_cnt}')
    print(oi_features)
    return hits

In [None]:
# restrict to QTL results with a permutation p-value below alpha_value, then
# look for the index variants among them
risk_features = check_qtls_for_variants(qtl_df.loc[qtl_df['pval_perm'] < alpha_value],
                                        index_variants)
if len(risk_features) > 0:
    display(risk_features.head())

### check full results and see if risk appears to colocalize with QTL

In [None]:
import warnings
# warnings.filterwarnings(action='once')
# suppress every warning from this point on (the filter is process-wide)
warnings.filterwarnings('ignore')

In [None]:
%%time

# chromosomes that carry at least one risk (index) variant
risk_chroms = list(index_stats_df.chromosome.unique())

lm_results = []
# choose the per-chromosome worker once: the regression check when
# linear_check is set, otherwise just the index-variant result loading
if linear_check:
    worker = process_regression_check
    worker_args = (tensorqtl_dir, set_name, gwas_stats_df, index_variants)
else:
    worker = load_chrom_index_results
    worker_args = (tensorqtl_dir, set_name, index_variants)

# one task per chromosome; leaving the with-block waits for all of them
with concurrent.futures.ThreadPoolExecutor() as tpe:
    fs_list = [tpe.submit(worker, chrom, *worker_args) for chrom in risk_chroms]

In [None]:
# collect each chromosome's frame; rebinding lm_results (instead of appending
# to it) keeps this cell from duplicating results when it is re-run
lm_results = [future.result()
              for future in concurrent.futures.as_completed(fs_list)]

# flatten the list
results_to_keep = concat(lm_results)

In [None]:
print(results_to_keep.shape)
# sample() raises when asked for more rows than exist, so cap the preview
display(results_to_keep.sample(min(5, len(results_to_keep))))

### features that may colocalize with risk

In [None]:
# number of distinct features kept, followed by their IDs
print(results_to_keep['phenotype_id'].nunique())
print(results_to_keep['phenotype_id'].unique())

### save the potential risk and QTL colocalization results

In [None]:
# write the kept results as CSV without the data frame index
results_to_keep.to_csv(shared_out_file, index=False)

### take a look at a random potentially colocalized risk

In [None]:
if linear_check:
    # number of kept rows per feature
    temp_cnts = results_to_keep.phenotype_id.value_counts()
    # make sure it is not just an index-variant-only result
    temp_cnts = temp_cnts[temp_cnts > 100]
    # NOTE(review): choice() raises IndexError if no feature has > 100 rows
    phenotype_id = choice(temp_cnts.index)
    print(phenotype_id)

In [None]:
if linear_check:
    # rows of the randomly chosen feature; reused by the plots and fits below
    temp = results_to_keep.loc[results_to_keep['phenotype_id'] == phenotype_id]
    print(temp.shape)
    display(temp.head())

In [None]:
if linear_check:
    # matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and later removed the old names; use whichever this install provides
    bright_style = ('seaborn-v0_8-bright'
                    if 'seaborn-v0_8-bright' in plt.style.available
                    else 'seaborn-bright')
    # regression plot of the QTL score against the GWAS score
    with rc_context({'figure.figsize': (9, 9), 'figure.dpi': 100}):
        plt.style.use(bright_style)
        lmplot(x='gwas_score_abs', y='qtl_score_abs', data=temp)

In [None]:
if linear_check:
    # matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and later removed the old names; use whichever this install provides
    bright_style = ('seaborn-v0_8-bright'
                    if 'seaborn-v0_8-bright' in plt.style.available
                    else 'seaborn-bright')
    # scatter plot of the QTL score against the GWAS score
    with rc_context({'figure.figsize': (9, 9), 'figure.dpi': 100}):
        plt.style.use(bright_style)
        relplot(x='gwas_score_abs', y='qtl_score_abs',
                alpha=.5, palette="dark", data=temp)

In [None]:
if linear_check:
    # NOTE(review): this_formula is assigned but not used by this cell; the
    # fit below goes through reg_model (array API), not the formula API
    this_formula = 'gwas_score_abs ~ qtl_score_abs'
    # missing scores are replaced with 0 before fitting
    result = reg_model(temp['gwas_score_abs'].fillna(0), temp['qtl_score_abs'].fillna(0))
    print(result.summary())

In [None]:
if linear_check:
    # print the same five values that regress_scores_keep returns, taken
    # from the fit stored in `result`
    term = 'qtl_score_abs'
    print(['coef', 'stderr', 'r2adj', 'term_cnt', 'p-value'])
    print(result.params[term], result.bse[term], result.rsquared_adj,
          result.params.shape[0], result.pvalues[term])

In [None]:
if linear_check:
    # same regression through the formula API; the result is stored under its
    # own name so the reg_model() function defined earlier is not overwritten
    # (rebinding it would break re-running the cell that calls reg_model)
    this_formula = 'gwas_score_abs ~ qtl_score_abs'
    formula_result = smf.ols(this_formula, data=temp).fit()
    print(formula_result.summary())

In [None]:
!date