## Notebook to scan PD risk and QTL results for colocalization

In [None]:
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as smm
import concurrent.futures
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# parameters
# cohort, day and cell-type labels; they are joined into the cohort_set
# prefix that the QTL result file names are built from
cohort = 'foundin'
day = 'da65'
cell_type = 'iDA'

In [None]:
# naming
# prefix shared by the QTL result files for this cohort/day/cell-type
cohort_set = f'{cohort}.{day}.{cell_type}'

# directories
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/sceqtl'
quants_dir = f'{wrk_dir}/quants'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'
gwas_dir = f'{wrk_dir}/public'

# input files
gencode_pkl = f'{quants_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
qtl_results_file = f'{tensorqtl_dir}/{cohort_set}.cis.map.csv'
meta5_st2_clean_file = f'{gwas_dir}/pd.table_s2.clean.txt'
meta5_stats_file = f'{gwas_dir}/pdmeta_sumstats_hg38.h5'

# output files

# constant values
# chromosome labels '1'..'22' as strings, used in per-chromosome file names
autosomes = [str(x) for x in range(1, 23)]
# nominal significance cutoff used by the filtering and regression checks
alpha_value = 0.05
# CPUs usable by this process; os.sched_getaffinity is not available on
# every platform, so fall back to the machine-wide count when it is missing
if hasattr(os, 'sched_getaffinity'):
    cpu_count = len(os.sched_getaffinity(0))
else:
    cpu_count = os.cpu_count() or 1

#### analysis functions

In [None]:
# functions to run the linear regression
def reg_model(y, x):
    """Fit an ordinary least squares model of ``y`` on ``x`` plus a constant.

    Args:
        y: response values handed to ``sm.OLS``.
        x: predictor values; a constant column is added before fitting.

    Returns:
        The fitted results object returned by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(x)
    return sm.OLS(y, design).fit()

def regress_scores_keep(this_df, verbose=False):
    """Regress the absolute GWAS score on the absolute QTL score with OLS.

    Args:
        this_df: frame holding the 'gwas_score_abs' and 'qtl_score_abs'
            columns referenced by the formula.
        verbose: when True, print the full regression summary.

    Returns:
        A list ``[coef, stderr, r2adj, term_cnt, p-value]`` where coef,
        stderr and p-value belong to the 'qtl_score_abs' term, r2adj is the
        adjusted R-squared of the fit and term_cnt is the number of fitted
        parameters (``result.params.shape[0]``).
    """
    term = 'qtl_score_abs'
    # plain ordinary least squares of one score on the other; no random
    # effects are modelled here
    this_formula = 'gwas_score_abs ~ qtl_score_abs'
    result = smf.ols(this_formula, data=this_df).fit()
    if verbose:
        print(result.summary())
    return [result.params[term], result.bse[term], result.rsquared_adj,
            result.params.shape[0], result.pvalues[term]]

def create_merged_df_to_regress(qtl_df, pheno_id, gwas_df, alpha=None):
    """Join one phenotype's QTL results to the GWAS stats and score them.

    Args:
        qtl_df: QTL results with 'phenotype_id', 'variant_id', 'slope',
            'slope_se' and 'pval_nominal' columns.
        pheno_id: the phenotype whose rows are taken from ``qtl_df``.
        gwas_df: GWAS summary statistics with 'SNP', 'b', 'se' and 'p'
            columns.
        alpha: nominal significance cutoff; when None the notebook-level
            ``alpha_value`` is used.

    Returns:
        The merged rows (inner join on variant id) with 'gwas_score_abs' and
        'qtl_score_abs' added, restricted to variants whose QTL or GWAS
        p-value is below the cutoff.
    """
    if alpha is None:
        alpha = alpha_value

    pheno_df = qtl_df.loc[qtl_df['phenotype_id'] == pheno_id]
    merged_df = pheno_df.merge(gwas_df, how='inner',
                               left_on='variant_id', right_on='SNP')
    # will test regression on absolute score (effect / standard error)
    # instead of say p-value
    merged_df['gwas_score_abs'] = np.abs(merged_df['b'] / merged_df['se'])
    merged_df['qtl_score_abs'] = np.abs(merged_df['slope'] / merged_df['slope_se'])

    # for regression will only consider variants that are nominally
    # significant in gwas or qtl
    nominal_in_qtl = merged_df['pval_nominal'] < alpha
    nominal_in_gwas = merged_df['p'] < alpha
    return merged_df.loc[nominal_in_qtl | nominal_in_gwas]

def load_chrom_result(chrom, in_dir, cohort_set, variants_df):
    """Load one chromosome's QTL pairs and keep phenotypes hit by a variant of interest.

    Args:
        chrom: chromosome label used in the parquet file name.
        in_dir: directory holding the per-chromosome parquet files.
        cohort_set: file-name prefix of the result set.
        variants_df: variants of interest; its 'SNP' column is matched
            against the QTL 'variant_id' column.

    Returns:
        A tuple ``(phenos_oi, df_to_return)``: the list of phenotype ids that
        have at least one variant of interest with ``pval_nominal`` below the
        notebook-level ``max_pvalue``, and a copy of every QTL row on this
        chromosome that belongs to those phenotypes.
    """
    chrom_file = f'{in_dir}/{cohort_set}.cis_qtl_pairs.chr{chrom}.parquet'
    chrom_qtl_df = pd.read_parquet(chrom_file)

    # first pass: find the phenotypes to capture, using the variants passed
    # in by the caller rather than a notebook global
    is_hit = ((chrom_qtl_df['pval_nominal'] < max_pvalue) &
              chrom_qtl_df['variant_id'].isin(variants_df['SNP']))
    phenos_oi = list(chrom_qtl_df.loc[is_hit, 'phenotype_id'].unique())

    # second pass: keep all results that belong to those phenotypes
    df_to_return = chrom_qtl_df.loc[chrom_qtl_df['phenotype_id'].isin(phenos_oi)].copy()
    return phenos_oi, df_to_return

def process_regression_check(chrom: str, in_dir: str, cohort: str,
                             gwas_df: pd.DataFrame,
                             variants_df: pd.DataFrame) -> 'pd.DataFrame | None':
    """Run the GWAS~QTL score regression for each candidate phenotype on a chromosome.

    Args:
        chrom: chromosome label to process.
        in_dir: directory holding the per-chromosome QTL parquet files.
        cohort: result-set prefix forwarded to ``load_chrom_result``.
        gwas_df: GWAS summary statistics to merge against.
        variants_df: variants of interest forwarded to ``load_chrom_result``.

    Returns:
        The merged, filtered rows of every phenotype whose regression has a
        positive coefficient and a p-value below ``alpha_value``, concatenated
        into one frame, or None when no phenotype qualifies.
    """
    phenos_oi, results_to_test = load_chrom_result(chrom, in_dir, cohort, variants_df)
    print(f'chr {chrom} shape {results_to_test.shape}')

    # collect the qualifying frames and combine them once at the end instead
    # of re-copying the accumulated frame on every iteration
    kept_frames = []
    # test the GWAS~QTL regression for possible significance
    for phenotype_id in phenos_oi:
        alpha_df = create_merged_df_to_regress(results_to_test, phenotype_id, gwas_df)
        # nothing to regress when no row survived the merge and filter
        if alpha_df.empty:
            continue

        # ret_vals is ['coef', 'stderr', 'r2adj', 'term_cnt', 'p-value']
        ret_vals = regress_scores_keep(alpha_df)
        # must have positive coefficient and nominally significant p-value
        if ret_vals[0] > 0 and ret_vals[4] < alpha_value:
            kept_frames.append(alpha_df)

    return pd.concat(kept_frames) if kept_frames else None

#### load the gencode annotations

In [None]:
%%time
# read the pickled gencode annotation table
gencode_df = pd.read_pickle(gencode_pkl)
# drop the ont and tag columns
is_ont_col = gencode_df.columns.str.startswith('ont:')
is_tag_col = gencode_df.columns.str.startswith('tag:')
gencode_df = gencode_df.drop(columns=gencode_df.columns[is_ont_col | is_tag_col])
# should only be autosomal but sometimes annotation quirks allow in others, so force
has_chr_prefix = gencode_df['seqname'].str.startswith('chr')
is_sex_or_mito = gencode_df['seqname'].isin(['chrX', 'chrY', 'chrM'])
gencode_df = gencode_df.loc[has_chr_prefix & ~is_sex_or_mito]
# misc_RNA genes have occasionally mapped to every chromosome, so drop them
is_misc_rna = gencode_df['gene_type'].isin(['misc_RNA'])
gencode_df = gencode_df.loc[~is_misc_rna]
print(gencode_df.shape)
display(gencode_df.head())

#### load the risk variants of interest

In [None]:
%%time
# tab-separated table of the risk variants of interest; its 'SNP' column is
# what QTL variant ids are matched against elsewhere in the notebook
variants_oi_df = pd.read_csv(meta5_st2_clean_file, sep='\t')
print(variants_oi_df.shape)
# trailing expression so the notebook renders the first rows
variants_oi_df.head()

#### load the full gwas summary stats

In [None]:
%%time
# full GWAS summary statistics read from the HDF5 file; the regression helper
# merges on its 'SNP' column and reads the 'b', 'se' and 'p' columns
gwas_stats_df = pd.read_hdf(meta5_stats_file)
print(gwas_stats_df.shape)
# show a random handful of rows as a sanity check
display(gwas_stats_df.sample(5))

#### load the QTL results

In [None]:
# top-level cis-QTL results, one CSV for the whole cohort set
qtl_df = pd.read_csv(qtl_results_file)
print(qtl_df.shape)
# count the distinct phenotypes (genes) that have a QTL result
tested_phenotypes = qtl_df['phenotype_id'].unique()
number_genes = len(tested_phenotypes)
print(f'qtl for {number_genes} genes')
qtl_df.head()

#### grab the max pval_nominal among the FDR-significant results, then set the fixed p-value cutoff

In [None]:
# QTL results whose Benjamini-Hochberg FDR column is below the alpha cutoff
sig_df = qtl_df.loc[qtl_df['bh_fdr'] < alpha_value]
feature_cnt = len(sig_df['phenotype_id'].unique())
print(f'sig shape is {sig_df.shape} for {feature_cnt} features')

# largest nominal p-value among the significant results (reported only)
max_pvalue = sig_df['pval_nominal'].max()
print(f'max p-value: {max_pvalue}')
# NOTE(review): the data-derived value above is overwritten here; the fixed
# 0.001 cutoff is what load_chrom_result actually reads
max_pvalue = 0.001
print(f'hardcoded max p-value: {max_pvalue}')

#### see if any of the top significant results happen to be independent risk variants

In [None]:
def check_qtls_for_variants(qtl_df, variants_df):
    """Report and return the QTL rows whose variant is a variant of interest.

    Args:
        qtl_df: QTL results with 'variant_id' and 'phenotype_id' columns.
        variants_df: variants of interest with a 'SNP' column.

    Returns:
        The rows of ``qtl_df`` whose 'variant_id' appears in
        ``variants_df['SNP']``. The shape, the distinct variants and the
        distinct phenotypes of those rows are printed along the way.
    """
    is_risk_variant = qtl_df['variant_id'].isin(variants_df['SNP'])
    hits_df = qtl_df.loc[is_risk_variant]
    print(hits_df.shape)

    # distinct variants among the hits
    hit_variants = hits_df['variant_id'].unique()
    print(f'variants {len(hit_variants)}')
    print(hit_variants)

    # distinct phenotypes among the hits
    hit_genes = hits_df['phenotype_id'].unique()
    print(f'genes {len(hit_genes)}')
    print(hit_genes)
    return hits_df

In [None]:
# top-level QTL rows that sit directly on a risk variant of interest
risk_genes = check_qtls_for_variants(qtl_df, variants_oi_df)
# only render a preview when at least one row matched
if risk_genes.shape[0] > 0:
    display(risk_genes.head())

#### now load rest of results and see if risk index variants are QTL

In [None]:
import warnings
# warnings.filterwarnings(action='once')
# NOTE(review): this silences every warning for the rest of the session,
# including ones raised by the regression fits below
warnings.filterwarnings('ignore')

In [None]:
%%time

# fan the per-chromosome regression checks out to worker processes; leaving
# the with-block waits for the submitted jobs to finish
lm_results = []
with concurrent.futures.ProcessPoolExecutor() as ppe:
    fs_list = [ppe.submit(process_regression_check, chrom,
                          tensorqtl_dir, cohort_set,
                          gwas_stats_df, variants_oi_df)
               for chrom in autosomes]

In [None]:
# gather each chromosome's result as it completes; the list is rebuilt from
# the futures so re-running this cell does not append duplicate frames
lm_results = [future.result()
              for future in concurrent.futures.as_completed(fs_list)]

# process_regression_check returns None for a chromosome where nothing
# passed, so drop those explicitly before combining into one frame
results_to_keep = pd.concat([item for item in lm_results if item is not None])

In [None]:
# size of the combined set of rows that passed the regression check
print(results_to_keep.shape)
# random preview; NOTE(review): sample(5) needs at least five rows
display(results_to_keep.sample(5))

In [None]:
# distinct phenotypes that passed the regression check: count, then the ids
kept_phenotypes = results_to_keep['phenotype_id'].unique()
print(len(kept_phenotypes))
print(kept_phenotypes)

In [None]:
# gene ids of the gencode entries whose gene_name matches a kept phenotype
is_kept_gene = gencode_df['gene_name'].isin(results_to_keep['phenotype_id'])
genes_kept = gencode_df.loc[is_kept_gene, 'gene_id'].unique()
print(len(genes_kept))
print(genes_kept)

In [None]:
import random
# pick one of the kept phenotypes at random to inspect in the following cells
phenotype_id = random.choice(results_to_keep['phenotype_id'].unique())
print(phenotype_id)

In [None]:
# gene ids annotated for the selected phenotype (matched on gene_name)
is_this_gene = gencode_df['gene_name'] == phenotype_id
gencode_df.loc[is_this_gene, 'gene_id'].unique()

In [None]:
# uncomment one of these to inspect a specific phenotype instead of the
# randomly selected one
# phenotype_id = 'ENSG00000143537.13'
# phenotype_id = 'ENSG00000164733.20'
# rows that passed the regression check for the selected phenotype
temp = results_to_keep.loc[results_to_keep['phenotype_id'] == phenotype_id]
print(temp.shape)
display(temp.head())

In [None]:
# scatter of the two absolute scores with a fitted regression line
sns.lmplot(x='gwas_score_abs', y='qtl_score_abs', data=temp)

In [None]:
# larger, semi-transparent scatter of the same two scores
# NOTE(review): palette is passed without a hue variable, so it presumably
# has no visible effect - confirm against the installed seaborn version
sns.relplot(x='gwas_score_abs', 
            y='qtl_score_abs', 
            alpha=.5, palette="dark", 
            height=12, data=temp) 

In [None]:
# refit through the array interface with missing scores replaced by 0;
# reg_model() takes the two columns directly, so no formula string is involved
result = reg_model(temp['gwas_score_abs'].fillna(0), temp['qtl_score_abs'].fillna(0))
print(result.summary())

In [None]:
# one-line digest of the fit for the qtl_score_abs term; the phenotype id is
# printed first so the values line up with the six header labels
term = 'qtl_score_abs'
print(['feature', 'coef', 'stderr', 'r2adj', 'term_cnt', 'p-value'])
print(phenotype_id, result.params[term], result.bse[term], result.rsquared_adj,
      result.params.shape[0], result.pvalues[term])

In [None]:
# same regression through the formula interface, without the NaN fill; the
# fit gets its own name so the reg_model() helper is not overwritten
this_formula = 'gwas_score_abs ~ qtl_score_abs'
ols_result = smf.ols(this_formula, data=temp).fit()
print(ols_result.summary())