## Notebook to scan PD risk and ieQTL results for colocalization

In [None]:
# IPython shell escape: print the current date/time when the notebook is run
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import concurrent.futures
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# parameters
# cohort label; used to build the working directory and the input file names
cohort = 'foundin'
# interaction term whose ieQTL results are scanned; part of the input file names
interaction_term = 'ProliferatingFloorPlateProgenitors'

In [None]:
# naming

# directories (everything hangs off the per-cohort eqtl working directory)
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/{cohort}/eqtl'
expr_dir = f'{wrk_dir}/expression'
genos_dir = f'{wrk_dir}/genotypes'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'
gwas_dir = f'{wrk_dir}/public'

# input files
# pickled gene annotation table
gencode_pkl = f'{expr_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
# ieQTL results for this cohort and interaction term
eqtl_results_file = f'{results_dir}/{cohort}.{interaction_term}.cis.ieqtl.csv'
# table of risk variants of interest (tab separated)
meta5_st2_clean_file = f'{gwas_dir}/pd.table_s2.clean.txt'
# full GWAS summary statistics (HDF5)
meta5_stats_file = f'{gwas_dir}/pdmeta_sumstats_hg38.h5'

# output files
# (none are defined in this cell)

# constant values
# chromosome labels '1' .. '22' as strings
autosomes = [str(x) for x in list(range(1,23))]
# nominal significance threshold used for filtering and for the regression check
alpha_value = 0.05
# IPython shell capture: run `nproc` and keep its output lines
capture_out = !(nproc)
# first output line parsed as an integer thread count
max_threads = int(capture_out[0])


#### analysis functions

In [None]:
def mixed_model(formula, df, group_name):
    """Fit a mixed linear model and return the fitted result.

    formula: model formula string evaluated against `df`.
    df: data frame holding the variables named in the formula.
    group_name: column of `df` whose values define the groups.
    """
    group_labels = df[group_name]
    return sm.MixedLM.from_formula(formula, df, groups=group_labels).fit()

def regress_tscores_keep(this_df):
    """Regress absolute GWAS t-scores on absolute eQTL t-scores.

    Fits a mixed effects model grouped by the 'day' column of `this_df`
    and reports the statistics for the 'eqtl_tscore_abs' term.

    Returns a list: [coefficient, standard error, number of fitted
    parameters, p-value].
    """
    term = 'eqtl_tscore_abs'
    fitted = mixed_model('gwas_tscore_abs ~ eqtl_tscore_abs', this_df, 'day')
    coefficient = fitted.params[term]
    std_error = fitted.bse[term]
    param_count = fitted.params.shape[0]
    p_value = fitted.pvalues[term]
    return [coefficient, std_error, param_count, p_value]

def create_merged_df_to_regress(eqtl_df, pheno_id, gwas_df):
    """Join one phenotype's eQTL rows with the GWAS statistics.

    Rows of `eqtl_df` for `pheno_id` are inner-joined to `gwas_df` on
    variant_id == SNP. Absolute test statistics (effect / standard error)
    are added for both studies, and only variants that are nominally
    significant (below the notebook-level `alpha_value`) in either the
    eQTL or the GWAS are returned.
    """
    is_pheno = eqtl_df['phenotype_id'] == pheno_id
    merged_df = eqtl_df.loc[is_pheno].merge(gwas_df, how='inner',
                                            left_on='variant_id',
                                            right_on='SNP')
    merged_df['day'] = merged_df['day'].astype(object)
    # regress on the absolute test statistic rather than on, say, the p-value
    merged_df['gwas_tscore_abs'] = (merged_df['b'] / merged_df['se']).abs()
    merged_df['eqtl_tscore_abs'] = (merged_df['b_gi'] / merged_df['b_gi_se']).abs()

    eqtl_nominal = merged_df['pval_gi'] < alpha_value
    gwas_nominal = merged_df['p'] < alpha_value
    return merged_df.loc[eqtl_nominal | gwas_nominal]

def load_chrom_result(chrom, days, in_dir, cohort, term):
    """Load one chromosome's per-day ieQTL pairs for phenotypes of interest.

    A phenotype is "of interest" when, on any day, it has a pair whose
    variant is in the notebook-level `variants_oi_df['SNP']` and whose
    `pval_gi` is below the notebook-level `max_pvalue`.

    chrom: chromosome label used in the parquet file name.
    days: iterable of day labels; one parquet file is read per day.
    in_dir, cohort, term: remaining components of the parquet file name.

    Returns (phenos_oi, df): the list of phenotype ids of interest, each
    listed once in first-seen order, and the concatenation of every pair for
    those phenotypes across all days with a 'day' column added. df is None
    when `days` is empty.
    """
    def _chrom_file(day):
        return f'{in_dir}/{cohort}.{day}.{term}.cis_qtl_pairs.chr{chrom}.parquet'

    # first pass: find every phenotype that could be captured.
    # A dict is used as an ordered set so a phenotype found on several days
    # is listed once; duplicates would make callers test the same gene again.
    phenos_seen = {}
    for day in days:
        chrom_eqtl_df = pd.read_parquet(_chrom_file(day))
        oi_results = chrom_eqtl_df.loc[
            (chrom_eqtl_df['pval_gi'] < max_pvalue)
            & (chrom_eqtl_df['variant_id'].isin(variants_oi_df['SNP']))]
        phenos_seen.update(dict.fromkeys(oi_results['phenotype_id'].unique()))
    phenos_oi = list(phenos_seen)

    # second pass: keep all results that belong to those phenotypes.
    # The files are re-read rather than cached so only one day's full table
    # is held in memory at a time.
    day_frames = []
    for day in days:
        chrom_eqtl_df = pd.read_parquet(_chrom_file(day))
        possible_results_oi = chrom_eqtl_df.loc[
            chrom_eqtl_df['phenotype_id'].isin(phenos_oi)].copy()
        possible_results_oi['day'] = day
        day_frames.append(possible_results_oi)
    df_to_return = pd.concat(day_frames) if day_frames else None
    return phenos_oi, df_to_return

def process_regression_check(chrom, days, in_dir, cohort, term, gwas_df):
    """Keep eQTL/GWAS merged results whose t-scores are positively related.

    For every phenotype of interest on `chrom`, the eQTL pairs are merged
    with `gwas_df` and the absolute GWAS t-score is regressed on the absolute
    eQTL t-score. A phenotype's merged rows are kept when the coefficient is
    positive and its p-value is below the notebook-level `alpha_value`.

    chrom, days, in_dir, cohort, term: passed through to load_chrom_result.
    gwas_df: GWAS summary statistics to merge against.

    Returns the concatenated kept rows, or None when nothing was kept.
    """
    frames_to_keep = []
    phenos_oi, results_to_test = load_chrom_result(chrom, days, in_dir, cohort, term)
    print(f'chr {chrom} shape {results_to_test.shape}')

    # test the GWAS~eQTL regression for possible significance;
    # dict.fromkeys de-duplicates while keeping order, so a phenotype listed
    # more than once is neither re-tested nor kept twice
    for phenotype_id in dict.fromkeys(phenos_oi):
        alpha_df = create_merged_df_to_regress(results_to_test, phenotype_id, gwas_df)
        # nothing overlaps the GWAS at nominal significance: no model to fit
        if alpha_df.empty:
            continue

        coef, _stderr, _term_cnt, p_value = regress_tscores_keep(alpha_df)
        # must have positive coefficient and nominally significant p-value
        if coef > 0 and p_value < alpha_value:
            frames_to_keep.append(alpha_df)
    return pd.concat(frames_to_keep) if frames_to_keep else None

#### load the gencode annotations

In [None]:
%%time
gencode_df = pd.read_pickle(gencode_pkl)
# remove the columns whose names start with 'ont:' or 'tag:'
discard_cols = gencode_df.columns[gencode_df.columns.str.startswith('ont:')
                                  | gencode_df.columns.str.startswith('tag:')]
gencode_df = gencode_df.drop(columns=discard_cols)
# keep 'chr'-prefixed sequences and force out chrX, chrY and chrM, since
# annotation quirks sometimes let non-autosomal entries through
gencode_df = gencode_df.loc[
    gencode_df['seqname'].str.startswith('chr')
    & ~gencode_df['seqname'].isin(['chrX', 'chrY', 'chrM'])
]
# misc_RNA genes occasionally map to every chromosome, so drop them
gencode_df = gencode_df.loc[~gencode_df['gene_type'].isin(['misc_RNA'])]
print(gencode_df.shape)
display(gencode_df.head())

#### load the risk variants of interest

In [None]:
# tab-separated table of risk variants; its 'SNP' column is what the eQTL
# variant ids are matched against elsewhere in the notebook
variants_oi_df = pd.read_csv(meta5_st2_clean_file, sep='\t')
print(variants_oi_df.shape)
# trailing expression so the notebook renders the first rows
variants_oi_df.head()

#### load the full gwas summary stats

In [None]:
%%time
# full GWAS summary statistics; this frame is what gets merged with the
# eQTL pairs for the regression check
gwas_stats_df = pd.read_hdf(meta5_stats_file)
print(gwas_stats_df.shape)
display(gwas_stats_df.head())

#### load the replicated eQTL

In [None]:
eqtl_df = pd.read_csv(eqtl_results_file)
print(eqtl_df.shape)
# count of distinct phenotype ids (a missing value, if any, counts as one)
number_genes = eqtl_df['phenotype_id'].nunique(dropna=False)
print(f'replicated eqtl for {number_genes} genes')
eqtl_df.head()

#### which days are present in the results (typically will be all in cohort)

In [None]:
# distinct day labels found in the eQTL results, in sorted order
days = sorted(eqtl_df['day'].unique())
print(days)

#### grab the max pval_gi

In [None]:
# largest interaction p-value among the results flagged as significant
max_nominal_pvalue = eqtl_df.loc[eqtl_df['is_sig'] == 1, 'pval_gi'].max()
# alpha divided by the number of genes tested; uses the shared alpha_value
# constant so this threshold and the fallback below cannot drift apart
try:
    max_cnt_pvalue = alpha_value/number_genes
except ZeroDivisionError:
    print('number of genes less than one, switching to raw alpha value')
    max_cnt_pvalue = alpha_value

print(f'max_nominal_pvalue == {max_nominal_pvalue}')
print(f'max_cnt_pvalue == {max_cnt_pvalue}')

# take the more permissive of the two thresholds; written as a conditional
# so that a NaN max_nominal_pvalue (no significant rows) falls back to
# max_cnt_pvalue
max_pvalue = max_nominal_pvalue if max_nominal_pvalue > max_cnt_pvalue else max_cnt_pvalue
print(f'max nominal pvalue {max_pvalue}')
# NOTE(review): the computed threshold is deliberately overridden by a fixed
# value here; this is the value the later cells actually see
max_pvalue = 0.001
print(f'max hardcoded pvalue {max_pvalue}')

#### see if any of the top significant results happen to be risk independent variants

In [None]:
def check_eqtls_for_variants(eqtl_df, variants_df, gencode_df):
    """Report and return the eQTL rows whose variant is a variant of interest.

    eqtl_df: eQTL results with 'variant_id' and 'phenotype_id' columns.
    variants_df: variants of interest, identified by the 'SNP' column.
    gencode_df: annotation with 'gene_id' and 'gene_name' columns, used only
        to print the names of the matched genes.

    Prints the shape of the matched rows, the distinct variants, the number
    of distinct phenotypes and the matching gene names, then returns the
    matched rows of `eqtl_df`.
    """
    is_variant_oi = eqtl_df['variant_id'].isin(variants_df['SNP'])
    eqtl_variants_oi_df = eqtl_df.loc[is_variant_oi]
    print(eqtl_variants_oi_df.shape)

    matched_variants = eqtl_variants_oi_df['variant_id'].unique()
    print(f'variants {len(matched_variants)}')
    print(matched_variants)

    matched_phenos = eqtl_variants_oi_df['phenotype_id'].unique()
    print(f'genes {len(matched_phenos)}')

    is_matched_gene = gencode_df['gene_id'].isin(eqtl_variants_oi_df['phenotype_id'])
    oi_genes = gencode_df.loc[is_matched_gene, 'gene_name'].unique()
    print(oi_genes)
    return eqtl_variants_oi_df

In [None]:
# eQTL rows whose variant is one of the risk variants of interest
risk_genes = check_eqtls_for_variants(eqtl_df, variants_oi_df, gencode_df)
# only render a preview when at least one row matched
if len(risk_genes):
    display(risk_genes.head())

#### now load rest of results and see if risk index variants are eQTL

In [None]:
import warnings
# alternative kept for reference: show each distinct warning once
# warnings.filterwarnings(action='once')
# suppress every warning from here on in this session
warnings.filterwarnings('ignore')

In [None]:
%%time

# futures for the per-chromosome checks, and the list their results are
# gathered into by the following cell
fs_list = []
lm_results = []
# one process_regression_check job per autosome; leaving the `with` block
# waits until every submitted job has finished
with concurrent.futures.ProcessPoolExecutor(max_workers=max_threads*2) as ppe:
    fs_list.extend(
        ppe.submit(process_regression_check, chrom, days, tensorqtl_dir,
                   cohort, interaction_term, gwas_stats_df)
        for chrom in autosomes
    )

In [None]:
# Rebuild the list from the futures instead of appending to it, so running
# this cell again does not duplicate every result. Iterating the futures in
# submission order gives the same row order on every run; result() waits for
# a job if it has not finished and re-raises any exception from the worker.
lm_results = [future.result() for future in fs_list]

# flatten the list; a chromosome with nothing to keep returns None, so skip it
# (pd.concat raises ValueError if no chromosome produced any result)
results_to_keep = pd.concat([item for item in lm_results if item is not None])

In [None]:
# size and first rows of the merged results that passed the regression check
print(results_to_keep.shape)
display(results_to_keep.head())

In [None]:
# number of distinct phenotype ids kept, followed by the ids themselves
print(len(results_to_keep['phenotype_id'].unique()))
print(results_to_keep['phenotype_id'].unique())

In [None]:
# distinct gene names of the annotation rows whose gene_id was kept
genes_kept = gencode_df.loc[
    gencode_df['gene_id'].isin(results_to_keep['phenotype_id']),
    'gene_name',
].unique()
print(len(genes_kept))
print(genes_kept)

In [None]:
import random
# pick one kept phenotype at random to inspect; no seed is set, so the
# choice differs between runs
phenotype_id = random.choice(results_to_keep['phenotype_id'].unique())
print(phenotype_id)

In [None]:
# gene name(s) annotated for the selected phenotype id
gencode_df.loc[gencode_df['gene_id'] == phenotype_id, 'gene_name'].unique()

In [None]:
# uncomment one of these to inspect a specific phenotype instead
# phenotype_id = 'ENSG00000143537.13'
# phenotype_id = 'ENSG00000164733.20'
# kept rows belonging to the selected phenotype
temp = results_to_keep[results_to_keep['phenotype_id'] == phenotype_id]
print(temp.shape)
display(temp.head())

In [None]:
# scatter with a fitted regression line per day for the selected phenotype
sns.lmplot(x='gwas_tscore_abs', y='eqtl_tscore_abs', hue='day', data=temp)

In [None]:
# same two scores as a plain relational plot, coloured by day, with
# semi-transparent points
sns.relplot(x='gwas_tscore_abs', 
            y='eqtl_tscore_abs', 
            hue='day', alpha=.5, palette="dark", 
            height=12, data=temp) 

In [None]:
# refit, for the selected phenotype only, the same mixed model used in the
# regression check: GWAS score on eQTL score with 'day' as the grouping column
this_formula = 'gwas_tscore_abs ~ eqtl_tscore_abs'
grouping = 'day'
result = mixed_model(this_formula, temp, grouping)
print(result.summary())

In [None]:
# statistics for the eQTL score term of the fit above
term = 'eqtl_tscore_abs'
# NOTE(review): five labels are printed here but only four values follow
# (coefficient, standard error, parameter count, p-value)
print(['feature', 'coef', 'stderr', 'term_cnt', 'p-value'])
print(result.params[term], result.bse[term], 
      result.params.shape[0], result.pvalues[term])

In [None]:
# ordinary least squares on the same data, without accounting for day
this_formula = 'gwas_tscore_abs ~ eqtl_tscore_abs'
reg_model = smf.ols(this_formula, data=temp).fit()
print(reg_model.summary())

In [None]:
# ordinary least squares with day added as a fixed-effect covariate
this_formula = 'gwas_tscore_abs ~ eqtl_tscore_abs + day'
reg_model = smf.ols(this_formula, data=temp).fit()
print(reg_model.summary())

In [None]:
# ordinary least squares with a score-by-day interaction; in the formula
# language `a * b` already expands to `a + b + a:b`, so the explicit main
# effects listed here are redundant but harmless
this_formula = 'gwas_tscore_abs ~ eqtl_tscore_abs + day + eqtl_tscore_abs * day'
reg_model = smf.ols(this_formula, data=temp).fit()
print(reg_model.summary())

In [None]:
# preview the rows for the selected phenotype
temp.head()

In [None]:
# numeric day parsed from the day label by stripping the literal 'da' prefix
# (assumes labels of the form 'da<N>'). assign() builds a new frame, so the
# column is not written onto a slice of results_to_keep.
temp = temp.assign(
    daynum=temp['day'].str.replace('da', '', regex=False).astype('int32'))
# `(1|daynum)` is lme4 random-intercept syntax. Inside a patsy/OLS formula it
# is evaluated as a Python bitwise OR and becomes an ordinary numeric
# regressor, so the random intercept per day is fitted with mixedlm and an
# explicit grouping variable instead.
this_formula = 'gwas_tscore_abs ~ eqtl_tscore_abs'
reg_model = smf.mixedlm(this_formula, data=temp, groups=temp['daynum']).fit()
print(reg_model.summary())