## Notebook to scan *cis* modality correlations for exogenous features containing GWAS risk variants

In [None]:
# run the shell `date` command so the notebook output records when it was run
!date

#### import libraries

In [None]:
from pandas import read_csv, read_parquet, DataFrame, concat
import concurrent.futures

#### set notebook variables

In [None]:
# parameters
# Left as empty strings here; presumably overridden at run time by a notebook
# runner (e.g. papermill) -- TODO confirm against the launching script.
# All three values are interpolated into `set_name`, which in turn names the
# top cis results input file and the results output file.
day = ''
# modality reported as "exogenous features" (the `variant_id` column of the results)
exogenous = ''
# modality reported as "endogenous features" (the `phenotype_id` column of the results)
endogenous = ''

In [None]:
# run naming: the cohort plus the notebook parameters identify this result set
cohort = 'foundin'
set_name = f'{cohort}_{day}_{endogenous}-{exogenous}'

# directory layout, every sub-directory hangs off the working directory
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
quants_dir, tensorqtl_dir, results_dir = (
    f'{wrk_dir}/{sub_dir}' for sub_dir in ('quants', 'tensorqtl', 'results')
)

# inputs
risk_features_file = f'{quants_dir}/{cohort}_risk_peaks.bed'
top_cis_results_files = f'{tensorqtl_dir}/{set_name}.cis.map.csv'

# output
results_file = f'{results_dir}/{set_name}.risk.cis.csv'

# constants
DEBUG = False
alpha_value = 0.05

# echo the resolved paths when debugging
if DEBUG:
    for var_name, var_value in (('risk_features_file', risk_features_file),
                                ('top_cis_results_files', top_cis_results_files),
                                ('results_file', results_file)):
        print(f'{var_name} = {var_value}')

### load the input data

#### load the risk features

In [None]:
# read the risk features table and pull out its `name` column
risk_feats_df = read_csv(risk_features_file)
risk_names = risk_feats_df['name']
print(f'risk features shape: {risk_feats_df.shape}')
print(f'number of unique risk features: {risk_names.nunique()}')

# the bed repeats a name for every region it covers, so keep each name once
risk_features = risk_names.unique()

if DEBUG:
    display(risk_feats_df.head())

#### load the *cis* correlations top results

In [None]:
# read the top cis results table for this set and report its dimensions
top_results_df = read_csv(top_cis_results_files)
top_results_shape = top_results_df.shape
print(f'shape of top cis results: {top_results_shape}')

# peek at the first rows when debugging
if DEBUG:
    display(top_results_df.head())

### approximate a max nominal p-value from the full FDR results

In [None]:
# largest nominal p-value among rows whose bh_fdr is below alpha
passes_fdr = top_results_df.bh_fdr < alpha_value
fdr_max_pvalue = top_results_df.loc[passes_fdr, 'pval_nominal'].max()
print(fdr_max_pvalue)

# same idea, but using rows where the single feature pval_perm is suggestive
passes_perm = top_results_df.pval_perm < alpha_value
perm_max_pvalue = top_results_df.loc[passes_perm, 'pval_nominal'].max()
print(perm_max_pvalue)

# the two values above are printed for reference only; the threshold that is
# actually used from here on is this fixed value
max_pvalue = 0.01
print(f'hardcoded max p-value: {max_pvalue}')

### see if any top suggestive results happen to be risk features

In [None]:
# keep top results whose variant_id is a risk feature and whose pval_perm is below alpha
is_risk_feature = top_results_df.variant_id.isin(risk_features)
is_suggestive = top_results_df.pval_perm < alpha_value
risk_results_df = top_results_df.loc[is_risk_feature & is_suggestive]
print(risk_results_df.shape)

# extra detail on the retained rows when debugging
if DEBUG:
    display(risk_results_df.head())
    display(risk_results_df.bh_fdr.describe())

In [None]:
# last expression in the cell, so the notebook renders just these two columns
risk_results_df[['phenotype_id', 'variant_id']]

### now load rest of results and see if risk features are correlated

In [None]:
import warnings
# alternative kept for reference: show each distinct warning only once
# warnings.filterwarnings(action='once')
# suppress every warning raised from this point on
warnings.filterwarnings('ignore')

In [None]:
def load_chrom_result(chrom, in_dir, name, features, pval: float) -> DataFrame:
    """Load one chromosome's cis pairs and keep the rows of interest.

    Reads ``{in_dir}/{name}.cis_qtl_pairs.{chrom}.parquet`` and returns the
    rows whose ``pval_nominal`` is strictly below ``pval`` and whose
    ``variant_id`` is one of ``features``.

    Args:
        chrom: chromosome label as it appears in the parquet file name.
        in_dir: directory holding the per-chromosome parquet files.
        name: result-set name used as the file name prefix.
        features: collection of values to match against ``variant_id``.
        pval: exclusive upper bound on ``pval_nominal``.

    Returns:
        The matching rows with all columns of the parquet file, in their
        original order and with their original index labels.
    """
    pairs_df = read_parquet(f'{in_dir}/{name}.cis_qtl_pairs.{chrom}.parquet')
    # a row is kept only when both conditions hold, so every phenotype paired
    # with a requested feature under the threshold is captured
    below_threshold = pairs_df.pval_nominal < pval
    wanted_feature = pairs_df.variant_id.isin(features)
    return pairs_df.loc[below_threshold & wanted_feature]

In [None]:
%%time

risk_chroms = risk_feats_df.chrom.unique()
fs_list = []
lm_results = []
with concurrent.futures.ProcessPoolExecutor() as ppe:
    for chrom in risk_chroms:
        fs_list.append(ppe.submit(load_chrom_result, chrom, 
                                  tensorqtl_dir, set_name, 
                                  risk_features, max_pvalue))

In [None]:
# Rebuild the list from scratch each time this cell runs; appending to a list
# created in another cell would duplicate every row if the cell is re-executed.
# Futures are read in submission order (result() blocks until a future is
# done), so the combined rows come out in the same order on every run instead
# of the arbitrary completion order.
lm_results = [future.result() for future in fs_list]

# stack the per-chromosome frames into one table
results_to_keep = concat(lm_results)

#### summary counts from possible risk results

In [None]:
# overall size, then how many distinct features appear on each side
print(f'results shape: {results_to_keep.shape}')
n_endogenous = results_to_keep['phenotype_id'].nunique()
print(f'number unique endogenous features: {n_endogenous}')
n_exogenous = results_to_keep['variant_id'].nunique()
print(f'number unique exogenous features: {n_exogenous}')
# list the actual identifiers when debugging
if DEBUG:
    display(results_to_keep.head())
    for id_column in ('phenotype_id', 'variant_id'):
        print(results_to_keep[id_column].unique())

### save the results

In [None]:
# write the retained results to the results file as CSV, leaving out the DataFrame index
results_to_keep.to_csv(results_file, index=False)