## Notebook to check the CpG sites identified in the EWAS of PD/LBD Lewy body pathology in the FOUNDIN-PD epigenetic analyses

Pihlstrøm L, Shireby G, Geut H et al. Epigenome-wide association study of human frontal cortex identifies differential methylation in Lewy body pathology. Nat Commun 2022;13:4932.
https://pubmed.ncbi.nlm.nih.gov/35995800/

In [None]:
# shell escape: print the current date/time in this first cell of the notebook
!date

#### import libraries

In [None]:
import statsmodels.stats.multitest as smm
from pandas import DataFrame, read_parquet, concat, read_csv
import concurrent.futures
from os.path import exists

#### set notebook variables

In [None]:
# cohort label used when building result file names
cohort = 'foundin'

# directory layout, everything hangs off the working directory
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'

# per-chromosome result file template, filled in with str.format()
qtl_file_frmt = '{dir}/foundin_{day}_{pair}.cis_qtl_pairs.chr{chrom}.parquet'

# CpG sites of interest that are looked up in the results
cpg_sites = ['cg07107199', 'cg14511218', 'cg09985192', 'cg04011470']
# methylation data only exists for da0 and da65
days = ['da0', 'da65']
# analysis pairings whose results are checked
result_pairs = ['ATAC-METH', 'CIRC-METH', 'PDUI-METH', 'RNAB-METH', 'RNAS-METH']
# B&H FDR threshold for calling a result significant
alpha_value = 0.05
# passed as the verbose flag when loading results
DEBUG = False
# chromosome labels '1' through '22', as strings for the file name template
AUTOSOMES = list(map(str, range(1, 23)))

#### utility functions

In [None]:
def compute_fdr(pvalues):
    """Return the B&H FDR adjusted p-values for the given p-values.

    statsmodels' fdrcorrection returns a (rejected, corrected p-values) pair;
    only the corrected p-values are handed back.
    """
    _rejected, adjusted_pvalues = smm.fdrcorrection(pvalues)
    return adjusted_pvalues

def read_qtl_results(in_file: str) -> DataFrame:
    """Read one parquet results file and tag each row with its pair label.

    Args:
        in_file: path of the parquet file to read.

    Returns:
        The frame that was read, with an added 'cispair' column holding
        '<phenotype_id>:<variant_id>' for each row.
    """
    pairs_df = read_parquet(in_file)
    pair_labels = pairs_df['phenotype_id'] + ':' + pairs_df['variant_id']
    pairs_df['cispair'] = pair_labels
    return pairs_df

def read_all_qtl_results(day: str, pair: str, verbose: bool=False) -> DataFrame:
    """Load and combine the per-chromosome result files for one day and pairing.

    One file path is built per entry of AUTOSOMES from qtl_file_frmt, rooted
    at tensorqtl_dir. Paths that do not exist are skipped; the rest are read
    concurrently with read_qtl_results and concatenated in chromosome
    (submission) order, so the row order of the result is the same on every
    run.

    Args:
        day: day label substituted into the file name, e.g. 'da0'.
        pair: pairing label substituted into the file name, e.g. 'ATAC-METH'.
        verbose: when True, report skipped chromosomes and display a random
            sample of up to five combined rows.

    Returns:
        DataFrame holding the rows of every chromosome file that was found.

    Raises:
        ValueError: if no result file exists for any chromosome.
    """
    fs_list = []
    missing_chroms = []
    with concurrent.futures.ThreadPoolExecutor() as tpe:
        for chrom in AUTOSOMES:
            this_result_file = qtl_file_frmt.format(dir=tensorqtl_dir, day=day,
                                                    pair=pair, chrom=chrom)
            if exists(this_result_file):
                fs_list.append(tpe.submit(read_qtl_results, this_result_file))
            else:
                missing_chroms.append(chrom)
    if not fs_list:
        # concat([]) would raise an opaque 'No objects to concatenate'; say what is missing
        raise ValueError(f'no {pair} qtl result files found for {day} in {tensorqtl_dir}')
    # leaving the with-block waits for every task, so all futures are done here;
    # collecting in submission order keeps the chromosome order deterministic
    lm_results = [future.result() for future in fs_list]
    # combine the read results
    qtl_df = concat(lm_results)
    print(f'{pair} qtl results shape {qtl_df.shape}')
    if verbose:
        if missing_chroms:
            print(f'{pair} {day} no result file for chromosomes: {missing_chroms}')
        # sample() raises when asked for more rows than the frame holds
        display(qtl_df.sample(min(5, len(qtl_df))))
    return qtl_df

### check significant results for CpG sites of interest

for each day in each analysis pairing

In [None]:
%%time
for pairing in result_pairs:
    for day in days:
        print(day, pairing)
        # combined per-chromosome results for this day and pairing
        results_df = read_all_qtl_results(day, pairing, verbose=DEBUG)
        # B&H FDR over all loaded results; missing p-values are counted as 1
        nominal_pvalues = results_df['pval_nominal'].fillna(1)
        results_df['bh_fdr'] = compute_fdr(nominal_pvalues)
        # rows that involve a CpG site of interest and pass the FDR threshold
        is_site_of_interest = results_df['variant_id'].isin(cpg_sites)
        passes_fdr = results_df['bh_fdr'] <= alpha_value
        sig_results = results_df.loc[is_site_of_interest & passes_fdr]
        print(sig_results.shape)
        display(sig_results['variant_id'].value_counts())
        print(sig_results['cispair'].unique())
        # large tables are shown as a random sample of 10 rows, small ones in full
        if sig_results.shape[0] >= 50:
            display(sig_results.sample(10))
        else:
            display(sig_results)

### check the CpG sites for detected mQTL

In [None]:
for day in days:
    print(day)
    # per-day METH cis map results
    mqtl_file = f'{tensorqtl_dir}/foundin_{day}_METH.cis.map.csv'
    mqtl_df = read_csv(mqtl_file)
    # keep only the rows whose phenotype is one of the CpG sites of interest
    is_site_of_interest = mqtl_df['phenotype_id'].isin(cpg_sites)
    sites_mqtl = mqtl_df[is_site_of_interest]
    # smallest FDR first, ties broken by nominal p-value
    display(sites_mqtl.sort_values(by=['bh_fdr', 'pval_nominal']))

### check the CpG sites correlated with differentiation status

In [None]:
diff_metrics = ['DopaminergicNeurons', 'TH_Pel-Freez_ICC', 'daynum', 'DAn', 'MAP2_Santa_Cruz_ICC']
for diff_metric in diff_metrics:
    print(diff_metric)
    # one lmm results file per differentiation metric
    this_file = f'{results_dir}/{cohort}_METH_{diff_metric}_lmm.csv'
    results = read_csv(this_file)
    # rows for the CpG sites of interest, smallest FDR first then p-value
    site_rows = results[results['feature'].isin(cpg_sites)]
    display(site_rows.sort_values(by=['bh_fdr', 'p-value']))

In [None]:
# shell escape: print the current date/time in this last cell of the notebook
!date