## Notebook to check the CpG sites identified in EWAS of PD/LBD Lewy Body Pathology in FOUNDIN-PD Epigenetic Analyses

Pihlstrøm L, Shireby G, Geut H et al. Epigenome-wide association study of human frontal cortex identifies differential methylation in Lewy body pathology. Nat Commun 2022;13:4932.
https://pubmed.ncbi.nlm.nih.gov/35995800/

In [1]:
!date

Tue Feb 28 16:17:43 UTC 2023


#### import libraries

In [2]:
import statsmodels.stats.multitest as smm
from pandas import DataFrame, read_parquet, concat
import concurrent.futures
from os.path import exists

#### set notebook variables

In [3]:
# --- locations ---
# root working directory and the sub-directory holding the tensorQTL output
wrk_dir = '/home/jupyter/foundin_qtl'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'

# --- inputs ---
# template for the per-chromosome cis pair result files; filled in with
# dir, day, pair and chrom
qtl_file_frmt = '{dir}/foundin_{day}_{pair}.cis_qtl_pairs.chr{chrom}.parquet'

# --- constants ---
# CpG sites of interest that are looked up in the results
cpg_sites = ['cg07107199', 'cg14511218', 'cg09985192', 'cg04011470']
# methylation data only exists for da0 and da65
days = ['da0', 'da65']
# modality pairings whose results are checked
result_pairs = ['ATAC-METH', 'CIRC-METH', 'PDUI-METH', 'RNAB-METH', 'RNAS-METH']
# threshold applied to the B&H FDR values
alpha_value = 0.05
# when True, extra frames are displayed while running
DEBUG = True
# chromosome labels '1' through '22'
AUTOSOMES = list(map(str, range(1, 23)))

#### utility functions

In [4]:
def compute_fdr(pvalues):
    """Compute B&H FDR adjusted values for the given p-values.

    Args:
        pvalues: the p-values to correct, passed straight to
            statsmodels' ``fdrcorrection``.

    Returns:
        The second element of the ``fdrcorrection`` result, i.e. the
        corrected p-values.
    """
    # fdrcorrection yields (reject flags, corrected p-values); only the latter is used
    _rejected, adjusted_pvalues = smm.fdrcorrection(pvalues)
    return adjusted_pvalues

def read_qtl_results(in_file: str) -> DataFrame:
    """Read one parquet file of QTL results and label each tested pair.

    Args:
        in_file: path of the parquet file to load.

    Returns:
        The loaded frame with an extra ``cispair`` column holding
        ``<phenotype_id>:<variant_id>`` for every row.
    """
    loaded_df = read_parquet(in_file)
    # build the pair label from the two identifier columns
    pair_labels = loaded_df['phenotype_id'] + ':' + loaded_df['variant_id']
    return loaded_df.assign(cispair=pair_labels)

def read_all_qtl_results(day: str, pair: str, verbose: bool=False) -> DataFrame:
    """Load and combine the per-chromosome QTL results for one day and pairing.

    The expected file for each entry of ``AUTOSOMES`` is built from
    ``qtl_file_frmt`` and ``tensorqtl_dir``. Files that exist are read in
    parallel threads with ``read_qtl_results`` and concatenated in chromosome
    order, so the row order of the combined frame is the same on every run.

    Args:
        day: differentiation day label used in the file name, e.g. ``'da65'``.
        pair: analysis pairing label used in the file name, e.g. ``'RNAB-METH'``.
        verbose: when True, display a random sample of five combined rows
            (relies on the notebook's ``display``).

    Returns:
        The concatenation of all per-chromosome result frames that were found.

    Raises:
        ValueError: if none of the expected result files exist.
    """
    found_files = []
    missing_chroms = []
    for chrom in AUTOSOMES:
        this_result_file = qtl_file_frmt.format(dir=tensorqtl_dir, day=day,
                                                pair=pair, chrom=chrom)
        if exists(this_result_file):
            found_files.append(this_result_file)
        else:
            missing_chroms.append(chrom)
    if not found_files:
        # fail with context instead of concat's bare "No objects to concatenate"
        file_pattern = qtl_file_frmt.format(dir=tensorqtl_dir, day=day,
                                            pair=pair, chrom='*')
        raise ValueError(f'no {day} {pair} qtl result files found matching {file_pattern}')
    if missing_chroms:
        # missing files are still skipped, but no longer silently, since any
        # downstream statistics would be based on an incomplete set of tests
        missing_label = ', '.join(missing_chroms)
        print(f'{day} {pair} no result file for chromosomes: {missing_label}')
    with concurrent.futures.ThreadPoolExecutor() as tpe:
        # Executor.map returns results in input order, unlike as_completed
        lm_results = list(tpe.map(read_qtl_results, found_files))
    # combine the read results
    qtl_df = concat(lm_results)
    print(f'{pair} qtl results shape {qtl_df.shape}')
    if verbose:
        display(qtl_df.sample(5))
    return qtl_df

### check significant results for CpG sites of interest

for each day in each analysis pairing

In [5]:
%%time
for pairing in result_pairs:
    for day in days:
        print(day, pairing)
        # read every per-chromosome result for this day and pairing
        results_df = read_all_qtl_results(day, pairing, verbose=DEBUG)
        # B&H FDR over all tested pairs; missing nominal p-values count as 1
        nominal_pvalues = results_df['pval_nominal'].fillna(1)
        results_df['bh_fdr'] = compute_fdr(nominal_pvalues)
        # keep rows for the CpG sites of interest that pass the FDR threshold
        is_site_of_interest = results_df['variant_id'].isin(cpg_sites)
        passes_fdr = results_df['bh_fdr'] <= alpha_value
        sig_results = results_df.loc[is_site_of_interest & passes_fdr]
        print(sig_results.shape)
        display(sig_results['variant_id'].value_counts())
        print(sig_results['cispair'].unique())
        if DEBUG:
            # show all rows when there are few, otherwise a random ten
            display(sig_results if len(sig_results) < 50 else sig_results.sample(10))

da0 ATAC-METH
ATAC-METH qtl results shape (121619951, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
2768922,chr20_48143074_48144210,cg06398873,-649424,0.251038,48,28,0.108796,-0.176782,0.109203,chr20_48143074_48144210:cg06398873
1247808,chr3_156174326_156174817,cg05511503,-429124,0.237217,34,20,0.739332,0.0336,0.100685,chr3_156174326_156174817:cg05511503
3656805,chr6_3211056_3212337,cg00101728,-258263,0.234398,32,18,0.808074,-0.023447,0.096258,chr6_3211056_3212337:cg00101728
5017117,chr8_17805735_17806329,cg18736775,585404,0.254101,50,29,0.358748,0.114971,0.124667,chr8_17805735_17806329:cg18736775
5662289,chr6_35341430_35343683,cg08207604,-43899,0.238253,44,25,0.126059,-0.155297,0.10062,chr6_35341430_35343683:cg08207604


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da65 ATAC-METH
ATAC-METH qtl results shape (125327849, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
3579260,chr12_132401235_132402231,cg09075307,-338535,0.237423,25,14,0.013532,0.375039,0.148421,chr12_132401235_132402231:cg09075307
848152,chr18_31579251_31580089,cg16831596,-237028,0.23817,33,18,0.77901,-0.028008,0.099466,chr18_31579251_31580089:cg16831596
742595,chr20_46814543_46815416,cg09704054,-990021,0.243005,40,22,0.106499,-0.193143,0.118275,chr20_46814543_46815416:cg09704054
215362,chr5_67878886_67880012,cg02876332,-875377,0.271163,52,31,0.067797,0.198754,0.107314,chr5_67878886_67880012:cg02876332
3464356,chr16_86772132_86773680,cg02312706,578861,0.24386,36,21,0.550397,0.060363,0.100643,chr16_86772132_86773680:cg02312706


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da0 CIRC-METH
CIRC-METH qtl results shape (323588, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
5841,chr13:45985297|46020557,cg20268292,-493839,0.364777,68,50,0.693617,-0.075849,0.191718,chr13:45985297|46020557:cg20268292
3079,chr18:9182382|9221999,cg13287668,-463783,0.302823,58,37,0.642639,-0.047131,0.10112,chr18:9182382|9221999:cg13287668
22519,chr1:114463104|114464388,cg09229169,-413342,0.249624,36,21,0.258581,0.130205,0.114287,chr1:114463104|114464388:cg09229169
6298,chr7:138519189|138551180,cg00733782,-59250,0.278342,50,30,0.847137,-0.03619,0.187021,chr7:138519189|138551180:cg00733782
527,chr16:81351583|81377702,cg17621960,296036,0.260909,43,25,0.929868,0.017735,0.200763,chr16:81351583|81377702:cg17621960


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da65 CIRC-METH
CIRC-METH qtl results shape (751605, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
18460,chr10:7243558|7285954,cg20292154,-56690,0.23129,21,11,0.159772,-0.327799,0.230278,chr10:7243558|7285954:cg20292154
1166,chr3:134158121|134195182,cg03745859,-524605,0.248481,35,19,0.460784,-0.114423,0.154141,chr3:134158121|134195182:cg03745859
36816,chr11:74132844|74133557,cg05508864,503677,0.251705,30,17,0.234949,0.26692,0.222478,chr11:74132844|74133557:cg05508864
39002,chr3:167554650|167627292,cg03237828,-106335,0.25366,35,19,0.413655,0.213547,0.259412,chr3:167554650|167627292:cg03237828
62648,chr1:1815756|1839238,cg22127114,693075,0.258643,32,18,0.478897,-0.157039,0.220396,chr1:1815756|1839238:cg22127114


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da0 PDUI-METH
PDUI-METH qtl results shape (30091302, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
2735267,chr1:1635263-1635856,cg02050917,606869,0.236055,36,20,0.419,-0.093774,0.115508,chr1:1635263-1635856:cg02050917
2809696,chr1:156699618-156700076,cg10167065,74321,0.243802,42,24,0.032222,0.203518,0.093571,chr1:156699618-156700076:cg10167065
472883,chr9:35095144-35095566,cg13842293,763949,0.251082,44,26,0.656686,0.094396,0.211668,chr9:35095144-35095566:cg13842293
898292,chr3:72749277-72750836,ch.3.72737075R,-144043,0.245101,41,23,0.508978,0.075686,0.114149,chr3:72749277-72750836:ch.3.72737075R
2880660,chr1:155236182-155236469,cg01588776,675317,0.256639,50,29,0.043351,-0.217065,0.105942,chr1:155236182-155236469:cg01588776


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da65 PDUI-METH
PDUI-METH qtl results shape (32439606, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
117839,chr19:11342778-11343120,cg26616731,89933,0.253744,44,25,0.84645,0.030957,0.159332,chr19:11342778-11343120:cg26616731
2500934,chr17:80039812-80040535,cg15710638,-97654,0.243235,35,20,0.152942,-0.172645,0.119619,chr17:80039812-80040535:cg15710638
150867,chr15:77046030-77046927,cg08806156,574771,0.316538,65,44,0.041742,-0.398882,0.192677,chr15:77046030-77046927:cg08806156
500245,chr3:105658283-105659229,cg03220206,211217,0.26333,50,29,0.024906,-0.257249,0.112482,chr3:105658283-105659229:cg03220206
2568832,chr17:81868631-81868943,cg06380072,-147827,0.312621,73,47,0.033299,-0.19493,0.08996,chr17:81868631-81868943:cg06380072


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da0 RNAB-METH
RNAB-METH qtl results shape (24383455, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
1073128,CSTF3-DT,cg08039343,35117,0.249932,45,26,0.001082,0.378022,0.111964,CSTF3-DT:cg08039343
420134,lnc-ERG28-2,cg19900098,63867,0.267439,55,33,0.977835,0.002862,0.102725,lnc-ERG28-2:cg19900098
533658,ENSG00000135974.9,cg26154135,767449,0.249336,43,26,0.382577,-0.082846,0.094422,ENSG00000135974.9:cg26154135
53209,ENSG00000200257.1,cg02434996,-930428,0.253238,47,27,0.901731,0.014651,0.118326,ENSG00000200257.1:cg02434996
2409345,lnc-KYAT3-1,cg01912331,280064,0.264573,67,38,0.72741,0.036568,0.104585,lnc-KYAT3-1:cg01912331


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da65 RNAB-METH
RNAB-METH qtl results shape (24132201, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
49693,ENSG00000160190.13,cg03841136,-779333,0.25111,39,22,0.466727,-0.083912,0.114728,ENSG00000160190.13:cg03841136
48592,ENSG00000254702.1,cg09262961,928694,0.342619,75,52,0.025252,-0.209942,0.092023,ENSG00000254702.1:cg09262961
783920,lnc-HLA-DMA-1,cg24736274,928596,0.243352,35,20,0.143809,-0.16276,0.110226,lnc-HLA-DMA-1:cg24736274
65413,ENSG00000207342.1,cg06889165,736071,0.255742,46,26,0.574811,-0.063535,0.11278,ENSG00000207342.1:cg06889165
2421268,ENSG00000163041.9,cg17913196,45810,0.225421,26,14,0.232445,0.137172,0.113983,ENSG00000163041.9:cg17913196


(5, 11)


cg09985192    3
cg07107199    1
cg14511218    1
Name: variant_id, dtype: int64

['ENSG00000186007.9:cg07107199' 'ENSG00000123243.14:cg14511218'
 'ENSG00000151322.18:cg09985192' 'lnc-GPR33-5:cg09985192'
 'ENSG00000100852.12:cg09985192']


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr
747232,ENSG00000186007.9,cg07107199,-134595,0.267125,57,33,1.4e-05,0.412816,0.088962,ENSG00000186007.9:cg07107199,0.005827
520497,ENSG00000123243.14,cg14511218,-638389,0.255092,42,24,0.000253,-0.346786,0.090414,ENSG00000123243.14:cg14511218,0.0353
297071,ENSG00000151322.18,cg09985192,-606884,0.307589,63,41,3.8e-05,-0.392687,0.089812,ENSG00000151322.18:cg09985192,0.0111
479549,lnc-GPR33-5,cg09985192,127551,0.307589,63,41,5.4e-05,-0.441287,0.103312,lnc-GPR33-5:cg09985192,0.013992
559522,ENSG00000100852.12,cg09985192,251935,0.307589,63,41,4e-05,-0.408532,0.093786,ENSG00000100852.12:cg09985192,0.011537


da0 RNAS-METH
RNAS-METH qtl results shape (1366682, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
91977,tRNA-Pro-CGG-1-3,cg06826802,-982808,0.270941,63,38,0.214742,0.105342,0.084317,tRNA-Pro-CGG-1-3:cg06826802
3472,hsa-miR-376a-2-5p,cg00941900,-148816,0.257578,51,29,0.379502,0.140862,0.159506,hsa-miR-376a-2-5p:cg00941900
53933,tRNA-Arg-TCG-5-1,cg10372721,-27798,0.225036,26,14,0.398531,-0.092153,0.108641,tRNA-Arg-TCG-5-1:cg10372721
88469,hsa-miR-6511b-3p,cg12227352,-118623,0.271356,68,39,0.255762,0.116499,0.101865,hsa-miR-6511b-3p:cg12227352
3191,hsa-miR-590-3p,cg09398264,-206258,0.209952,12,7,0.609875,-0.051838,0.101242,hsa-miR-590-3p:cg09398264


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


da65 RNAS-METH
RNAS-METH qtl results shape (1522591, 10)


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair
54169,tRNA-Asp-GTC-2-7,cg00667898,363445,0.23988,36,20,0.834892,0.023049,0.110213,tRNA-Asp-GTC-2-7:cg00667898
20928,tRNA-Arg-TCT-5-1,cg23078194,131604,0.254118,42,24,0.997859,0.000281,0.104324,tRNA-Arg-TCT-5-1:cg23078194
64673,hsa-miR-150-5p,cg04354271,-991678,0.249693,43,24,0.826529,-0.023224,0.105616,hsa-miR-150-5p:cg04354271
13409,hsa-miR-376a-3p,cg01013868,-56616,0.248034,36,21,0.765121,-0.033889,0.113035,hsa-miR-376a-3p:cg01013868
229511,tRNA-Ser-AGA-2-1,cg06743033,221489,0.31974,69,46,0.812342,-0.018919,0.079423,tRNA-Ser-AGA-2-1:cg06743033


(0, 11)


Series([], Name: variant_id, dtype: int64)

[]


Unnamed: 0,phenotype_id,variant_id,tss_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cispair,bh_fdr


CPU times: user 8min 40s, sys: 2min 52s, total: 11min 32s
Wall time: 1h 2min


### check a single ATAC site for correlation with RNAB

In [7]:
# NOTE(review): this exploratory check is disabled (every line is commented out).
# When enabled it read the da65 RNAB-ATAC chromosome 1 results, kept the rows for
# variant_id 'chr1_205042708_205045441', and displayed them sorted by pval_nominal.
# If re-enabling, the %%time cell magic has to be the first line of the cell.
# %%time
# this_result_file = qtl_file_frmt.format(dir=tensorqtl_dir, day='da65', 
#                                         pair='RNAB-ATAC', chrom=1)
# temp_df = read_parquet(this_result_file)
# print(temp_df.shape)
# probe_results = temp_df.loc[temp_df.variant_id == 'chr1_205042708_205045441']
# print(probe_results.shape)
# display(probe_results.sort_values('pval_nominal'))

In [8]:
!date

Tue Feb 28 20:33:06 UTC 2023
