## Notebook to run *risk*-eQTL analysis using [tensorQTL](https://github.com/broadinstitute/tensorqtl)

i.e., every Dx risk index variant is tested against every transcript, so the analysis is not restricted to *cis* or to *trans* pairs.

[Taylor-Weiner, Aguet, et al., Genome Biol. 20:228, 2019.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7)

In [1]:
# print the current date/time via the shell so the run is timestamped in the output
!date

Fri Feb 19 05:23:58 UTC 2021


#### import libraries and set notebook variables

In [2]:
import pandas as pd
import torch
import tensorqtl.tensorqtl as tensorqtl
from tensorqtl.tensorqtl import genotypeio, cis, trans
print('PyTorch {}'.format(torch.__version__))
print('Pandas {}'.format(pd.__version__))

import os
import numpy as np
import statsmodels.stats.multitest as smm

import warnings
warnings.filterwarnings('ignore')

# import random
# import threading
# import dask.dataframe as dd

PyTorch 1.7.1
Pandas 1.2.2


In [3]:
# parameters
# cohort: used to build the working directory and the input/output file name prefixes
cohort = 'foundin'
# amp_abbr: not referenced anywhere else in the visible notebook
amp_abbr = 'PP'
# version: combined with cohort to name the genotype bfile prefix
version = 'amppdv1'
# day: selects covariate rows (covs_df['day'] == day) and names the expression/result files
day = 'da65'

In [4]:
# naming
cohort_version = f'{cohort}.{version}'
cohort_build = f'{cohort}.{day}'

# directories (all rooted at the cohort's eQTL working directory)
wrk_dir = f'/home/jupyter/{cohort}/eqtl'
geno_dir = f'{wrk_dir}/genotypes'
expr_dir = f'{wrk_dir}/expression'
info_dir = f'{wrk_dir}/sample_info'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'

# input files
expr_bed_file = f'{expr_dir}/{cohort_build}.norm.adj.bed.gz'
bfile_prefix_path = f'{geno_dir}/{cohort_version}.risk.bfile'
# derived from `cohort` like every other path (was hard-coded to 'foundin')
assay_covs_files = f'{info_dir}/{cohort}_rnab_sample_info.csv'

# output files
# all variant-phenotype pairs from the main-effect run
risk_results_file = f'{tensorqtl_dir}/{cohort_build}.risk_qtl_pairs.parquet'
# all pairs from the per-cell-type interaction runs
risk_cs_results_file = f'{tensorqtl_dir}/{cohort_build}.cs.risk_qtl_pairs.parquet'
# FDR-significant interaction pairs
sig_risk_cs_results_file = f'{results_dir}/{cohort_build}.cs.risk_qtl_pairs.csv'

# constant values
# BH FDR cutoff used to select significant pairs
alpha_value = 0.05
# nominal p-value cutoff used when counting interaction hits
min_nominal_alpha = 1e-05

# covariate columns used, one at a time, as the interaction term
cell_types_list = ['DopaminergicNeurons', 'EarlyneuronProgenitor',
                   'ImmatureDopaminergicNeurons', 'ProliferatingFloorPlateProgenitors',
                   'LateneuronProgenitor', 'Ependymal-likeCells',
                   'Neuroepithelial-likeCells']
# covariate rows (matched on the covariate table index) that are dropped before analysis
repeated_samples_to_exclude = ['RNAB_PPMI3966B1_2813_da65_v1', 'RNAB_PPMI3966B2_2813_da65_v2',
                               'RNAB_PPMI3966B5_2813_da65_v1']

In [5]:
# make sure both output locations exist before anything gets written to them
for out_dir in (tensorqtl_dir, results_dir):
    os.makedirs(out_dir, exist_ok=True)

#### utility functions

In [6]:
def compute_fdr(pvalues):
    """Return Benjamini-Hochberg adjusted p-values for ``pvalues``.

    Thin wrapper around ``smm.fdrcorrection``: that call returns a pair, and
    only its second element (the corrected p-values) is handed back; the
    first element is discarded.
    """
    _, adjusted = smm.fdrcorrection(pvalues)
    return adjusted

#### load phenotypes and covariates (if needed)

In [None]:
%%time

phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expr_bed_file)
covs_df = pd.read_csv(assay_covs_files, index_col=0)
print(f'phenotype_df {phenotype_df.shape}')
print(f'phenotype_pos_df {phenotype_df.shape}')
print(f'covariates_df {phenotype_df.shape}')
# display(phenotype_df.head())
# display(phenotype_pos_df.head())
# display(covs_df.head())

#### load plink bfiles

In [8]:
%%time

# alternative kept for reference: restrict loading to the expression samples up front
# pr = genotypeio.PlinkReader(bfile_prefix_path, select_samples=phenotype_df.columns)
# open the PLINK file set at bfile_prefix_path without sample selection
pr = genotypeio.PlinkReader(bfile_prefix_path)
# genotype table; later cells treat its index as variant ids and its columns as sample ids
genotype_df = pr.load_genotypes()
# per-variant chromosome and position, indexed by the bim 'snp' column
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]

Mapping files: 100%|██████████| 3/3 [00:00<00:00, 160.51it/s]

CPU times: user 44.8 ms, sys: 1.12 ms, total: 45.9 ms
Wall time: 57.9 ms





In [None]:
# quick size check of the genotype table and its variant annotation
print(genotype_df.shape)
# display(genotype_df.head())
print(variant_df.shape)
# display(variant_df.head())

In [10]:
# The expression positions name chromosomes with a 'chr' prefix (e.g. 'chr1'),
# whereas the bim file does not, so add the prefix to the variant table.
# Only prefix values that lack it, so that re-running this cell does not
# produce 'chrchr1'.
chrom_labels = variant_df['chrom'].astype(str)
variant_df['chrom'] = chrom_labels.where(chrom_labels.str.startswith('chr'),
                                         'chr' + chrom_labels)
print(variant_df.shape)
display(variant_df.head())

(104, 2)


Unnamed: 0_level_0,chrom,pos
snp,Unnamed: 1_level_1,Unnamed: 2_level_1
rs76763715,chr1,155235843
rs6658353,chr1,161499264
rs11578699,chr1,171750629
rs6676110,chr1,205693849
rs823118,chr1,205754444


#### make sure the pheno and genos have same samples

In [11]:
# compare the sample ids (column labels) of the genotype and expression tables
assay_intersect_samples = set(genotype_df.columns) & set(phenotype_df.columns)
print(f'intersect {len(assay_intersect_samples)}')
extra_geno_samples = set(genotype_df.columns) - set(phenotype_df.columns)
print(f'number of genotypes samples not in expression {len(extra_geno_samples)}')
extra_expr_samples = set(phenotype_df.columns) - set(genotype_df.columns)
# report the expression-only count (this line used to print the genotype-only count again)
print(f'number of expression samples not in genotypes {len(extra_expr_samples)}')

intersect 91
number of genotypes samples not in expression 28
number of expression samples not in genotypes 28


#### drop the non-matched samples

In [None]:
# restrict both tables to the samples they have in common
genotype_df = genotype_df.drop(columns=list(extra_geno_samples))
phenotype_df = phenotype_df.drop(columns=list(extra_expr_samples))

# shapes after dropping: genotypes first, then expression
for frame in (genotype_df, phenotype_df):
    print(frame.shape)

#### make sure the phenotypes and genotypes cover the same chromosomes, i.e., keep only the chromosomes present in both tables

In [13]:
# need to ditch any non-autosomal genes
# collect the chromosome labels seen on each side once, then compare them
expr_chroms = set(phenotype_pos_df['chr'])
geno_chroms = set(variant_df['chrom'])

assay_intersect_chroms = expr_chroms & geno_chroms
print(f'intersect {len(assay_intersect_chroms)}')

# chromosomes that carry variants but no expression features
extra_geno_chroms = geno_chroms - expr_chroms
print(f'number of genotypes chroms not in expression {len(extra_geno_chroms)}')
print(extra_geno_chroms)

# chromosomes/contigs that carry expression features but no variants
extra_expr_chroms = expr_chroms - geno_chroms
print(f'number of expression chroms not in genotypes {len(extra_expr_chroms)}')
print(extra_expr_chroms)

intersect 21
number of genotypes chroms not in expression 0
set()
number of expression chroms not in genotypes 12
{'KI270713.1', 'GL000195.1', 'chrX', 'chr22', 'KI270733.1', 'KI270711.1', 'KI270728.1', 'GL000219.1', 'KI270734.1', 'chrM', 'KI270727.1', 'chrY'}


In [None]:
# drop variants on chromosomes with no expression features, and keep the
# genotype rows in step with the variants that remain
if extra_geno_chroms:
    variant_on_shared = ~variant_df['chrom'].isin(extra_geno_chroms)
    variant_df = variant_df.loc[variant_on_shared]
    genotype_df = genotype_df.loc[genotype_df.index.isin(variant_df.index)]

# drop features on chromosomes with no variants, and keep the expression
# rows in step with the features that remain
# NOTE(review): every variant is tested against every feature downstream, so
# this excludes features on variant-free chromosomes from testing - confirm intended
if extra_expr_chroms:
    feature_on_shared = ~phenotype_pos_df['chr'].isin(extra_expr_chroms)
    phenotype_pos_df = phenotype_pos_df.loc[feature_on_shared]
    phenotype_df = phenotype_df.loc[phenotype_df.index.isin(phenotype_pos_df.index)]

# shapes in order: genotypes, variants, expression, expression positions
for frame in (genotype_df, variant_df, phenotype_df, phenotype_pos_df):
    print(frame.shape)

#### make sure covariates match geno and pheno samples

In [15]:
# subset covs to just this 'day' (the file holds covariates for all differentiation days)
# and to the samples that are present in the expression table
covs_df = covs_df.loc[(covs_df['day'] == day) & (covs_df['sampleid'].isin(phenotype_df.columns))]
# drop the repeated assay entries, matched on the covariate table index
covs_df = covs_df.loc[~covs_df.index.isin(repeated_samples_to_exclude)]
print(covs_df.shape)

# compare expression sample ids against the covariate 'sampleid' values
cov_intersect_samples = set(phenotype_df.columns) & set(covs_df['sampleid'])
print(f'intersect {len(cov_intersect_samples)}')
extra_expr_samples = set(phenotype_df.columns) - set(covs_df['sampleid'])
print(f'number of expression samples not in covariates {len(extra_expr_samples)}')
extra_cov_samples = set(covs_df['sampleid']) - set(phenotype_df.columns)
# the comparison is against the expression samples, so label it as such
print(f'number of covariate samples not in expression {len(extra_cov_samples)}')

(91, 98)
intersect 91
number of expression samples not in covariates 0
number of covariate samples not in genotypes 0


#### *risk*-QTL: nominal p-values for all risk variant-phenotype pairs

In [16]:
%%time
# run mapping
# tests the genotype rows against the phenotype rows with tensorQTL's trans mapper;
# pval_threshold=1 is presumably chosen so that no pair is dropped by the p-value
# filter - NOTE(review): confirm against the tensorQTL documentation
# maf_threshold=0.01 sets the MAF filter (the run log reports how many variants pass)
trans_df = trans.map_trans(genotype_df, phenotype_df, batch_size=10000, return_r2=True,
                           return_sparse=True, pval_threshold=1, maf_threshold=0.01)
print(trans_df.shape)
display(trans_df.head())

trans-QTL mapping
  * 91 samples
  * 30944 phenotypes
  * 104 variants
    processing batch 1/1
    elapsed time: 0.02 min
  * 104 variants passed MAF >= 0.01 filtering
done.
(3218176, 7)


Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,maf
0,rs76763715,ENSG00000116191.17,0.914136,-0.028271,0.261459,0.000131,0.10989
1,rs76763715,ENSG00000138772.12,0.326212,0.253483,0.256765,0.010832,0.10989
2,rs76763715,lnc_RPL18_3,0.255686,-0.274059,0.23956,0.014492,0.10989
3,rs76763715,lnc_SLC16A4_3,0.976111,0.007849,0.261382,1e-05,0.10989
4,rs76763715,ENSG00000137709.9,0.899067,-0.032036,0.251854,0.000182,0.10989


CPU times: user 6.46 s, sys: 1.53 s, total: 7.99 s
Wall time: 6.59 s


#### compute the FDR

In [17]:
# add the corrected p-value; the correction is applied once across all pairs in trans_df
# missing p-values are set to 1 before correction so they cannot appear significant
trans_df['bh_fdr'] = compute_fdr(trans_df['pval'].fillna(1))

In [18]:
# confirm the new bh_fdr column was added and preview the first rows
print(trans_df.shape)
display(trans_df.head())

(3218176, 8)


Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,maf,bh_fdr
0,rs76763715,ENSG00000116191.17,0.914136,-0.028271,0.261459,0.000131,0.10989,0.999955
1,rs76763715,ENSG00000138772.12,0.326212,0.253483,0.256765,0.010832,0.10989,0.999632
2,rs76763715,lnc_RPL18_3,0.255686,-0.274059,0.23956,0.014492,0.10989,0.998207
3,rs76763715,lnc_SLC16A4_3,0.976111,0.007849,0.261382,1e-05,0.10989,0.999998
4,rs76763715,ENSG00000137709.9,0.899067,-0.032036,0.251854,0.000182,0.10989,0.999955


In [19]:
# summary statistics of the BH-adjusted p-values over all tested pairs
trans_df['bh_fdr'].describe()

count    3.218176e+06
mean     9.955343e-01
std      1.445357e-02
min      1.445746e-05
25%      9.980010e-01
50%      9.999552e-01
75%      9.999552e-01
max      1.000000e+00
Name: bh_fdr, dtype: float64

In [20]:
# add physical positions onto results
# left joins keep every tested pair; variant_df contributes chrom/pos (keyed by 'snp')
trans_df = trans_df.merge(variant_df, how='left', 
                          left_on='variant_id', right_on='snp')
# phenotype_pos_df contributes the feature's chr/tss (keyed by 'phenotype_id')
trans_df = trans_df.merge(phenotype_pos_df, how='left', 
                          left_on='phenotype_id', right_on='phenotype_id')
# flag whether variant and feature sit on the same chromosome ('on') or not ('between')
trans_df['onchrom'] = np.where(trans_df['chrom'] == trans_df['chr'], 
                               'on', 'between')
# signed distance from variant to feature TSS; positive when the TSS has the larger coordinate
trans_df['dist'] = trans_df['tss'] - trans_df['pos']
# a distance is meaningless across chromosomes, so blank it out for those pairs
trans_df.loc[trans_df['onchrom'] == 'between', 'dist'] = np.nan

print(trans_df.shape)
display(trans_df.head())

(3218176, 14)


Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,maf,bh_fdr,chrom,pos,chr,tss,onchrom,dist
0,rs76763715,ENSG00000116191.17,0.914136,-0.028271,0.261459,0.000131,0.10989,0.999955,chr1,155235843,chr1,178725147,on,23489304.0
1,rs76763715,ENSG00000138772.12,0.326212,0.253483,0.256765,0.010832,0.10989,0.999632,chr1,155235843,chr4,78551519,between,
2,rs76763715,lnc_RPL18_3,0.255686,-0.274059,0.23956,0.014492,0.10989,0.998207,chr1,155235843,chr19,48624132,between,
3,rs76763715,lnc_SLC16A4_3,0.976111,0.007849,0.261382,1e-05,0.10989,0.999998,chr1,155235843,chr1,110338737,on,-44897106.0
4,rs76763715,ENSG00000137709.9,0.899067,-0.032036,0.251854,0.000182,0.10989,0.999955,chr1,155235843,chr11,120236640,between,


In [21]:
# keep the pairs whose BH-adjusted p-value is at or below alpha_value
sig_risk_df = trans_df.loc[trans_df['bh_fdr'] <= alpha_value]
print(sig_risk_df.shape)
display(sig_risk_df.head())

(21, 14)


Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,maf,bh_fdr,chrom,pos,chr,tss,onchrom,dist
138444,rs823118,lnc_RHEX_8,4.879842e-08,-0.798306,0.133847,0.285561,0.362637,0.008725,chr1,205754444,chr1,205862522,on,108078.0
417630,rs73038319,lnc_C1QTNF3_AMACR_3,4.492441e-12,-3.231971,0.404354,0.41787,0.021978,1.4e-05,chr3,18320267,chr5,34190204,between,
1103755,rs26431,ENSG00000175749.11,1.05966e-07,0.840778,0.145314,0.273335,0.241758,0.017051,chr5,103030090,chr5,103032376,on,2286.0
1415204,rs199351,KLHL7_DT,6.943582e-09,0.816349,0.127466,0.315474,0.445055,0.00149,chr7,23260430,chr7,23105703,on,-154727.0
2786097,rs142167,lnc_PLEKHM1_3,1.082803e-10,-1.052603,0.143912,0.375429,0.296703,5.8e-05,chr17,46717868,chr17,45520464,on,-1197404.0


In [22]:
# show the selected significant pairs in full rather than just the head
display(sig_risk_df)

Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,r2,maf,bh_fdr,chrom,pos,chr,tss,onchrom,dist
138444,rs823118,lnc_RHEX_8,4.879842e-08,-0.798306,0.133847,0.285561,0.362637,0.008725,chr1,205754444,chr1,205862522,on,108078.0
417630,rs73038319,lnc_C1QTNF3_AMACR_3,4.492441e-12,-3.231971,0.404354,0.41787,0.021978,1.4e-05,chr3,18320267,chr5,34190204,between,
1103755,rs26431,ENSG00000175749.11,1.05966e-07,0.840778,0.145314,0.273335,0.241758,0.017051,chr5,103030090,chr5,103032376,on,2286.0
1415204,rs199351,KLHL7_DT,6.943582e-09,0.816349,0.127466,0.315474,0.445055,0.00149,chr7,23260430,chr7,23105703,on,-154727.0
2786097,rs142167,lnc_PLEKHM1_3,1.082803e-10,-1.052603,0.143912,0.375429,0.296703,5.8e-05,chr17,46717868,chr17,45520464,on,-1197404.0
2796934,rs142167,lnc_PLEKHM1_4,3.646967e-09,-0.957676,0.146265,0.325094,0.296703,0.000978,chr17,46717868,chr17,45516128,on,-1201740.0
2801832,rs142167,lnc_LINC02210_CRHR1_7,1.05668e-09,-1.011352,0.148288,0.343246,0.296703,0.00034,chr17,46717868,chr17,45585004,on,-1132864.0
2807509,rs142167,KANSL1_AS1,1.697743e-07,0.887911,0.15642,0.265811,0.296703,0.026017,chr17,46717868,chr17,46193573,on,-524295.0
2809678,rs142167,ENSG00000238083.7,9.44763e-12,1.094144,0.139643,0.408212,0.296703,1.5e-05,chr17,46717868,chr17,46511511,on,-206357.0
2809708,rs142167,ENSG00000214425.7,8.185168e-11,-1.053283,0.142831,0.379276,0.296703,5.3e-05,chr17,46717868,chr17,45550335,on,-1167533.0


In [23]:
# summary of absolute variant-to-TSS distances among the significant pairs;
# pairs on different chromosomes have dist set to NaN and are left out of the count
sig_risk_df['dist'].abs().describe()

count    2.000000e+01
mean     8.392539e+05
std      4.957181e+05
min      2.286000e+03
25%      2.757890e+05
50%      1.182468e+06
75%      1.235419e+06
max      1.271945e+06
Name: dist, dtype: float64

#### save the results

In [24]:
# write the full annotated table of tested pairs to parquet, without the row index
trans_df.to_parquet(risk_results_file, index=False)

#### run the cell-type fraction interactions

In [None]:

# subset just the cell fraction covariates, indexed by sample id; chaining the
# calls builds a fresh frame instead of mutating a slice of covs_df in place
keep_cols = cell_types_list + ['sampleid']
cf_covs_df = covs_df[keep_cols].set_index('sampleid')
# put the rows in the same order as the expression samples so that each interaction
# value lines up with its sample; raises KeyError if a sample has no covariate row
cf_covs_df = cf_covs_df.loc[phenotype_df.columns]
print(cf_covs_df.shape)
# display(cf_covs_df.head())

# fill missing cell fractions with that cell type's mean over the samples; assigning
# the result back avoids the chained in-place fillna on a column, which is not
# guaranteed to write through to the frame
cf_covs_df = cf_covs_df.fillna(cf_covs_df[cell_types_list].mean())

In [26]:
%%time

# run mapping with interactions for cell types
cells_df = None
for cell_type in cell_types_list:
    print(cell_type)
    trans_df = trans.map_trans(genotype_df, phenotype_df, 
                               covariates_df=pd.DataFrame(index=phenotype_df.columns), 
                               batch_size=10000, return_r2=True,
                               return_sparse=True, pval_threshold=1, maf_threshold=0.01, 
                               interaction_s=cf_covs_df[cell_type])
    
    # add the corrected p-value, note just based on all chrom gene pvalues    
    trans_df['bh_fdr'] = compute_fdr(trans_df['pval_gi'].fillna(1))
    # add physical positions onto results
    trans_df = trans_df.merge(variant_df, how='left', 
                              left_on='variant_id', right_on='snp')
    trans_df = trans_df.merge(phenotype_pos_df, how='left', 
                              left_on='phenotype_id', right_on='phenotype_id')
    trans_df['onchrom'] = np.where(trans_df['chrom'] == trans_df['chr'], 
                                   'on', 'between')
    trans_df['dist'] = trans_df['tss'] - trans_df['pos']
    trans_df.loc[trans_df['onchrom'] == 'between', 'dist'] = np.nan
    trans_df['celltype'] = cell_type
    cells_df = pd.concat([cells_df, trans_df])
    print(cells_df.shape)

DopaminergicNeurons
trans-QTL mapping
  * 91 samples
  * 30944 phenotypes
  * 0 covariates
  * 104 variants
  * including interaction term
    processing batch 1/1
    time elapsed: 0.01 min
(3187232, 14)
EarlyneuronProgenitor
trans-QTL mapping
  * 91 samples
  * 30944 phenotypes
  * 0 covariates
  * 104 variants
  * including interaction term
    processing batch 1/1
    time elapsed: 0.00 min
(6374464, 14)
ImmatureDopaminergicNeurons
trans-QTL mapping
  * 91 samples
  * 30944 phenotypes
  * 0 covariates
  * 104 variants
  * including interaction term
    processing batch 1/1
    time elapsed: 0.00 min
(9561696, 14)
ProliferatingFloorPlateProgenitors
trans-QTL mapping
  * 91 samples
  * 30944 phenotypes
  * 0 covariates
  * 104 variants
  * including interaction term
    processing batch 1/1
    time elapsed: 0.00 min
(12779872, 14)
LateneuronProgenitor
trans-QTL mapping
  * 91 samples
  * 30944 phenotypes
  * 0 covariates
  * 104 variants
  * including interaction term
    processing

In [27]:
# save the results: all pairs from every cell-type interaction run, without the row index
cells_df.to_parquet(risk_cs_results_file, index=False)

In [28]:
# number of distinct features with an interaction p-value at or below the nominal cutoff
nominal_hit = cells_df['pval_gi'] <= min_nominal_alpha
cells_df.loc[nominal_hit, 'phenotype_id'].unique().shape

(6255,)

In [29]:
# keep interaction pairs whose BH-adjusted p-value is at or below alpha_value;
# bh_fdr was computed separately within each cell type's run
# NOTE: this rebinds sig_risk_df, replacing the main-effect hits selected earlier
sig_risk_df = cells_df.loc[cells_df['bh_fdr'] <= alpha_value]
# alternative selection on the nominal interaction p-value instead:
# sig_risk_df = cells_df.loc[cells_df['pval_gi'] <= min_nominal_alpha]
print(sig_risk_df.shape)
display(sig_risk_df.head())

(38823, 14)


Unnamed: 0,variant_id,phenotype_id,pval_g,pval_i,pval_gi,maf,bh_fdr,chrom,pos,chr,tss,onchrom,dist,celltype
1007,rs76763715,lnc_ZNF460_2,0.011791,0.318458,1.4e-05,0.10989,0.015098,chr1,155235843,chr19,57304305,between,,DopaminergicNeurons
2606,rs76763715,lnc_SLC25A51_3,0.005231,0.037849,7e-05,0.10989,0.035248,chr1,155235843,chr9,37872760,between,,DopaminergicNeurons
2699,rs76763715,ENSG00000213642.3,0.001212,0.352339,2e-05,0.10989,0.018825,chr1,155235843,chr7,64570437,between,,DopaminergicNeurons
3411,rs76763715,ENSG00000207185.1,0.000825,0.415416,0.000107,0.10989,0.043576,chr1,155235843,chr11,118593988,between,,DopaminergicNeurons
6328,rs76763715,ENSG00000258643.5,0.004346,0.091108,3.3e-05,0.10989,0.023881,chr1,155235843,chr14,23306835,between,,DopaminergicNeurons


In [30]:
# summary of the nominal interaction p-values among the selected pairs
sig_risk_df['pval_gi'].describe()

count    3.882300e+04
mean     6.916414e-05
std      7.752032e-05
min      4.462828e-18
25%      9.183776e-06
50%      3.778521e-05
75%      1.031995e-04
max      3.019048e-04
Name: pval_gi, dtype: float64

In [31]:
# summary of absolute variant-to-TSS distances among the selected pairs;
# pairs on different chromosomes have dist set to NaN and are left out of the count
sig_risk_df['dist'].abs().describe()

count    2.133000e+03
mean     5.195420e+07
std      4.392311e+07
min      2.070100e+04
25%      1.569256e+07
50%      3.866040e+07
75%      7.946684e+07
max      1.902235e+08
Name: dist, dtype: float64

In [32]:
# number of result rows contributed by each cell type
cells_df['celltype'].value_counts()

ProliferatingFloorPlateProgenitors    3218176
EarlyneuronProgenitor                 3187232
Neuroepithelial-likeCells             3187232
DopaminergicNeurons                   3187232
Ependymal-likeCells                   3187232
LateneuronProgenitor                  3187232
ImmatureDopaminergicNeurons           3187232
Name: celltype, dtype: int64

In [33]:
# number of selected pairs per cell type
sig_risk_df['celltype'].value_counts()

ProliferatingFloorPlateProgenitors    19432
DopaminergicNeurons                    8965
Neuroepithelial-likeCells              5909
EarlyneuronProgenitor                  2418
LateneuronProgenitor                   1622
Ependymal-likeCells                     253
ImmatureDopaminergicNeurons             224
Name: celltype, dtype: int64

#### save the significant results

In [34]:
# write sig_risk_df (as last assigned; the interaction hits when cells run in order)
# to CSV without the row index
sig_risk_df.to_csv(sig_risk_cs_results_file, index=False)