## Notebook to run *cis*-eQTL analysis using [tensorQTL](https://github.com/broadinstitute/tensorqtl)

[Taylor-Weiner, Aguet, et al., Genome Biol. 20:228, 2019.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7)

In [None]:
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import torch
import tensorqtl.tensorqtl as tensorqtl
from tensorqtl.tensorqtl import genotypeio, cis, trans
print('PyTorch {}'.format(torch.__version__))
print('Pandas {}'.format(pd.__version__))

import os
import statsmodels.stats.multitest as smm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# parameters
# run-level settings; every directory, file name and sample filter below is derived from these
# cohort: used in the working directory path and in the file-name prefixes
cohort = 'foundin'
# version: combined with cohort to form the genotype bfile prefix
version = 'amppdv1'
# day: differentiation day; selects the expression file, filters the covariate rows,
# and is embedded in the repeated-sample ids that get excluded
day = 'da65'

In [None]:
# naming
# cohort_version prefixes the genotype bfiles; cohort_build prefixes the per-day
# expression input and every output written by this notebook
cohort_version = f'{cohort}.{version}'
cohort_build = f'{cohort}.{day}'

# directories
# NOTE(review): wrk_dir is an absolute, environment-specific root; adjust when running elsewhere
wrk_dir = f'/home/jupyter/{cohort}/eqtl'
geno_dir = f'{wrk_dir}/genotypes'
expr_dir = f'{wrk_dir}/expression'
info_dir = f'{wrk_dir}/sample_info'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
results_dir = f'{wrk_dir}/results'

# input files
expr_bed_file = f'{expr_dir}/{cohort_build}.norm.adj.bed.gz'
bfile_prefix_path = f'{geno_dir}/{cohort_version}.bfile'
# derive the sample-info file name from the cohort parameter, like every other path here,
# instead of hard-coding the cohort name (identical path for cohort == 'foundin')
assay_covs_files = f'{info_dir}/{cohort}_rnab_sample_info.csv'

# output files
used_samples_list_file = f'{info_dir}/{cohort_build}.eqtl.samples'
cis_indep_file = f'{results_dir}/{cohort_build}.cis.indep.csv'
cis_map_file = f'{tensorqtl_dir}/{cohort_build}.cis.map.csv'

# constant values
# alpha_value: threshold applied to the permutation, beta and BH-FDR p-values when counting hits
alpha_value = 0.05
# min_nominal_alpha: stricter threshold applied to the nominal p-values when counting hits
min_nominal_alpha = 1e-05
# covariate columns (cell-type fractions) kept when shaping covariates for tensorQTL
cell_types_list = ['DopaminergicNeurons', 'EarlyneuronProgenitor',
                   'ImmatureDopaminergicNeurons', 'ProliferatingFloorPlateProgenitors',
                   'LateneuronProgenitor', 'Ependymal-likeCells',
                   'Neuroepithelial-likeCells']
# covariate rows (matched against the covariate table index) for repeated samples to drop
repeated_samples_to_exclude = [f'RNAB_PPMI3966B1_2813_{day}_v1', f'RNAB_PPMI3966B2_2813_{day}_v2',
                               f'RNAB_PPMI3966B5_2813_{day}_v1', f'RNAB_PPMI3966B2_2813_{day}_v1',
                               f'RNAB_PPMI3966B1_2813_{day}_v2', f'RNAB_PPMI3966B1_2813_{day}_v3',
                               f'RNAB_PPMI3966B1_2813_{day}_v4', f'RNAB_PPMI3966B1_2813_{day}_v5',
                               f'RNAB_PPMI3966B1_2813_{day}_v6', f'RNAB_PPMI3966B1_2813_{day}_v7',
                               f'RNAB_PPMI3966B1_2813_{day}_v8', f'RNAB_PPMI3966B1_2813_{day}_v9']

In [None]:
# create both output directories up front; nothing happens if they already exist
for out_dir in (tensorqtl_dir, results_dir):
    os.makedirs(out_dir, exist_ok=True)

#### utility functions

In [None]:
def compute_fdr(pvalues):
    """Return the Benjamini-Hochberg FDR-adjusted values for ``pvalues``.

    Calls statsmodels' ``fdrcorrection`` with its default settings and returns
    only the second element of its result (the adjusted p-values); the first
    element is discarded.
    """
    _, adjusted_pvalues = smm.fdrcorrection(pvalues)
    return adjusted_pvalues

#### load phenotypes and covariates (if needed)

In [None]:
%%time

# expression BED -> phenotype matrix plus the matching phenotype position table
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expr_bed_file)
# sample info / covariates table, indexed by its first column
covs_df = pd.read_csv(assay_covs_files, index_col=0)

# report the dimensions of everything just loaded
for label, frame in (('phenotype_df', phenotype_df),
                     ('phenotype_pos_df', phenotype_pos_df),
                     ('covariates_df', covs_df)):
    print(f'{label} {frame.shape}')
# display(phenotype_df.head())
# display(phenotype_pos_df.head())
# display(covs_df.head())

#### load plink bfiles

In [None]:
%%time

# alternative: restrict the reader to the expression samples at load time
# pr = genotypeio.PlinkReader(bfile_prefix_path, select_samples=phenotype_df.columns)
pr = genotypeio.PlinkReader(bfile_prefix_path)
genotype_df = pr.load_genotypes()
# variant annotation keyed by SNP id, keeping only chromosome and position
bim_by_snp = pr.bim.set_index('snp')
variant_df = bim_by_snp[['chrom', 'pos']]

In [None]:
# dimensions of the genotype matrix, then of the variant annotation table
for frame in (genotype_df, variant_df):
    print(frame.shape)
# display(genotype_df.head())
# display(variant_df.head())

In [None]:
# tensorQTL takes plink bfiles, but the bim chromosome names need the 'chr' prefix
# so they can be matched against the 'chr' column of the phenotype positions.
# Only prefix names that do not already start with 'chr', so re-running this cell
# (or loading a bim that is already prefixed) cannot produce 'chrchr1'.
chrom_names = variant_df['chrom'].astype(str)
already_prefixed = chrom_names.str.startswith('chr')
variant_df['chrom'] = chrom_names.where(already_prefixed, 'chr' + chrom_names)
print(variant_df.shape)
# display(variant_df.head())

#### make sure the pheno and genos have same samples

In [None]:
# compare the sample ids (column labels) of the genotype and expression matrices
geno_samples = set(genotype_df.columns)
expr_samples = set(phenotype_df.columns)

assay_intersect_samples = geno_samples & expr_samples
print(f'intersect {len(assay_intersect_samples)}')
extra_geno_samples = geno_samples - expr_samples
print(f'number of genotypes samples not in expression {len(extra_geno_samples)}')
extra_expr_samples = expr_samples - geno_samples
# report the expression-only count (previously the genotype-only count was printed here)
print(f'number of expression samples not in genotypes {len(extra_expr_samples)}')

# save the used sample list; sorted so the file is reproducible between runs
# (a set has no defined order)
pd.DataFrame(data=sorted(assay_intersect_samples)).to_csv(used_samples_list_file,
                                                          index=False, header=False)

#### drop the non-matched samples

In [None]:
# drop samples present in only one of the two matrices.
# Reassign instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError.
genotype_df = genotype_df.drop(columns=list(extra_geno_samples), errors='ignore')
phenotype_df = phenotype_df.drop(columns=list(extra_expr_samples), errors='ignore')

print(genotype_df.shape)
# display(genotype_df.head())
print(phenotype_df.shape)
# display(phenotype_df.head())

#### make sure phenotypes and genotypes cover the same chromosomes, i.e. just the autosomes

In [None]:
# compare chromosome names between the phenotype positions and the variants,
# so anything present on only one side (e.g. non-autosomal genes) can be removed next
expr_chroms = set(phenotype_pos_df['chr'])
geno_chroms = set(variant_df['chrom'])

assay_intersect_chroms = expr_chroms & geno_chroms
print(f'intersect {len(assay_intersect_chroms)}')

extra_geno_chroms = geno_chroms - expr_chroms
print(f'number of genotypes chroms not in expression {len(extra_geno_chroms)}')
print(extra_geno_chroms)

extra_expr_chroms = expr_chroms - geno_chroms
print(f'number of expression chroms not in genotypes {len(extra_expr_chroms)}')
print(extra_expr_chroms)

In [None]:
if extra_geno_chroms:
    # remove variants on genotype-only chromosomes, then keep the genotype rows in sync
    variants_to_keep = ~variant_df['chrom'].isin(extra_geno_chroms)
    variant_df = variant_df.loc[variants_to_keep]
    genotype_df = genotype_df.loc[genotype_df.index.isin(variant_df.index)]
if extra_expr_chroms:
    # remove genes on expression-only chromosomes, then keep the phenotype rows in sync
    genes_to_keep = ~phenotype_pos_df['chr'].isin(extra_expr_chroms)
    phenotype_pos_df = phenotype_pos_df.loc[genes_to_keep]
    phenotype_df = phenotype_df.loc[phenotype_df.index.isin(phenotype_pos_df.index)]

# shapes after filtering: genotypes, variants, phenotypes, phenotype positions
for frame in (genotype_df, variant_df, phenotype_df, phenotype_pos_df):
    print(frame.shape)
# display(genotype_df.head())
# display(variant_df.head())
# display(phenotype_df.head())
# display(phenotype_pos_df.head())

#### make sure covariates match geno and pheno samples

In [None]:
# subset covs to just this 'day' (the file holds covariates for all differentiation days)
# and to samples that have expression data; the cell-fraction columns are selected later
is_this_day = covs_df['day'] == day
has_expression = covs_df['sampleid'].isin(phenotype_df.columns)
covs_df = covs_df.loc[is_this_day & has_expression]
# drop the repeated-sample rows, matched on the covariate table index
covs_df = covs_df.loc[~covs_df.index.isin(repeated_samples_to_exclude)]
print(covs_df.shape)

cov_samples = set(covs_df['sampleid'])
expr_samples = set(phenotype_df.columns)

cov_intersect_samples = expr_samples & cov_samples
print(f'intersect {len(cov_intersect_samples)}')
extra_expr_samples = expr_samples - cov_samples
print(f'number of expression samples not in covariates {len(extra_expr_samples)}')
extra_cov_samples = cov_samples - expr_samples
# this compares against the expression samples, so label it accordingly
# (the message previously said 'genotypes')
print(f'number of covariate samples not in expression {len(extra_cov_samples)}')

#### subset the covariates to just those desired (i.e. cell fractions) and shape them for use with tensorQTL

In [None]:
covs_columns_to_use = ['sampleid'] + cell_types_list
# build a new frame indexed by sample id rather than calling set_index(inplace=True)
# on a slice of covs_df
covs_to_use = covs_df[covs_columns_to_use].set_index('sampleid')

# fail early and name the offending samples, instead of an opaque KeyError on re-ordering
samples_without_covs = phenotype_df.columns.difference(covs_to_use.index)
if len(samples_without_covs) > 0:
    raise KeyError(f'expression samples missing covariates: {list(samples_without_covs)}')

# re-order the covariate rows to match the phenotype sample (column) order;
# selecting rows directly avoids the transpose round trip, which can change column dtypes
covs_to_use = covs_to_use.loc[phenotype_df.columns.tolist()]
print(covs_to_use.shape)
# display(covs_to_use.head())

#### *cis*-QTL: nominal p-values for all variant-phenotype pairs

In [None]:
%%time
# nominal cis mapping for all genes, run without covariates
# (results for each chromosome are written to file)
nominal_options = {'covariates_df': None,
                   'prefix': cohort_build,
                   'output_dir': tensorqtl_dir}
cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
                **nominal_options)
# # if using covariates
# cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df,
#                 covariates_df=covs_to_use, prefix=cohort_build, output_dir=tensorqtl_dir)

# if want single chromosome
# chrom = 'chr18'
# cis.map_nominal(genotype_df, variant_df,
#                 phenotype_df.loc[phenotype_pos_df['chr'] == chrom],
#                 phenotype_pos_df.loc[phenotype_pos_df['chr'] == chrom],
#                 covariates_df=covs_to_use, prefix=cohort_build, output_dir=tensorqtl_dir)

#### *cis*-QTL: empirical p-values for phenotypes

In [None]:
%%time
# empirical cis mapping for all genes, run without covariates
cis_inputs = (genotype_df, variant_df, phenotype_df, phenotype_pos_df)
cis_df = cis.map_cis(*cis_inputs)
# # with covariates
# cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df,
#                      phenotype_pos_df, covariates_df=covs_to_use)

# note: the local tensorqtl/cis.py has this line commented out to reduce log spill
# logger.write('    * WARNING: excluding {} monomorphic variants'.format(mono_t.sum()))

# likewise, printing of this message in core.py was commented out to reduce non-log spill
# print('WARNING: scipy.optimize.newton failed to converge (running scipy.optimize.minimize)')

In [None]:
# optionally reload a previously saved map instead of recomputing it
# cis_df = pd.read_csv(cis_map_file, index_col=0)
cis_preview = cis_df.head()
print(cis_df.shape)
display(cis_preview)

#### compute the FDR

In [None]:
# add the corrected p-value; the correction is computed over the p-values of all
# genes on all chromosomes together
# alternatives that were tried:
# cis_df['bh_fdr'] = compute_fdr(cis_df['pval_perm'].fillna(1))
# cis_df['qval'] = compute_fdr(cis_df['pval_beta'].fillna(1))
# B&H FDR from statsmodels is approximately equivalent to the Storey qvalue (tested)
# missing beta p-values are treated as 1 before correcting
beta_pvalues = cis_df['pval_beta'].fillna(1)
cis_df['bh_fdr'] = compute_fdr(beta_pvalues)

# tensorQTL uses qvalue, which requires the R packages, so the BH FDR above is used to approximate it
# tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85)

In [None]:
# same preview as before, now including the bh_fdr column
cis_preview = cis_df.head()
print(cis_df.shape)
display(cis_preview)

In [None]:
# number of distinct index entries passing each threshold, in the order:
# nominal p-value, permutation p-value, beta p-value, BH FDR
for pvalue_column, threshold in (('pval_nominal', min_nominal_alpha),
                                 ('pval_perm', alpha_value),
                                 ('pval_beta', alpha_value),
                                 ('bh_fdr', alpha_value)):
    passing = cis_df.loc[cis_df[pvalue_column] <= threshold]
    print(passing.index.unique().shape)
# print(cis_df.loc[cis_df['qval'] <= alpha_value].index.unique().shape)

#### save cis map

In [None]:
# write the cis results, including the bh_fdr column, to CSV with the index kept
cis_df.to_csv(cis_map_file)

#### map the independent signals at each locus

In [None]:
# use the B&H fdr column in place of the Storey qvalue column.
# No covariates are used in this run, so pass a column-less frame indexed by sample id.
empty_covs_df = pd.DataFrame(index=phenotype_df.columns)
indep_df = cis.map_independent(genotype_df, variant_df, cis_df,
                               phenotype_df, phenotype_pos_df,
                               covariates_df=empty_covs_df,
                               fdr_col='bh_fdr')
# # with covariates
# indep_df = cis.map_independent(genotype_df, variant_df, cis_df, phenotype_df,
#                                phenotype_pos_df,
#                                covariates_df=covs_to_use,
#                                fdr_col='bh_fdr')

In [None]:
# overall size, a preview, then the number of distinct phenotypes with a signal
indep_preview = indep_df.head()
print(indep_df.shape)
display(indep_preview)
indep_phenotypes = indep_df['phenotype_id'].unique()
print(indep_phenotypes.shape)

In [None]:
# how many rows there are for each value of the 'rank' column
indep_df['rank'].value_counts()

#### save the independent signals

In [None]:
# write the independent-signal results to CSV with the index kept
indep_df.to_csv(cis_indep_file)

In [None]:
# row(s) with the smallest nominal p-value
nominal_pvalues = indep_df['pval_nominal']
indep_df.loc[nominal_pvalues == nominal_pvalues.min()]

In [None]:
# row(s) with the largest nominal p-value
nominal_pvalues = indep_df['pval_nominal']
indep_df.loc[nominal_pvalues == nominal_pvalues.max()]