## Notebook to run *cis* correlation analysis between modalities using [tensorQTL](https://github.com/broadinstitute/tensorqtl)

Here, instead of genotypes, a *cis* regulatory feature is used as the independent variable. tensorQTL uses genotype dosage as the independent variable, so any quantitative value that has a genomic position can be used in its place, but each feature has to be mapped to a single base rather than an interval.

[Taylor-Weiner, Aguet, et al., Genome Biol. 20:228, 2019.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7)

In [None]:
# shell escape: print the current date/time at the start of the run
!date

#### import libraries and set notebook variables

In [None]:
# third-party libraries used throughout the notebook
import pandas as pd
import torch
import tensorqtl
# NOTE(review): genotypeio and trans are not referenced in the cells of this notebook
from tensorqtl import genotypeio, cis, trans
# record the library versions in the notebook output
print(f'PyTorch {torch.__version__}')
print(f'Pandas {pd.__version__}')

# multiple-testing correction (used by compute_fdr)
import statsmodels.stats.multitest as smm

import warnings
# suppress every Python warning for the rest of the session;
# be aware this also hides deprecation and numerical warnings from the libraries above
warnings.filterwarnings('ignore')

In [None]:
# parameters
# NOTE(review): all three are empty strings here; presumably they are overridden
# by a notebook runner (e.g. papermill) before execution — confirm
# day: combined with the cohort name to form cohort_set
day = ''
# exogenous: name of the modality used as the independent variable (in file names)
exogenous = ''
# endogenous: name of the modality used as the dependent variable (in file names)
endogenous = ''

In [None]:
# cohort label and the cohort/day label that prefixes every file name
cohort = 'foundin'
cohort_set = f'{cohort}_{day}'

# project directory layout
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
results_dir = f'{wrk_dir}/results'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
info_dir = f'{wrk_dir}/sample_info'
quants_dir = f'{wrk_dir}/quants'

# analysis constants
DEBUG = False
# number of permutations for the empirical p-values; tensorQTL defaults to 10K,
# lowering it speeds things up but loses specificity
NPERM = 10_000
# which p-value column the B&H FDR is computed from ('pval_perm' or 'pval_beta')
use_for_fdr = 'pval_perm'
# significance thresholds used when counting hits
alpha_value = 0.05
min_nominal_alpha = 1e-5

# inputs: scaled/adjusted quantifications for each modality
exo_quants_bed_file = f'{quants_dir}/{cohort_set}_{exogenous}.scaled.adj.bed.gz'
endo_quants_bed_file = f'{quants_dir}/{cohort_set}_{endogenous}.scaled.adj.bed.gz'

# outputs
cis_map_file = f'{tensorqtl_dir}/{cohort_set}_{endogenous}-{exogenous}.cis.map.csv'
cis_indep_file = f'{results_dir}/{cohort_set}_{endogenous}-{exogenous}.cis.indep.csv'
used_samples_list_file = f'{info_dir}/{cohort_set}_{endogenous}-{exogenous}.samples'

#### utility functions

In [None]:
# Benjamini-Hochberg FDR for a collection of p-values
def compute_fdr(pvalues):
    """Return the B&H FDR-adjusted values for ``pvalues``.

    Delegates to statsmodels ``fdrcorrection`` with its default settings and
    returns only the second element of its result (the corrected p-values);
    the first element is discarded.
    """
    _first, adjusted = smm.fdrcorrection(pvalues)
    return adjusted

#### load endogenous features (phenotypes)

In [None]:
%%time

endogenous_df, endogenous_pos_df = tensorqtl.read_phenotype_bed(endo_quants_bed_file)
print(f'endogenous: {endogenous_df.shape}')
print(f'endogenous pos: {endogenous_pos_df.shape}')
if DEBUG:
    display(endogenous_df.head())
    display(endogenous_pos_df.head())

#### load exogenous features (for QTL this would be genotypes bfile)

In [None]:
%%time

exogenous_df, exogenous_pos_df = tensorqtl.read_phenotype_bed(exo_quants_bed_file)

# tensorqtl expect the genos_df to have 'chrom' & 'pos' columns correct here
exogenous_pos_df.rename(columns={'chr': 'chrom', 'tss': 'pos'}, inplace=True)

# think tensorqtl expects postion sorted otherwise 'cis' window does weird stuff
exogenous_pos_df = exogenous_pos_df.sort_values(by=['chrom', 'pos'])

# now reorder the exogenous dataframe to match the sorted positions
exogenous_df = exogenous_df.reindex(exogenous_pos_df.index)

print(f'exogenous: {exogenous_df.shape}')
print(f'exogenous pos: {exogenous_pos_df.shape}')
if DEBUG:
    display(exogenous_df.head())
    display(exogenous_pos_df.head())

#### make sure the pheno and genos have same samples

In [None]:
# sample IDs are the column labels of each quantification frame
exo_samples = set(exogenous_df.columns)
endo_samples = set(endogenous_df.columns)

# samples shared by both modalities, and the leftovers on each side
assay_intersect_samples = exo_samples & endo_samples
print(f'intersect {len(assay_intersect_samples)}')
extra_exo_samples = exo_samples - endo_samples
print(f'number of exogenous samples not in endogenous {len(extra_exo_samples)}')
extra_endo_samples = endo_samples - exo_samples
print(f'number of endogenous samples not in exogenous {len(extra_endo_samples)}')

# save the used sample list
# sorted first: iterating a set of strings has no stable order and can differ
# between interpreter sessions, which would make the saved file vary run to run
pd.DataFrame(data=sorted(assay_intersect_samples)).to_csv(used_samples_list_file,
                                                          index=False, header=False)

In [None]:
# display the endogenous sample IDs that have no exogenous counterpart
extra_endo_samples

#### drop the non-matched samples

In [None]:
# drop the samples that exist in only one modality
# reassigning with errors='ignore' (rather than inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError for already-dropped columns
exogenous_df = exogenous_df.drop(columns=list(extra_exo_samples), errors='ignore')
endogenous_df = endogenous_df.drop(columns=list(extra_endo_samples), errors='ignore')

# both frames must now describe exactly the same set of samples
assert set(exogenous_df.columns) == set(endogenous_df.columns), \
    'exogenous and endogenous sample sets differ after dropping unmatched samples'

print(exogenous_df.shape)
print(endogenous_df.shape)
if DEBUG:
    display(exogenous_df.head())
    display(endogenous_df.head())

#### make sure the phenos and genos cover the same chromosomes, i.e. just the autosomes

In [None]:
# need to ditch any non-autosomal genes
assay_intersect_chroms = set(endogenous_pos_df['chr']) & set(exogenous_pos_df['chrom']) 
print(f'intersect {len(assay_intersect_chroms)}')
extra_exo_chroms = set(exogenous_pos_df['chrom']) - set(endogenous_pos_df['chr'])
print(f'number of exogenous chroms not in endogenous {len(extra_exo_chroms)}')
print(extra_exo_chroms)
extra_endo_chroms = set(endogenous_pos_df['chr']) - set(exogenous_pos_df['chrom'])
print(f'number of endogenous chroms not in exogenous {len(extra_endo_chroms)}')
print(extra_endo_chroms)

In [None]:
# make sure the sexomes are removed is they happen to still be present
sexomes = set(['chrX', 'chrY'])
extra_exo_chroms = extra_exo_chroms | sexomes
extra_endo_chroms = extra_endo_chroms | sexomes

if len(extra_exo_chroms) > 0:
    exogenous_pos_df = exogenous_pos_df.loc[~exogenous_pos_df['chrom'].isin(extra_exo_chroms)]
    # this will remove variants so need to remove them from genos df as well
    exogenous_df = exogenous_df.loc[exogenous_df.index.isin(exogenous_pos_df.index)]
if len(extra_endo_chroms) > 0:
    endogenous_pos_df = endogenous_pos_df.loc[~endogenous_pos_df['chr'].isin(extra_endo_chroms)]
    # this will remove genes so need to remove them from phenos df as well
    endogenous_df = endogenous_df.loc[endogenous_df.index.isin(endogenous_pos_df.index)]

print(f'geno shape: {exogenous_df.shape}')
print(f'variant shape: {exogenous_pos_df.shape}')
print(f'pheno shape: {endogenous_df.shape}')
print(f'pheno pos: {endogenous_pos_df.shape}')

if DEBUG:
    display(exogenous_df.head())    
    display(exogenous_pos_df.head())
    display(endogenous_df.head())
    display(endogenous_pos_df.head())

#### *cis*-QTL: nominal p-values for all variant-phenotype pairs

In [None]:
%%time
# map all cis-associations (results for each chromosome are written to file)
# all features
cis.map_nominal(exogenous_df, exogenous_pos_df, endogenous_df, endogenous_pos_df, 
                covariates_df=None, prefix=f'{cohort_set}_{endogenous}-{exogenous}', 
                output_dir=tensorqtl_dir, run_eigenmt=True, write_top=True, 
                write_stats=True, verbose=False)

#### *cis*-QTL: empirical p-values for phenotypes

In [None]:
%%time
# all genes
cis_df = cis.map_cis(exogenous_df, exogenous_pos_df, endogenous_df, endogenous_pos_df, 
                     verbose=False, nperm=NPERM)
# don't have to replace the monorphic anymore tensorqtl added flag to silence
# note I commented out the following bit of code in tensorqtl/cis.py to reduce log spill
# logger.write('    * WARNING: excluding {} monomorphic variants'.format(mono_t.sum()))

# commented printing this exception in core.py to reduce non-log spill
# print('WARNING: scipy.optimize.newton failed to converge (running scipy.optimize.minimize)')

#### compute the FDR

In [None]:
# corrected p-values are computed across the features of all chromosomes at once;
# missing p-values are treated as 1 before correction
# (original author's note: B&H FDR from statsmodels is approx equivalent to Storey qvalue, tested)
fdr_input = cis_df[use_for_fdr].fillna(1)
cis_df['bh_fdr'] = compute_fdr(fdr_input)

# tensorQTL uses qvalue, but requires the R packages so use above BH FDR instead to approx
# tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85)

In [None]:
# report the size of the cis result table, with an optional preview
print(f'cis shape: {cis_df.shape}')
if DEBUG:
    cis_preview = cis_df.head()
    display(cis_preview)

In [None]:
# number of distinct index entries passing each threshold, one line per criterion
significance_checks = (
    ('pval_nominal', min_nominal_alpha),
    ('pval_perm', alpha_value),
    ('pval_beta', alpha_value),
    ('bh_fdr', alpha_value),
)
for pvalue_column, threshold in significance_checks:
    passing = cis_df.loc[cis_df[pvalue_column] <= threshold]
    print(passing.index.unique().shape)
# print(cis_df.loc[cis_df['qval'] <= alpha_value].index.unique().shape)

#### save cis map

In [None]:
%%time
# write the full cis result table (including the bh_fdr column) as CSV, index included
cis_df.to_csv(cis_map_file)

In [None]:
# shell escape: print the current date/time at the end of the run
!date