## Notebook to run *cis* QTL analysis between genotype and modalities using [tensorQTL](https://github.com/broadinstitute/tensorqtl)

[Taylor-Weiner, Aguet, et al., Genome Biol. 20:228, 2019.](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7)

In [None]:
!date

#### import libraries

In [None]:
import pandas as pd
import torch
# import tensorqtl
from tensorqtl import read_phenotype_bed
from tensorqtl import genotypeio, cis, trans
print(f'PyTorch {torch.__version__}')
print(f'Pandas {pd.__version__}')

import statsmodels.stats.multitest as smm

import warnings
warnings.filterwarnings('ignore')

#### set notebook variables

In [None]:
# parameters
# Both values are empty placeholders and must be set before the cells below
# run; every file name built later is derived from them.
# NOTE(review): presumably injected by a runner such as papermill (this looks
# like a tagged "parameters" cell) - confirm.
# modality: label of the quantified modality; part of set_name and of the
#   covariates file name
modality = ''
# day: differentiation day label; part of set_name, used to subset the
#   covariates (covs_df.day == day) and as suffix of the excluded replicate IDs
day = ''

In [None]:
# naming
cohort = 'foundin'
# common prefix: used for the quantification input and every output file below
set_name = f'{cohort}_{day}_{modality}'

# directories
# NOTE(review): wrk_dir is an absolute, user-specific path; it has to be
# edited before the notebook can run on another machine or account
wrk_dir = '/home/gibbsr/working/foundin/foundin_qtl'
quants_dir = f'{wrk_dir}/quants'
geno_dir = f'{wrk_dir}/genotypes'
info_dir = f'{wrk_dir}/sample_info'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'

# input files
# phenotype quantifications in bed format, read with read_phenotype_bed
endo_quants_bed_file = f'{quants_dir}/{set_name}.scaled.adj.bed.gz'
# plink bfile prefix, handed to genotypeio.PlinkReader
bfile_prefix_path = f'{geno_dir}/{cohort}.amppdv1.bfile'
# per-sample covariates; the name depends on modality only, the rows are
# subset to `day` in a later cell
covariates_file = f'{info_dir}/{cohort}_{modality}_sample_info.csv'

# output files
# sample IDs shared by genotypes and phenotypes
used_samples_list_file = f'{info_dir}/{set_name}.samples'
# result of cis.map_cis plus the added bh_fdr column
cis_map_file = f'{tensorqtl_dir}/{set_name}.cis.map.csv'
# result of cis.map_independent (only written when there are significant hits)
cis_indep_file = f'{tensorqtl_dir}/{set_name}.cis.indep.csv'

# constant values
# cutoff applied to pval_perm, pval_beta and bh_fdr when counting hits
alpha_value = 0.05
# cutoff applied to pval_nominal when counting hits
min_nominal_alpha = 1e-05
# cis_df column fed into the B&H FDR computation; 'pval_beta' is the alternative
use_for_fdr = 'pval_perm' # pval_beta
# when True the cells additionally display the head of each frame they touch
DEBUG = False
# tensorQTL defaults to 10K permutations, lower to speed up but lose specificity
NPERM = 10000
# tensorQTL defaults to 0
MIN_MAF = 0.05
# covariate columns passed to tensorQTL; 'female' and 'nonDA' are derived in
# later cells, the remaining names must already exist in the covariates file
covs_columns_to_use = ['female', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'nonDA']
# replicate IDs for this day that are removed from the covariates by matching
# against the covariate frame's index
repeated_samples_to_exclude = [f'RNAB_PPMI3966B1v1_{day}', f'RNAB_PPMI3966B1v2_{day}',
                               f'RNAB_PPMI3966B1v3_{day}', f'RNAB_PPMI3966B1v4_{day}',
                               f'RNAB_PPMI3966B1v5_{day}', f'RNAB_PPMI3966B1v6_{day}',
                               f'RNAB_PPMI3966B1v7_{day}', f'RNAB_PPMI3966B1v8_{day}',
                               f'RNAB_PPMI3966B1v9_{day}', f'RNAB_PPMI3966B2v1_{day}',
                               f'RNAB_PPMI3966B2v2_{day}', f'RNAB_PPMI3966B5v1_{day}']

#### utility functions

In [None]:
# Benjamini-Hochberg FDR adjustment for a collection of p-values
def compute_fdr(pvalues):
    """Return the B&H adjusted p-values for `pvalues`.

    Delegates to statsmodels' `fdrcorrection` with its default settings and
    keeps only the second element of its result (the adjusted p-values);
    the first element is discarded.
    """
    _, adjusted_pvalues = smm.fdrcorrection(pvalues)
    return adjusted_pvalues

#### see if torch detects a GPU

In [None]:
# pick the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}\n')

# the extra details are only printed for a CUDA device (device index 0)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # byte counts are converted to GB (1024**3) and rounded to one decimal
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# CUDA version reported by torch
print(torch.version.cuda)

### load input data

#### load endogenous features (phenotypes)

In [None]:
%%time

# read the phenotype bed file into two frames:
#   endogenous_df     - quantified values; its columns are the sample IDs
#                       compared against the genotype samples further down
#   endogenous_pos_df - feature positions; later cells read its 'chr' column
endogenous_df, endogenous_pos_df = read_phenotype_bed(endo_quants_bed_file)
print(f'endogenous: {endogenous_df.shape}')
print(f'endogenous pos: {endogenous_pos_df.shape}')
if DEBUG:
    display(endogenous_df.head())
    display(endogenous_pos_df.head())

#### load exogenous features (plink genotypes bfile)

In [None]:
%%time
# open the plink bfile set and load the genotypes into a DataFrame whose
# columns are the sample IDs compared against the phenotype samples below
pr = genotypeio.PlinkReader(bfile_prefix_path)
exogenous_df = pr.load_genotypes()
# variant positions: bim table indexed by the 'snp' column, keeping only
# chromosome and position
exogenous_pos_df = pr.bim.set_index('snp')[['chrom', 'pos']]

print(f'exogenous: {exogenous_df.shape}')
print(f'exogenous pos: {exogenous_pos_df.shape}')
if DEBUG:
    display(exogenous_df.head())
    display(exogenous_pos_df.head())

In [None]:
# tensorQTL says wants plink bfiles, but wants bim chrs to include 'chr'
# Only prefix values that do not already start with 'chr', so re-running this
# cell (or loading a bim that is already prefixed) cannot produce names such
# as 'chrchr1', which would match none of the phenotype chromosomes.
variant_chroms = exogenous_pos_df['chrom'].astype(str)
exogenous_pos_df['chrom'] = variant_chroms.where(variant_chroms.str.startswith('chr'),
                                                 'chr' + variant_chroms)
print(exogenous_pos_df.shape)
if DEBUG:
    display(exogenous_pos_df.head())

#### load the covariates

In [None]:
# read the per-sample covariates; the first csv column becomes the index
# later cells read the columns sex, DAn, day and sampleid from this frame,
# plus the entries of covs_columns_to_use
covs_df = pd.read_csv(covariates_file, index_col=0)
print(f'covariates shape {covs_df.shape}')
if DEBUG:
    display(covs_df.head())

#### create a binarized covariate for sex

In [None]:
# binary indicator: 1 where sex is recorded as 'Female', 0 for anything else
# (a missing sex therefore also ends up as 0)
covs_df['female'] = (covs_df.sex == 'Female').astype('int64')
# show both tallies so the encoding can be checked by eye
display(covs_df.sex.value_counts())
display(covs_df.female.value_counts())

#### create a combined non-DA neuron fraction as a covariate

If the modality is SCRN, do not include this covariate term (i.e. leave `nonDA` out of `covs_columns_to_use`).

In [None]:
# nonDA is the complement of the DAn column (1 - DAn)
# NOTE(review): this runs for every modality although the markdown note for
# this section says the term should be skipped for SCRN, and 'nonDA' is listed
# in covs_columns_to_use - confirm how SCRN runs are meant to be handled
covs_df['nonDA'] = 1 - covs_df.DAn
display(covs_df.nonDA.describe())
if DEBUG:
    display(covs_df.head())

### make sure the pheno and genos have same samples

In [None]:
# compare the sample IDs carried by the genotype and phenotype columns
exo_samples = set(exogenous_df.columns)
endo_samples = set(endogenous_df.columns)

assay_intersect_samples = exo_samples & endo_samples
print(f'intersect {len(assay_intersect_samples)}')
extra_exo_samples = exo_samples - endo_samples
print(f'number of exogenous samples not in endogenous {len(extra_exo_samples)}')
extra_endo_samples = endo_samples - exo_samples
print(f'number of endogenous samples not in exogenous {len(extra_endo_samples)}')

# save the used sample list
# a set has no defined order, so sort the IDs to write an identical file on
# every run instead of one whose line order depends on set iteration order
pd.DataFrame(data=sorted(assay_intersect_samples)).to_csv(used_samples_list_file,
                                                          index=False, header=False)

In [None]:
# list the actual sample IDs that are present in only one of the two assays
print(extra_exo_samples, extra_endo_samples)

#### drop the non-matched samples

In [None]:
# remove the sample columns that exist in only one of the two assays so that
# genotypes and phenotypes end up with the same set of samples
exogenous_df = exogenous_df.drop(columns=extra_exo_samples)
endogenous_df = endogenous_df.drop(columns=extra_endo_samples)

print(exogenous_df.shape)
print(endogenous_df.shape)
if DEBUG:
    display(exogenous_df.head())
    display(endogenous_df.head())

### need to make sure phenos and genos have matched chromosomes; ie just autosomes

In [None]:
# need to ditch any non-autosomal genes
# collect the chromosome labels seen on each side once, then compare them
endo_chroms = set(endogenous_pos_df['chr'])
exo_chroms = set(exogenous_pos_df['chrom'])

assay_intersect_chroms = endo_chroms.intersection(exo_chroms)
print(f'intersect {len(assay_intersect_chroms)}')

# chromosomes that only the variants have
extra_exo_chroms = exo_chroms.difference(endo_chroms)
print(f'number of exogenous chroms not in endogenous {len(extra_exo_chroms)}')
print(extra_exo_chroms)

# chromosomes that only the phenotype features have
extra_endo_chroms = endo_chroms.difference(exo_chroms)
print(f'number of endogenous chroms not in exogenous {len(extra_endo_chroms)}')
print(extra_endo_chroms)

In [None]:
# make sure the sexomes are removed if they happen to still be present
sexomes = {'chrX', 'chrY'}
extra_exo_chroms = extra_exo_chroms.union(sexomes)
extra_endo_chroms = extra_endo_chroms.union(sexomes)

if extra_exo_chroms:
    keep_variants = ~exogenous_pos_df['chrom'].isin(extra_exo_chroms)
    exogenous_pos_df = exogenous_pos_df.loc[keep_variants]
    # variants were removed from the position table, so keep only the
    # genotype rows whose index is still present there
    exogenous_df = exogenous_df.loc[exogenous_df.index.isin(exogenous_pos_df.index)]
if extra_endo_chroms:
    keep_features = ~endogenous_pos_df['chr'].isin(extra_endo_chroms)
    endogenous_pos_df = endogenous_pos_df.loc[keep_features]
    # features were removed from the position table, so keep only the
    # phenotype rows whose index is still present there
    endogenous_df = endogenous_df.loc[endogenous_df.index.isin(endogenous_pos_df.index)]

print(f'geno shape: {exogenous_df.shape}')
print(f'variant shape: {exogenous_pos_df.shape}')
print(f'pheno shape: {endogenous_df.shape}')
print(f'pheno pos: {endogenous_pos_df.shape}')

if DEBUG:
    display(exogenous_df.head())
    display(exogenous_pos_df.head())
    display(endogenous_df.head())
    display(endogenous_pos_df.head())

### make sure covariates match geno and pheno samples

In [None]:
# subset covs to just this 'day' (the file holds the covariates of all
# differentiation days) and to samples that have phenotype data
covs_df = covs_df.loc[(covs_df.day == day) &
                      (covs_df.sampleid.isin(endogenous_df.columns))]
# drop the repeated (replicate) samples; matched against the frame's index
covs_df = covs_df.loc[~covs_df.index.isin(repeated_samples_to_exclude)]
print(f'covs shape {covs_df.shape}')

cov_intersect_samples = set(endogenous_df.columns) & set(covs_df.sampleid)
print(f'intersect {len(cov_intersect_samples)}')
# a non-zero count here means some phenotype samples have no covariate row;
# re-ordering the covariates by the phenotype columns will then raise KeyError
extra_expr_samples = set(endogenous_df.columns) - set(covs_df.sampleid)
print(f'number of endogenous samples not in covariates {len(extra_expr_samples)}')
# compared against the endogenous (phenotype) samples, so say so in the message
extra_cov_samples = set(covs_df.sampleid) - set(endogenous_df.columns)
print(f'number of covariate samples not in endogenous {len(extra_cov_samples)}')

#### subset covariates to just those desired (`covs_columns_to_use`) and shape for use with tensorQTL

In [None]:
# keep only the modelled covariates, indexed by sample ID; set_index returns a
# new frame, so nothing is modified in place on a slice of covs_df
covs_to_use = covs_df[['sampleid'] + covs_columns_to_use].set_index('sampleid')
# re-order the rows to match the phenotype columns (samples x covariates);
# a phenotype sample without a covariate row raises KeyError here
covs_to_use = covs_to_use.loc[endogenous_df.columns]
# make every covariate a float column (what the former transpose-based
# re-ordering yielded for numeric input); a non-numeric column fails fast here
covs_to_use = covs_to_use.astype('float64')
print(covs_to_use.shape)
if DEBUG:
    display(covs_to_use.head())

#### in rare instances a single sample will be missing a covariate, mean fill for simplicity

In [54]:
# fill missing covariate values with the column mean
for covariate in covs_to_use.columns:
    mean_val = covs_to_use[covariate].mean()
    # two distinct values: treat the column as binary and use an integer fill
    # NOTE(review): int() truncates, so a 0/1 column is always filled with 0
    # regardless of which value is more common - confirm this is intended
    if covs_to_use[covariate].nunique() == 2:
        mean_val = int(mean_val)
    # assign the filled column back; fillna(inplace=True) on a column selected
    # from the frame is chained assignment and, with pandas Copy-on-Write,
    # leaves the frame unchanged (silently, since warnings are suppressed)
    covs_to_use[covariate] = covs_to_use[covariate].fillna(mean_val)

### *cis*-QTL: nominal p-values for all variant-phenotype pairs

In [None]:
%%time
# map all cis-associations (results for each chromosome are written to file)
# all features
# output goes to tensorqtl_dir with set_name as the file prefix; variants are
# filtered with maf_threshold=MIN_MAF and covs_to_use supplies the covariates
# NOTE(review): run_eigenmt / write_top / write_stats are tensorQTL options -
# confirm the installed tensorQTL version accepts all three
cis.map_nominal(exogenous_df, exogenous_pos_df, endogenous_df, endogenous_pos_df, 
                covariates_df=covs_to_use, prefix=f'{set_name}', 
                output_dir=tensorqtl_dir, run_eigenmt=True, write_top=True, 
                write_stats=True, verbose=False, maf_threshold=MIN_MAF)

### *cis*-QTL: empirical p-values for phenotypes

In [None]:
%%time
# all genes
# permutation-based cis mapping with NPERM permutations and the same MAF
# filter and covariates as the nominal pass; the result is kept in cis_df,
# which the FDR step, the saved cis map and map_independent all read
# NOTE(review): no seed is passed, so the permutation p-values may differ
# between runs - check whether the installed tensorQTL offers a seed argument
cis_df = cis.map_cis(exogenous_df, exogenous_pos_df, endogenous_df, endogenous_pos_df, 
                     covariates_df=covs_to_use, verbose=False, nperm=NPERM, 
                     maf_threshold=MIN_MAF)

#### compute the FDR

In [None]:
# add the corrected p-value, computed over the features of all chromosomes
# just using B&H FDR from statsmodels is approx equivalent to Storey qvalue, tested
# the p-value column is selected by use_for_fdr ('pval_perm', or 'pval_beta'
# as the alternative); missing p-values are set to 1 before the correction
cis_df['bh_fdr'] = compute_fdr(cis_df[use_for_fdr].fillna(1))

# tensorQTL uses qvalue, but requires the R packages so use above BH FDR instead to approx
# (the tensorQTL equivalent would be tensorqtl.calculate_qvalues(cis_df, qvalue_lambda=0.85))

In [None]:
# quick look at the size of the cis result (and its head when debugging)
print(f'cis shape: {cis_df.shape}')
if DEBUG:
    display(cis_df.head())

In [None]:
# number of distinct index entries of cis_df passing each cutoff, in order:
# nominal p-value, permutation p-value, beta-approximated p-value
print(cis_df.loc[cis_df['pval_nominal'] <= min_nominal_alpha].index.nunique())
print(cis_df.loc[cis_df['pval_perm'] <= alpha_value].index.nunique())
print(cis_df.loc[cis_df['pval_beta'] <= alpha_value].index.nunique())
# B&H FDR count; sig_cnt decides whether the independent-signal cells run
sig_cnt = cis_df.loc[cis_df['bh_fdr'] <= alpha_value].index.nunique()
print(sig_cnt)
# no 'qval' count: Storey q-values are not computed in this notebook

#### save cis map

In [None]:
%%time
# write the cis result, including the added bh_fdr column, to csv
cis_df.to_csv(cis_map_file)

### map the loci independent signals

In [None]:
# use the B&H fdr instead of Storey qvalue
# only run when the cis pass produced at least one hit at alpha_value
if sig_cnt > 0:
    # pass alpha_value as the FDR cutoff so the threshold used to select the
    # significant phenotypes is the same one that produced sig_cnt, instead of
    # relying on tensorQTL's built-in default
    indep_df = cis.map_independent(exogenous_df, exogenous_pos_df, cis_df,
                                   endogenous_df, endogenous_pos_df,
                                   covariates_df=covs_to_use,
                                   fdr=alpha_value, fdr_col='bh_fdr',
                                   verbose=False, nperm=NPERM,
                                   maf_threshold=MIN_MAF)
else:
    # indep_df stays undefined in this case; the following cells are guarded
    # by the same sig_cnt check
    print('not running map independent as no significant results were present in cis')

In [None]:
# summary of the independent-signal result; skipped when map_independent
# was not run (indep_df does not exist then)
if sig_cnt > 0:
    print(indep_df.shape)
    display(indep_df.head())
    # number of distinct phenotypes in the result
    print(indep_df['phenotype_id'].nunique())
    # number of rows per value of the 'rank' column
    display(indep_df['rank'].value_counts())

#### save the loci independent signals

In [None]:
# write the independent signals to csv; nothing is written when there were
# no significant cis results
if sig_cnt > 0:
    indep_df.to_csv(cis_indep_file)

In [None]:
# show the rows with the smallest and the largest nominal p-value among the
# independent signals (all tied rows are shown)
if sig_cnt > 0:
    display(indep_df.loc[indep_df['pval_nominal'] == indep_df['pval_nominal'].min()])
    display(indep_df.loc[indep_df['pval_nominal'] == indep_df['pval_nominal'].max()])

In [None]:
!date