## Notebook to run a meta-analysis between the FOUNDIN-PD day 65 and the Jerber et al. HipSci day 52 (untreated) eQTL results

for modalities FOUNDIN-PD da65 RNAB & SCRN-DA and HipSci D52 pseudobulk and DA (both untreated)

In [None]:
# print the current date/time at the start of the notebook run
!date

#### import libraries

In [None]:
from scipy.stats import norm
from pandas import read_csv, read_pickle, read_parquet, DataFrame
from multiprocessing import Process 
import subprocess
import numpy as np
from dask.dataframe import read_csv as dask_read_csv
import statsmodels.stats.multitest as smm
from seaborn import scatterplot

#### set notebook variables

In [None]:
# naming
project = 'foundin'
meta_set = 'Bulk-meta' # 'DAn-meta' or 'Bulk-meta'
result_set_name = f'foundin_daNA_{meta_set}'

# settings for each supported meta-analysis set:
# (FOUNDIN-PD result set, HipSci result set, FOUNDIN-PD sample count, HipSci sample count)
meta_set_configs = {
    'Bulk-meta': ('foundin_da65_RNAB', 'hipsci_D52_pseudobulk', 91, 193),
    'DAn-meta': ('foundin_da65_SCRN-DA', 'hipsci_D52_DA', 79, 175),
}
# fail here with a clear message rather than with a NameError in a later cell
if meta_set not in meta_set_configs:
    raise ValueError(f'unknown meta_set {meta_set!r}, '
                     f'expected one of {sorted(meta_set_configs)}')
(foundin_set_name, hipsci_set_name,
 foundin_sample_cnt, hipsci_sample_cnt) = meta_set_configs[meta_set]

# directories 
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
public_dir = f'{wrk_dir}/public'
meta_dir = f'{wrk_dir}/meta'
results_dir = f'{wrk_dir}/results'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'

# in files
features_file = f'{public_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'

# out files
results_file = f'{meta_dir}/{result_set_name}_metal_eqtl.parquet'
tops_file = f'{meta_dir}/{result_set_name}_metal_eqtl_top.csv'

# variables
DEBUG = False
autosomes = range(1, 23)
# significance threshold applied to the adjusted p-values
alpha = 0.05

if DEBUG:
    print(f'features_file = {features_file}')
    print(f'results_file = {results_file}')
    print(f'tops_file = {tops_file}')

#### functions

In [None]:
def run_bash_cmd(cmd: str, verbose: bool=False):
    """Run a command line through the shell and report failures.

    Args:
        cmd: full command line; it is handed to the shell as-is, so it may
            contain shell syntax such as output redirection.
        verbose: when True, print the command before running it.

    Returns:
        The exit status of the command (0 on success).
    """
    if verbose:
        print(cmd)
    # shell=True is intentional: commands built in this notebook use '>' to
    # redirect output; only pass command strings assembled from trusted values
    completed = subprocess.run(cmd, shell=True)
    if completed.returncode != 0:
        # do not raise, just make the failure visible instead of silently ignoring it
        print(f'command failed with exit status {completed.returncode}: {cmd}')
    return completed.returncode
    
# Benjamini-Hochberg FDR adjustment for a collection of p-values
def compute_fdr(pvalues):
    """Return the FDR adjusted p-values for `pvalues`.

    statsmodels' `fdrcorrection` returns a pair; only its second element,
    the adjusted p-values, is handed back to the caller.
    """
    _, adjusted = smm.fdrcorrection(pvalues)
    return adjusted

def read_metal_results(files_path: str, verbose: bool=False) -> DataFrame:
    """Load tab-separated METAL result file(s) into one pandas DataFrame.

    Args:
        files_path: path (or glob pattern) passed straight to dask's read_csv.
        verbose: when True, display the first rows of the loaded table.

    Returns:
        The results as an in-memory pandas DataFrame.
    """
    # read with dask, then materialize right away as a pandas data frame
    lazy_results = dask_read_csv(files_path, sep='\t', dtype={'HetPVal': 'float32'})
    results = lazy_results.compute()
    print(f'meta results shape {results.shape}')
    if verbose:
        display(results.head())
    return results

### load data, format data, and save new input files

#### load feature annotations

In [None]:
features_df = read_pickle(features_file)
# annotation columns that are not needed here, selected by column-name prefix
unused_prefixes = ('ont', 'tag', 'havana_', 'gene_alias', 'transcript_alias')
discard_cols = [col for col in features_df.columns if col.startswith(unused_prefixes)]
features_df = features_df.drop(columns=discard_cols)
# keep only the 'gene' rows, then free up the 'feature' column name so that
# it can hold the gene ID instead
features_df = (features_df.loc[features_df.feature == 'gene']
               .drop(columns=['feature'])
               .rename(columns={'seqname': 'chrom', 'gene_id': 'feature'}))
# alternative kept from the original notebook for 'SCRN': use the gene name as feature
# features_df.rename(columns={'seqname': 'chrom', 'gene_name': 'feature'}, inplace=True)
print(f'features shape {features_df.shape}')
if DEBUG:
    display(features_df.head())

#### make feature dictionary to convert FOUNDIN-PD Ensembl IDs to gene names

In [None]:
# feature ID -> gene name lookup, only needed for the bulk meta-analysis set
if meta_set == 'Bulk-meta':
    # select the single column first so only that mapping is built,
    # instead of converting every annotation column to a dictionary
    features_dict = features_df.set_index('feature')['gene_name'].to_dict()
else:
    features_dict = None

In [None]:
# example gene/variant pair that the DEBUG displays use as a spot check
test_chrom, test_gene, test_geneid = 'chr8', 'BIN3', 'ENSG00000147439'
test_variant = 'rs2280104'
# marker name format used throughout: '<gene>:<variant>'
test_qtl_pair = f'{test_gene}:{test_variant}'

#### FOUNDIN-PD modality results

In [None]:
def prep_foundin_by_chrom(chromosome: str, name: str, variants: DataFrame, features: dict):
    """Write one chromosome of FOUNDIN-PD cis-QTL results as a METAL input file.

    Reads `{tensorqtl_dir}/{name}.cis_qtl_pairs.chr{chromosome}.parquet`, adds
    the allele columns from `variants`, builds the `qtl_pair` marker name and
    writes `{meta_dir}/{name}.{chromosome}.metal_in.csv.gz`.

    Args:
        chromosome: chromosome label without the 'chr' prefix (this notebook
            passes the integers from `autosomes`).
        name: result set name used in the input and output file names.
        variants: variant table with a `variant` column plus the
            `effect_allele` and `other_allele` columns.
        features: feature ID to gene name mapping, only used when the global
            `meta_set` is 'Bulk-meta'.
    """
    in_path = f'{tensorqtl_dir}/{name}.cis_qtl_pairs.chr{chromosome}.parquet'
    pairs_df = read_parquet(in_path)
    if DEBUG:
        print(f'read {pairs_df.shape}')

    # bring in the allele info, the joined-on 'variant' column just repeats variant_id
    pairs_df = (pairs_df.merge(variants, how='left', left_on='variant_id', right_on='variant')
                .drop(columns=['variant']))
    if DEBUG:
        print(f'merged with variant info {pairs_df.shape}')

    # columns written out for METAL
    keep_cols = ['qtl_pair', 'effect_allele', 'other_allele', 'slope', 'slope_se', 
                 'pval_nominal', 'phenotype_id', 'variant_id']
    if meta_set == 'Bulk-meta':
        # translate the feature ID to a gene name, unmapped IDs are kept as they are
        pairs_df['feature_name'] = pairs_df.phenotype_id.map(features).fillna(pairs_df.phenotype_id)
        marker_trait = pairs_df.feature_name
        keep_cols.append('feature_name')
    else:
        marker_trait = pairs_df.phenotype_id
    # marker name for a QTL is the tested pair: trait name, ':' and variant ID
    pairs_df['qtl_pair'] = marker_trait + ':' + pairs_df.variant_id
    if DEBUG:
        print(f'added qtl pair name {pairs_df.shape}')
        if f'chr{chromosome}' == test_chrom:
            # check the test pair
            display(pairs_df.loc[pairs_df.qtl_pair == test_qtl_pair])

    # one row per marker (the merge can introduce duplicates), saved gzipped
    metal_df = pairs_df[keep_cols].drop_duplicates(subset=['qtl_pair'], keep='first')
    out_path = f'{meta_dir}/{name}.{chromosome}.metal_in.csv.gz'
    metal_df.to_csv(out_path, index=False, compression='gzip')

In [None]:
%%time
# load the variant info file
foundin_var_file = f'{wrk_dir}/genotypes/foundin.amppdv1.bfile.bim'
# PLINK .bim files have no header line, so name the columns explicitly;
# otherwise the first variant in the file is consumed as the header row
vars_df = read_csv(foundin_var_file, sep=r'\s+', header=None,
                   names=['chrom', 'variant', 'cM', 'bp', 'effect_allele', 'other_allele'])
print(f'variant info {vars_df.shape}')
if DEBUG:
    display(vars_df.head())

cmds = {}
# process data by chromosome, one worker process per chromosome
for chrom in autosomes:
    print(f'chr{chrom}', end='.')
    p = Process(target=prep_foundin_by_chrom,args=(chrom, foundin_set_name, vars_df, features_dict))
    p.start()
    # Append process and key to keep track
    cmds[chrom] = p    
# Wait for all processes to finish
for key, p in cmds.items():
    p.join()
# a worker that raised exits with a non-zero code; report it here so a missing
# METAL input file does not go unnoticed
failed_chroms = [key for key, p in cmds.items() if p.exitcode != 0]
if failed_chroms:
    print(f'\nFOUNDIN-PD prep failed for chromosomes: {failed_chroms}')

#### HipSci day 52 modality results

##### make feature dictionary to convert HipSci unversioned Ensembl IDs to gene names

In [None]:
# split the versioned feature ID at the first '.' into the unversioned ID and
# the remainder; limiting to one split and forcing two columns keeps the
# column naming below valid whatever number of '.' the IDs contain
feature_version = features_df.feature.str.split('.', n=1, expand=True)
feature_version = feature_version.reindex(columns=[0, 1])
feature_version.columns = ['ensembl_id', 'ensembl_version']
features_df['ensembl_id'] = feature_version.ensembl_id
if DEBUG:
    display(feature_version.head())
    display(features_df.head())

In [None]:
# unversioned Ensembl ID -> gene name lookup; select the single column first so
# only that mapping is built rather than a dictionary of every column
# NOTE: this rebinds features_dict, replacing the versioned-ID lookup built earlier
features_dict = features_df.set_index('ensembl_id')['gene_name'].to_dict()

In [None]:
%%time
qtl_file = f'{public_dir}/jerber_da_neuron_eqtl/{hipsci_set_name}.untreated.qtl_results_all.hg38.parquet'

# load the HipSci results, naming the variant column 'variant_id'
hipsci_df = read_parquet(qtl_file).rename(columns={'id': 'variant_id'})
if DEBUG:
    print(f'read {hipsci_df.shape}')

# translate the feature ID to a gene name, unmapped IDs are kept as they are
hipsci_df['feature_name'] = hipsci_df.feature_id.map(features_dict).fillna(hipsci_df.feature_id)
if DEBUG:
    print(f'updated shape {hipsci_df.shape}')

# marker name for a QTL is the tested pair: gene name, ':' and variant ID
hipsci_df['qtl_pair'] = hipsci_df.feature_name + ':' + hipsci_df.variant_id
if DEBUG:
    print(f'added qtl pair name {hipsci_df.shape}')
    # check the test pair
    display(hipsci_df.loc[hipsci_df.qtl_pair == test_qtl_pair])

# write the METAL input, one gzipped csv per chromosome
out_columns = ['qtl_pair', 'effect_allele', 'other_allele', 'beta', 'beta_se', 
               'p_value', 'maf', 'hwe_p', 'n_samples', 'feature_id', 
               'feature_name', 'variant_id']
for chrom in autosomes:
    print(f'chr{chrom}', end='.')
    on_chrom = hipsci_df.chrom == f'chr{chrom}'
    # keep a single row per marker, duplicates usually come from merges
    chrom_df = hipsci_df.loc[on_chrom, out_columns].drop_duplicates(subset=['qtl_pair'], keep='first')
    out_file = f'{meta_dir}/{hipsci_set_name}.{chrom}.metal_in.csv.gz'
    chrom_df.to_csv(out_file, index=False, compression='gzip')

### format the METAL script file

In [None]:
scheme_type = 'SAMPLESIZE' # or STDERR
# write one METAL control script per chromosome
for chrom in autosomes:
    control_file = f'{meta_dir}/{meta_set}_eqtl.metal_cntrl.{chrom}.txt'
    foundin_file_path = f'{meta_dir}/{foundin_set_name}.{chrom}.metal_in.csv.gz'
    hipsci_file_path = f'{meta_dir}/{hipsci_set_name}.{chrom}.metal_in.csv.gz'
    meta_out_file = f'{meta_dir}/{result_set_name}_metal_eqtl.{chrom}.txt'

    # per study: name, effect column, std error column, p-value column,
    # sample count used as the default weight, and input file
    studies = [
        (foundin_set_name, 'slope', 'slope_se', 'pval_nominal',
         foundin_sample_cnt, foundin_file_path),
        (hipsci_set_name, 'beta', 'beta_se', 'p_value',
         hipsci_sample_cnt, hipsci_file_path),
    ]

    # header
    script_lines = [
        '#THIS SCRIPT EXECUTES AN ANALYSIS OF MULTIPLE QTL STUDIES',
        f'SCHEME {scheme_type}',
        '',
    ]
    # one describe-and-process section per study
    for study, effect_col, stderr_col, pvalue_col, sample_cnt, in_path in studies:
        script_lines += [
            f'# Describe and process the {study} input files',
            'SEPARATOR COMMA',
            'MARKER qtl_pair',
            'ALLELE effect_allele other_allele',
            f'EFFECT {effect_col}',
            f'STDERR {stderr_col}',
            f'PVALUE {pvalue_col}',
            'WEIGHTLABEL DONTUSECOLUMN',
            f'DEFAULTWEIGHT {sample_cnt}',
            'VERBOSE OFF',
            f'PROCESS {in_path}',
            '',
        ]
    # footer, the heterogeneity option is left commented out in the script
    script_lines += [
        f'OUTFILE {meta_out_file} .tbl',
        '# for now turning het analysis off something on chrom 1 keeps failing',
        'ANALYZE # HETEROGENEITY',
        '',
        'QUIT',
    ]

    control_script = '\n'.join(script_lines)
    with open(control_file, 'w') as out_file:
        out_file.write(control_script)

### run metal per chromosome

In [None]:
%%time
# launch one METAL run per chromosome, each in its own worker process
cmds = {}
for chrom in autosomes:
    control_file = f'{meta_dir}/{meta_set}_eqtl.metal_cntrl.{chrom}.txt'
    log_file = f'{meta_dir}/{meta_set}_eqtl.{chrom}.metal.log'
    # METAL's output is redirected into the per chromosome log file
    worker = Process(target=run_bash_cmd,
                     args=(f'metal {control_file} > {log_file} ', DEBUG))
    worker.start()
    cmds[chrom] = worker
# block until every run has finished
for worker in cmds.values():
    worker.join()

### post process the meta results

In [None]:
%%time
# load the per chromosome METAL output tables as one data frame
path = f'{meta_dir}/{result_set_name}_metal_eqtl.*.txt1.tbl'
meta_df = read_metal_results(path, DEBUG)
if DEBUG:
    display(meta_df.Direction.value_counts())    
# with only two studies, keep just the results that have an effect direction
# from both of them
in_both_studies = ['++', '--', '-+', '+-']
meta_df = meta_df[meta_df.Direction.isin(in_both_studies)]
print(f'updated shape {meta_df.shape}')

#### merge on the allele frequencies
Here we just use the HipSci allele frequencies, since they are already loaded with those results. That study is also the larger one, and we are only considering results present in both studies.

In [None]:
%%time
# merge allele frequencies
# reduce the HipSci table to one row per marker before merging so the merge
# cannot multiply the meta-analysis rows
maf_df = hipsci_df[['qtl_pair', 'maf']].drop_duplicates(subset=['qtl_pair'], keep='first')
meta_df = meta_df.merge(maf_df, how='left', left_on='MarkerName', right_on='qtl_pair')
# drop the qtl_pair column, it duplicates MarkerName
meta_df.drop(columns=['qtl_pair'], inplace=True)
# keep a single row per marker in case the meta results themselves repeat one
meta_df = meta_df.drop_duplicates(subset=['MarkerName'], keep='first')
print(f'updated shape {meta_df.shape}')
if DEBUG:
    display(meta_df.sample(10))

In [None]:
# show the meta-analysis result for the spot-check gene/variant pair
meta_df[meta_df.MarkerName == test_qtl_pair]

#### go ahead and split the QTL pair back into trait and variant

In [None]:
# the marker name is '<trait>:<variant>'; split at the first ':' only and
# store the two parts as their own columns
pair_parts = meta_df.MarkerName.str.split(':', n=1, expand=True)
meta_df['trait'] = pair_parts[0]
meta_df['variant'] = pair_parts[1]
print(f'updated shape {meta_df.shape}')
if DEBUG:
    display(meta_df.sample(10))

### compute B&H FDR

In [None]:
# B&H FDR over all results, a missing p-value is treated as 1
filled_pvalues = meta_df['P-value'].fillna(1)
meta_df['bh_fdr'] = compute_fdr(filled_pvalues)
print(f'updated shape {meta_df.shape}')
if DEBUG:
    display(meta_df.sample(10))

In [None]:
# pseudo Bonferroni: scale each p-value (missing treated as 1) by the number
# of distinct traits and cap the result at 1
trait_cnt = meta_df.trait.nunique()
meta_df['pseudo_bonf'] = (meta_df['P-value'].fillna(1) * trait_cnt).clip(upper=1)
print(f'updated shape {meta_df.shape}')
if DEBUG:
    display(meta_df.sample(10))

### save the meta-analysis results

In [None]:
# write the full annotated meta-analysis table, without the data frame index
meta_df.to_parquet(results_file, index=False)

### save the meta-analysis per feature tops

In [None]:
# get the top result per trait; note that no significance filter is applied here
# B&H adjusted values are frequently tied within a trait, so order ties by the
# raw p-value to make the selected top result the strongest association
tops_df = (meta_df.sort_values(['bh_fdr', 'P-value'])
           .drop_duplicates(['trait'], keep='first'))
print(f'shape of just the top hits for each trait QTL {tops_df.shape}')
if DEBUG:
    display(tops_df.head())
    display(tops_df.Direction.value_counts())
# save each traits top result with FDR included
tops_df.to_csv(tops_file, index=False)

### subset results

In [None]:
# significant by B&H FDR, restricted to results where both studies agree on
# the direction of effect
same_direction = meta_df.Direction.isin(['++', '--'])
passes_fdr = meta_df.bh_fdr < alpha
sig_meta_df = meta_df[passes_fdr & same_direction].copy()
print(f'significant results shape {sig_meta_df.shape}')
# number of traits with at least one significant result
sig_trait_cnt = sig_meta_df.trait.nunique()
print(f'{sig_trait_cnt} traits with significant QTL')
# number of traits tested overall
print(f'{meta_df.trait.nunique()} total traits tested for QTL')

In [None]:
# significant by the approximate Bonferroni value, restricted to results where
# both studies agree on the direction of effect
same_direction = meta_df.Direction.isin(['++', '--'])
passes_bonf = meta_df.pseudo_bonf < alpha
sig_meta_df = meta_df[passes_bonf & same_direction].copy()
print(f'significant results shape {sig_meta_df.shape}')
# number of traits with at least one significant result
sig_trait_cnt = sig_meta_df.trait.nunique()
print(f'{sig_trait_cnt} traits with significant QTL')
# number of traits tested overall
print(f'{meta_df.trait.nunique()} total traits tested for QTL')

In [None]:
# print the current date/time at the end of the notebook run
!date