## Notebook for plotting gene-specific eQTL results

In [None]:
# run the shell `date` command so the notebook output records when it was executed
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# parameters
# cohort label; used as a prefix when building genotype and tensorQTL file names
cohort = 'foundin'
# gene to plot; matched against the annotation's gene_name column and the
# eQTL results' phenotype_id column
gene = 'KANSL1-AS1'
# cell types to load; one cis-eQTL result file is read per entry
cell_types = ['iDA', 'DA', 'lNP']

In [None]:
# naming
geno_version = 'amppdv1'
# prefix of the per-chromosome genotype (.bim) file names
cohort_version = f'{cohort}.{geno_version}'

# directories
# NOTE(review): absolute path tied to one environment; change home_dir when running elsewhere
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/sceqtl'
quants_dir = f'{wrk_dir}/quants'
genos_dir = f'{wrk_dir}/genotypes'
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
gwas_dir = f'{wrk_dir}/public'

# input files
gencode_pkl = f'{quants_dir}/gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
meta5_stats_file = f'{gwas_dir}/pdmeta_sumstats_hg38.h5'

# output files
# (none defined)

# constant values
# p-value cutoff used when subsetting the merged eQTL/GWAS results
alpha_value = 0.05
# label that forms part of the tensorQTL result file names
day = 'da65'

#### analysis functions

In [None]:
def mixed_model(formula, df, group_name):
    """Fit a mixed linear model from a formula and return the fitted result.

    formula: model formula string handed to MixedLM.from_formula
    df: table holding the variables named in the formula and the grouping column
    group_name: key of the column in df whose values define the groups
    """
    grouping_values = df[group_name]
    return sm.MixedLM.from_formula(formula, df, groups=grouping_values).fit()

#### load the gencode annotations

In [None]:
%%time
# load the annotation table from the pickle file at gencode_pkl
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source
gencode_df = pd.read_pickle(gencode_pkl)
# print (rows, columns) as a quick check on what was loaded
print(gencode_df.shape)

#### load the full gwas summary stats

In [None]:
%%time
# read the GWAS summary statistics table from the HDF5 file at meta5_stats_file
gwas_stats_df = pd.read_hdf(meta5_stats_file)
# print (rows, columns), then preview the first rows of the table
print(gwas_stats_df.shape)
display(gwas_stats_df.head())

#### load the cis-eQTL results by cell-type for the specified gene

In [None]:
%%time

# look up the annotation rows for the requested gene
gene_df = gencode_df.loc[gencode_df['gene_name'] == gene]
if gene_df.empty:
    # fail with a clear message instead of an IndexError from unique()[0] below
    raise ValueError(f'gene {gene} not found in the gencode annotation')
gene_id = gene_df['gene_id'].unique()[0]
chrom = gene_df['seqname'].unique()[0]
gene_name = gene_df['gene_name'].unique()[0]
# gene span: smallest start to largest end over all annotation rows for the gene
gene_start = gene_df['start'].min()
gene_stop = gene_df['end'].max()

# the variant map depends only on the chromosome, so read it once rather than
# once per cell type; the raw string avoids the invalid '\s' escape warning
chrom_bim_df = pd.read_csv(f'{genos_dir}/{cohort_version}.{chrom}.bfile.bim',
                           header=None, sep=r'\s+')
chrom_bim_df.columns = ['chr', 'name', 'cm', 'pos', 'a1', 'a2']

# collect one frame per cell type and concatenate once after the loop
cell_type_frames = []
for cell_type in cell_types:
    # load the chromosome's eQTL results and attach variant positions
    chrom_eqtl_df = pd.read_parquet(f'{tensorqtl_dir}/{cohort}.{day}.{cell_type}.cis_qtl_pairs.{chrom}.parquet')
    chrom_eqtl_df = chrom_eqtl_df.merge(chrom_bim_df, how='inner',
                                        left_on='variant_id', right_on='name')
    # NOTE(review): phenotype_id is compared to the gene name, not gene_id;
    # confirm the tensorQTL phenotypes are keyed by gene name
    this_df = chrom_eqtl_df.loc[chrom_eqtl_df['phenotype_id'] == gene].copy()
    # tag the rows with their cell type so the sets can be told apart after stacking
    this_df['set'] = cell_type
    cell_type_frames.append(this_df)

# stays None when cell_types is empty, matching the previous behaviour
gene_eqtl_df = pd.concat(cell_type_frames) if cell_type_frames else None

#### get gwas stats for region of specified gene

In [None]:
# flag the GWAS rows whose SNP id also appears among the gene's eQTL variants
in_eqtl_region = gwas_stats_df['SNP'].isin(gene_eqtl_df['variant_id'])
# keep those rows (as a new frame) and label them as their own result set
region_gwas_df = gwas_stats_df.loc[in_eqtl_region].assign(set='PDrisk')
print(region_gwas_df.shape)

#### subset the relevant columns and rename them so that the eQTL and GWAS results can be concatenated

In [None]:
# source columns of each table, listed in the same order as the shared names below
eqtl_columns = ['variant_id', 'chr', 'pos', 'slope', 'slope_se', 'pval_nominal','set']
gwas_columns = ['SNP', 'chr', 'position', 'b', 'se', 'p','set']
shared_columns = ['variant', 'chr', 'pos', 'beta', 'se', 'pvalue','set']

# eQTL results: keep the needed columns and give them the shared names
eqtl_renames = dict(zip(eqtl_columns, shared_columns))
gene_eqtl_df = gene_eqtl_df[eqtl_columns].rename(columns=eqtl_renames)
print(gene_eqtl_df.shape)

# GWAS results: same treatment
gwas_renames = dict(zip(gwas_columns, shared_columns))
region_gwas_df = region_gwas_df[gwas_columns].rename(columns=gwas_renames)
print(region_gwas_df.shape)

# stack both tables and derive the plotting statistics
results_df = pd.concat([gene_eqtl_df, region_gwas_df])
results_df['log10_pvalue'] = -np.log10(results_df['pvalue'])
results_df['score'] = results_df['beta'] / results_df['se']
results_df['score_abs'] = results_df['score'].abs()
print(results_df.shape)
display(results_df.head())

#### now do the plotting

In [None]:
# plot a local manhattan (position vs -log10 p-value) for the gene's result sets
def plot_eqtl_manhattan(gene_id, gene_name, gene_chrom, gene_start, gene_stop, 
                        results_df):
    """Scatter -log10(p-value) against position, coloured by result set.

    The gene's extent is drawn as a black bar at the bottom of the plot with
    the gene name written next to it.

    gene_id, gene_name: identifiers; printed, and gene_name is used for labels
    gene_chrom: chromosome label used in the x-axis title
    gene_start, gene_stop: x-range of the gene bar
    results_df: table with 'pos', 'log10_pvalue' and 'set' columns
    """
    print(f'{gene_name} {gene_id}')
    print(f'gene {gene_name} is on {gene_chrom} from {gene_start} to {gene_stop}')
    print(results_df.shape)

    sns.set(style='darkgrid')
    sns.relplot(x='pos', y='log10_pvalue', hue='set',
                alpha=.5, palette="dark", height=12, data=results_df)

    # Series.min() skips NaN; the builtin min() can propagate a NaN p-value,
    # which round() cannot handle. Fall back to 0 when no value is available.
    lowest = results_df['log10_pvalue'].min()
    min_y = 0 if pd.isna(lowest) else round(lowest)

    # draw the gene as a bar at the bottom of the plot, labelled just to its right
    label_offset = 10000
    plt.plot([gene_start, gene_stop], [min_y, min_y], linewidth=5, color='black')
    plt.text(gene_stop + label_offset, min_y, gene_name, fontsize='large', color='black')

    plt.title(f'{gene_name} eQTL', fontsize='large')
    plt.xlabel(f'Bp on chromosome {gene_chrom}')
    plt.ylabel('-log10(p-value)')
    plt.show()

In [None]:
# local manhattan of every result set (each cell type plus PD risk) around the gene
plot_eqtl_manhattan(
    gene_id=gene_id,
    gene_name=gene_name,
    gene_chrom=chrom,
    gene_start=gene_start,
    gene_stop=gene_stop,
    results_df=results_df,
)

In [None]:
# prefix the columns so eQTL and GWAS statistics stay distinguishable after merging.
# the guards make the cell safe to re-run: columns that already carry the
# prefix are not prefixed a second time
if not all(str(col).startswith('eqtl_') for col in gene_eqtl_df.columns):
    gene_eqtl_df = gene_eqtl_df.add_prefix('eqtl_')
if not all(str(col).startswith('gwas_') for col in region_gwas_df.columns):
    region_gwas_df = region_gwas_df.add_prefix('gwas_')
display(gene_eqtl_df.head())
display(region_gwas_df.head())


In [None]:
# pair every eQTL row with the GWAS row for the same variant, then add the
# absolute beta/se scores for both result types
merged_df = (
    gene_eqtl_df
    .merge(region_gwas_df, how='inner',
           left_on='eqtl_variant', right_on='gwas_variant')
    .assign(
        gwas_score_abs=lambda d: (d['gwas_beta'] / d['gwas_se']).abs(),
        eqtl_score_abs=lambda d: (d['eqtl_beta'] / d['eqtl_se']).abs(),
        # keep cell_type/eqtl_set as object dtype (not int) so it is treated as categorical
        eqtl_set=lambda d: d['eqtl_set'].astype(object),
    )
)
print(merged_df.shape)
display(merged_df.head())

In [None]:
# sns.relplot(x='gwas_beta',y='eqtl_beta', hue='eqtl_set', size=np.abs(merged_df['gwas_beta']),
#             alpha=.5, palette="dark", height=12, data=merged_df) 

In [None]:
# scatter plots comparing GWAS and eQTL association strength per variant
def plot_gwas_eqtl(gene_id, gene_name, gene_chrom, gene_start, gene_stop, 
                   this_df):
    """Draw three GWAS-versus-eQTL figures, coloured by 'eqtl_set'.

    1. -log10(GWAS p-value) against -log10(eQTL p-value)
    2. absolute GWAS score against absolute eQTL score
    3. the same as 2 with a linear regression fit per set

    gene_id, gene_chrom, gene_start, gene_stop: only printed for reference
    gene_name: printed and used in the titles and y-axis labels
    this_df: table with 'gwas_pvalue', 'eqtl_pvalue', 'gwas_score_abs',
        'eqtl_score_abs' and 'eqtl_set' columns
    """
    print(f'{gene_name} {gene_id}')
    print(f'gene {gene_name} is on {gene_chrom} from {gene_start} to {gene_stop}')
    print(this_df.shape)

    sns.set(style='darkgrid')

    # figure 1: p-values on the -log10 scale
    sns.relplot(x=-np.log10(this_df['gwas_pvalue']),
                y=-np.log10(this_df['eqtl_pvalue']),
                hue='eqtl_set', alpha=.5, palette="dark",
                height=12, data=this_df)
    plt.title(f'PD risk and {gene_name} eQTL', fontsize='large')
    plt.xlabel('PD GWAS risk -log10(p-value)')
    plt.ylabel(f'{gene_name} eQTL -log10(p-value)')
    plt.show()

    # figure 2: absolute scores
    sns.relplot(x='gwas_score_abs',
                y='eqtl_score_abs',
                hue='eqtl_set', alpha=.5, palette="dark",
                height=12, data=this_df)
    plt.title(f'PD risk and {gene_name} eQTL', fontsize='large')
    plt.xlabel('PD GWAS risk abs(score)')
    plt.ylabel(f'{gene_name} eQTL abs(score)')
    plt.show()

    # figure 3: absolute scores with a per-set linear fit; titled, labelled
    # and shown like the other two figures
    sns.lmplot(x='gwas_score_abs',
               y='eqtl_score_abs', hue='eqtl_set',
               palette="dark", height=12, data=this_df)
    plt.title(f'PD risk and {gene_name} eQTL (linear fit)', fontsize='large')
    plt.xlabel('PD GWAS risk abs(score)')
    plt.ylabel(f'{gene_name} eQTL abs(score)')
    plt.show()

In [None]:
# compare GWAS and eQTL signal over every merged variant
plot_gwas_eqtl(
    gene_id=gene_id,
    gene_name=gene_name,
    gene_chrom=chrom,
    gene_start=gene_start,
    gene_stop=gene_stop,
    this_df=merged_df,
)

In [None]:
# keep variants whose p-value is below alpha_value in either the eQTL or the GWAS results
passes_alpha = ((merged_df['eqtl_pvalue'] < alpha_value) |
                (merged_df['gwas_pvalue'] < alpha_value))
# .copy() makes alpha_df an independent frame, so modifying it later neither
# touches merged_df nor triggers SettingWithCopyWarning
alpha_df = merged_df.loc[passes_alpha].copy()
print(alpha_df.shape)

In [None]:
# same comparison restricted to the variants that passed the alpha filter
plot_gwas_eqtl(
    gene_id=gene_id,
    gene_name=gene_name,
    gene_chrom=chrom,
    gene_start=gene_start,
    gene_stop=gene_stop,
    this_df=alpha_df,
)

In [None]:
# mixed model of GWAS |score| on eQTL |score|, grouped by eqtl_set
this_formula = 'gwas_score_abs ~ eqtl_score_abs'
grouping = 'eqtl_set'
# replace missing eQTL scores with 0 by reassignment: a chained
# `df[col].fillna(..., inplace=True)` may act on a temporary copy and leave
# alpha_df unchanged
alpha_df = alpha_df.assign(eqtl_score_abs=alpha_df['eqtl_score_abs'].fillna(0))
result = mixed_model(this_formula, alpha_df, grouping)
print(result.summary())
# one-line summary for the eQTL score term; term_cnt is the number of fitted parameters
term = 'eqtl_score_abs'
print(['feature', 'coef', 'stderr', 'term_cnt', 'p-value'])
print(gene, result.params[term], result.bse[term],
      result.params.shape[0], result.pvalues[term])


In [None]:
# ordinary least squares of GWAS |score| on eQTL |score|, ignoring eqtl_set
this_formula = 'gwas_score_abs ~ eqtl_score_abs'
reg_model = smf.ols(this_formula, data=alpha_df).fit()
print(reg_model.summary())

In [None]:
# same regression with eqtl_set added as an additional term
this_formula = 'gwas_score_abs ~ eqtl_score_abs + eqtl_set'
reg_model = smf.ols(this_formula, data=alpha_df).fit()
print(reg_model.summary())

In [None]:
# regression with eqtl_set and its interaction with the eQTL |score|
# NOTE(review): in the formula syntax `a * b` already expands to `a + b + a:b`,
# so the explicit main-effect terms are redundant (harmless)
this_formula = 'gwas_score_abs ~ eqtl_score_abs + eqtl_set + eqtl_score_abs * eqtl_set'
reg_model = smf.ols(this_formula, data=alpha_df).fit()
print(reg_model.summary())