## Notebook for plotting feature-specific QTL results

In [None]:
# run the shell `date` command so the notebook output records when it was executed
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# parameters
# cohort: name used to build the working directory and the input file names below
cohort = 'foundin'
# feature: identifier matched against the 'GeneID' column of the features file
# to select the single feature whose QTL results are plotted
feature = 'chr1_205849678_205850531'

In [None]:
# naming
# NOTE: a per-day build name (f'{cohort}.{day}') was sketched here but is unused;
# the per-day file names are built where the QTL results are loaded.
geno_version = 'amppdv1'
# prefix of the per-chromosome genotype .bim file names
cohort_version = f'{cohort}.{geno_version}'

# directories
# NOTE(review): hard-coded absolute path; adjust for the environment being used
home_dir = '/home/gibbsr'
wrk_dir = f'{home_dir}/{cohort}/caqtl'
# holds the feature (consensus peak) annotation file
quants_dir = f'{wrk_dir}/quants'
# holds the per-chromosome .bim variant files
genos_dir = f'{wrk_dir}/genotypes'
# holds the per-day, per-chromosome cis-QTL parquet results
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
# holds the GWAS summary statistics
gwas_dir = f'{wrk_dir}/public'

# input files
features_file = f'{quants_dir}/{cohort}_consensus_peaks.saf'
meta5_stats_file = f'{gwas_dir}/pdmeta_sumstats_hg38.h5'

# output files
# (none defined; the figure-saving code in the plotting functions is commented out)

# constant values
# p-value threshold used to keep variants that are nominally significant in
# either the QTL or the GWAS results
alpha_value = 0.05
# day labels; one set of QTL results is loaded per entry
days = ['da0', 'da25', 'da65']

#### analysis functions

In [None]:
def mixed_model(formula, df, group_name):
    """Fit a linear mixed model described by a formula and return the fit.

    Args:
        formula: model formula string passed to ``MixedLM.from_formula``.
        df: DataFrame holding the columns named in the formula and the
            grouping column.
        group_name: name of the column in ``df`` whose values define the groups.

    Returns:
        The results object produced by calling ``fit()`` on the model.
    """
    group_labels = df[group_name]
    return sm.MixedLM.from_formula(formula, df, groups=group_labels).fit()

#### load the consensus peak feature annotations

In [None]:
%%time
# read the tab-separated feature annotation file; later cells use its
# 'GeneID', 'Chr', 'Start' and 'End' columns to locate the feature of interest
features_df = pd.read_csv(features_file, sep='\t')
print(features_df.shape)
display(features_df.head())

#### load the full gwas summary stats

In [None]:
%%time
# read the full GWAS summary statistics table from HDF5; later cells use its
# 'SNP', 'chr', 'position', 'b', 'se' and 'p' columns
gwas_stats_df = pd.read_hdf(meta5_stats_file)
print(gwas_stats_df.shape)
display(gwas_stats_df.head())

#### load the cis-QTL results by day for the specified feature

In [None]:
%%time

feature_df = features_df.loc[features_df['GeneID'] == feature]
#     print(feature_df['seqname'].unique()[0])
feature_id = feature_df['GeneID'].unique()[0]
chrom = feature_df['Chr'].unique()[0]
feature_start = feature_df['Start'].min()
feature_stop = feature_df['End'].max()

feature_qtl_df = None
for day in days:
    # now load the chromosome qtl results and extract specific feature results
    chrom_file = f'{tensorqtl_dir}/{cohort}.{day}.*.cis_qtl_pairs.{chrom}.parquet'
    chrom_qtl_df = dd.read_parquet(chrom_file).compute()
    chrom_bim_df = pd.read_csv(f'{genos_dir}/{cohort_version}.{chrom}.bfile.bim', 
                               header=None, sep='\s+')
    chrom_bim_df.columns = ['chr', 'name', 'cm', 'pos', 'a1', 'a2']
    chrom_qtl_df = chrom_qtl_df.merge(chrom_bim_df, how='inner', 
                                    left_on='variant_id', right_on='name')    
#     print(chrom_qtl_df.shape)
    this_df = chrom_qtl_df.loc[chrom_qtl_df['phenotype_id'] == feature_id].copy()
    this_df['set'] = day
    feature_qtl_df = pd.concat([feature_qtl_df, this_df])
#     print(feature_qtl_df.shape)

#### get gwas stats for region of specified feature

In [None]:
# keep the GWAS rows whose SNP is one of the variants tested for the feature,
# and label them so they can be told apart from the per-day QTL sets
in_feature_region = gwas_stats_df['SNP'].isin(feature_qtl_df['variant_id'])
region_gwas_df = gwas_stats_df.loc[in_feature_region].assign(set='PDrisk')
print(region_gwas_df.shape)

#### subset appropriate columns and rename so that QTL and GWAS results can be concatenated

In [None]:
# QTL results: keep the shared fields and map them onto the common column names
qtl_columns = ['variant_id', 'chr', 'pos', 'slope', 'slope_se', 'pval_nominal','set']
qtl_renames = {'variant_id': 'variant', 'slope': 'beta',
               'slope_se': 'se', 'pval_nominal': 'pvalue'}
feature_qtl_df = feature_qtl_df[qtl_columns].rename(columns=qtl_renames)
print(feature_qtl_df.shape)

# GWAS results: same treatment so both tables share one schema
gwas_columns = ['SNP', 'chr', 'position', 'b', 'se', 'p','set']
gwas_renames = {'SNP': 'variant', 'position': 'pos', 'b': 'beta', 'p': 'pvalue'}
region_gwas_df = region_gwas_df[gwas_columns].rename(columns=gwas_renames)
print(region_gwas_df.shape)

# stack both tables and derive the plotting/score columns
results_df = pd.concat([feature_qtl_df, region_gwas_df]).assign(
    log10_pvalue=lambda d: -np.log10(d['pvalue']),
    score=lambda d: d['beta'] / d['se'],
    score_abs=lambda d: d['score'].abs(),
)
print(results_df.shape)
display(results_df.head())

#### now do the plotting

In [None]:
# plot local manhattan for feature qtl
def plot_qtl_manhattan(feature_id, feature_chrom, feature_start, feature_stop, 
                        results_df):
    """Draw a local Manhattan-style scatter of -log10(p-value) by position.

    Points are coloured by the 'set' column of ``results_df`` and a black bar
    labelled with ``feature_id`` is drawn over the feature's span.

    Args:
        feature_id: identifier used in the printed messages, bar label and title.
        feature_chrom: chromosome name used in the printed message and x label.
        feature_start: x coordinate where the feature bar starts.
        feature_stop: x coordinate where the feature bar ends.
        results_df: DataFrame with 'pos', 'log10_pvalue' and 'set' columns.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    print(f'{feature_id}')
    print(f'feature {feature_id} is on {feature_chrom} from {feature_start} to {feature_stop}')
    print(results_df.shape)

    # now actually do the plotting
    sns.set(style='darkgrid')
    sns.relplot(x='pos',y='log10_pvalue', hue='set',
                alpha=.5, palette="dark", height=12, data=results_df)

    # place the feature bar at the lowest finite value; NaN (missing p-value)
    # and inf (p-value of 0) are ignored because round() raises on them
    finite_values = (results_df['log10_pvalue']
                     .replace([np.inf, -np.inf], np.nan)
                     .dropna())
    min_y = round(float(finite_values.min())) if not finite_values.empty else 0

    plt.plot([feature_start, feature_stop], [min_y, min_y], linewidth=5, color='black')
    # offset the label to the right of the bar so it does not overlap it
    plt.text(feature_stop+10000, min_y, feature_id, fontsize='large', color='black')

    plt.title(f'{feature_id} qtl', fontsize='large')
    plt.xlabel(f'Bp on chromosome {feature_chrom}')
    plt.ylabel('-log10(p-value)')
    plt.show()

In [None]:
# local manhattan of the per-day QTL and GWAS results around the feature
plot_qtl_manhattan(feature_id=feature_id,
                   feature_chrom=chrom,
                   feature_start=feature_start,
                   feature_stop=feature_stop,
                   results_df=results_df)

In [None]:
# prefix every column with its source so the two tables can be merged side by
# side; the checks make the cell safe to re-run (no 'qtl_qtl_' / 'gwas_gwas_')
if not all(col.startswith('qtl_') for col in feature_qtl_df.columns):
    feature_qtl_df = feature_qtl_df.add_prefix('qtl_')
if not all(col.startswith('gwas_') for col in region_gwas_df.columns):
    region_gwas_df = region_gwas_df.add_prefix('gwas_')
display(feature_qtl_df.head())
display(region_gwas_df.head())


In [None]:
# pair each QTL result with the GWAS result for the same variant and add the
# absolute beta/se scores for both
merged_df = (
    feature_qtl_df
    .merge(region_gwas_df, how='inner',
           left_on='qtl_variant', right_on='gwas_variant')
    .assign(
        gwas_score_abs=lambda d: (d['gwas_beta'] / d['gwas_se']).abs(),
        qtl_score_abs=lambda d: (d['qtl_beta'] / d['qtl_se']).abs(),
        # keep the day label as object dtype so it is treated as categorical
        qtl_set=lambda d: d['qtl_set'].astype(object),
    )
)
print(merged_df.shape)
display(merged_df.head())

In [None]:
# sns.relplot(x='gwas_beta',y='qtl_beta', hue='qtl_set', size=np.abs(merged_df['gwas_beta']),
#             alpha=.5, palette="dark", height=12, data=merged_df) 

In [None]:
# compare GWAS and feature QTL results variant by variant
def plot_gwas_qtl(feature_id, feature_chrom, feature_start, feature_stop, 
                   this_df):
    """Plot GWAS results against feature QTL results, coloured by QTL set.

    Three figures are drawn: -log10(p-value) against -log10(p-value),
    abs(score) against abs(score), and abs(score) against abs(score) with a
    per-set regression fit.

    Args:
        feature_id: identifier used in the printed messages, titles and y labels.
        feature_chrom: chromosome name; only used in the printed message.
        feature_start: feature start; only used in the printed message.
        feature_stop: feature stop; only used in the printed message.
        this_df: DataFrame with 'gwas_pvalue', 'qtl_pvalue', 'gwas_score_abs',
            'qtl_score_abs' and 'qtl_set' columns.

    Returns:
        None; each figure is shown with ``plt.show()``.
    """
    def _label_and_show(x_label, y_label):
        # apply the shared title plus the given axis labels, then render
        plt.title(f'PD risk and {feature_id} QTL', fontsize='large')
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.show()

    print(f'{feature_id}')
    print(f'feature {feature_id} is on {feature_chrom} from {feature_start} to {feature_stop}')
    print(this_df.shape)

    sns.set(style='darkgrid')

    # significance versus significance
    sns.relplot(x=np.log10(this_df['gwas_pvalue'])*-1,
                y=np.log10(this_df['qtl_pvalue'])*-1,
                hue='qtl_set', alpha=.5, palette="dark",
                height=12, data=this_df)
    _label_and_show('PD GWAS risk -log10(p-value)',
                    f'{feature_id} QTL -log10(p-value)')

    # absolute score versus absolute score
    sns.relplot(x='gwas_score_abs',
                y='qtl_score_abs',
                hue='qtl_set', alpha=.5, palette="dark",
                height=12, data=this_df)
    _label_and_show('PD GWAS risk abs(score)',
                    f'{feature_id} QTL abs(score)')

    # same comparison with a regression fit per set; titled, labelled and shown
    # like the two figures above
    sns.lmplot(x='gwas_score_abs',
               y='qtl_score_abs', hue='qtl_set',
               palette="dark", height=12, data=this_df)
    _label_and_show('PD GWAS risk abs(score)',
                    f'{feature_id} QTL abs(score)')

In [None]:
# GWAS versus QTL comparison using every variant shared by both result sets
plot_gwas_qtl(feature_id=feature_id,
              feature_chrom=chrom,
              feature_start=feature_start,
              feature_stop=feature_stop,
              this_df=merged_df)

In [None]:
# keep variants that pass the p-value threshold in either the QTL or the GWAS
# results; copy() makes alpha_df an independent frame so that adding columns to
# it later does not raise SettingWithCopyWarning
alpha_df = merged_df.loc[(merged_df['qtl_pvalue'] < alpha_value) | 
                         (merged_df['gwas_pvalue'] < alpha_value)].copy()
print(alpha_df.shape)

In [None]:
# same comparison restricted to the variants that passed the p-value threshold
plot_gwas_qtl(feature_id=feature_id,
              feature_chrom=chrom,
              feature_start=feature_start,
              feature_stop=feature_stop,
              this_df=alpha_df)

In [None]:
# mixed model of GWAS score on QTL score, grouped by QTL set
grouping = 'qtl_set'
term = 'qtl_score_abs'
this_formula = 'gwas_score_abs ~ qtl_score_abs'
result = mixed_model(this_formula, alpha_df, grouping)
print(result.summary())

# one-line summary for the QTL score term, preceded by its column labels
summary_labels = ['feature', 'coef', 'stderr', 'term_cnt', 'p-value']
summary_values = (feature, result.params[term], result.bse[term],
                  result.params.shape[0], result.pvalues[term])
print(summary_labels)
print(*summary_values)


In [None]:
# ordinary least squares of GWAS score on QTL score, pooling all QTL sets
this_formula = 'gwas_score_abs ~ qtl_score_abs'
reg_model = smf.ols(this_formula, data=alpha_df).fit()
print(reg_model.summary())

In [None]:
# same regression with the QTL set added as an additional term
this_formula = 'gwas_score_abs ~ qtl_score_abs + qtl_set'
reg_model = smf.ols(this_formula, data=alpha_df).fit()
print(reg_model.summary())

In [None]:
# regression with QTL score, QTL set and their interaction
# NOTE(review): in formula syntax `a * b` already expands to `a + b + a:b`, so
# the explicit main effects look redundant (harmless) - confirm and simplify
this_formula = 'gwas_score_abs ~ qtl_score_abs + qtl_set + qtl_score_abs * qtl_set'
reg_model = smf.ols(this_formula, data=alpha_df).fit()
print(reg_model.summary())

In [None]:
# peek at the qtl_set labels before a numeric day is derived from them
alpha_df['qtl_set'].head()

In [None]:
# derive a numeric day from the 'da<N>' labels; assign() returns a new frame, so
# this is safe to re-run and does not write into a slice of merged_df
alpha_df = alpha_df.assign(
    daynum=alpha_df['qtl_set'].str.replace('da', '', regex=False).astype('int32'))

# The lme4-style term '(1|daynum)' is not random-effects syntax for smf.ols: the
# formula parser evaluates it as a Python expression (bitwise OR of 1 and
# daynum) and fits it as an ordinary numeric covariate. Fit a real
# random-intercept model instead, with daynum as the grouping variable.
this_formula = 'gwas_score_abs ~ qtl_score_abs'
reg_model = smf.mixedlm(this_formula, data=alpha_df,
                        groups=alpha_df['daynum']).fit()
print(reg_model.summary())