## Notebook for running Gene Set Enrichment Analysis (GSEA) Enrichr using gseapy
[gseapy docs](https://gseapy.readthedocs.io/en/latest/introduction.html)

In [None]:
# print the current date/time at the start of the run
!date

#### import libraries

In [None]:
import gseapy
from gseapy.enrichr import Enrichr
from pandas import read_csv, DataFrame, concat, pivot
from math import ceil
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from seaborn import heatmap
import statsmodels.stats.multitest as smm
from numpy import log10, log
from os.path import exists
from time import sleep

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

In [None]:
# parameters
# name of the Enrichr gene-set library that is passed on to gseapy.enrichr;
# candidate values are listed in the trailing comment
gene_set = 'GO_Cellular_Component_2021' # 'GO_Cellular_Component_2021', 'GO_Biological_Process_2021', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020'

#### set notebook variables

In [None]:
# naming: both values are embedded in the input and output file names below
cohort = 'foundin'
dx = 'PD'

# directories
wrk_dir = '/labshare/raph/datasets/foundin_qtl'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'
meta_dir = f'{wrk_dir}/meta'

# input files
# full eQTL results; its 'trait' column supplies the background gene list
full_qtl_tops_file = f'{meta_dir}/{cohort}_daNA_DAn-meta_metal_eqtl_top.csv'

# output files
figure_file = f'{figures_dir}/{cohort}.colocalization.{dx}.{gene_set}.gsea_enrichr.png'
# NOTE(review): the results CSV is written under figures_dir, not results_dir;
# confirm this is intended before relocating, downstream readers may rely on it
results_file = f'{figures_dir}/{cohort}.colocalization.{dx}.{gene_set}.gsea_enrichr.csv'

# variables
# DEBUG turns on the extra display()/print() output throughout the notebook
DEBUG = False
# dpi used when the heatmap figure is saved
dpi_value = 100
# significance threshold applied to the FDR / adjusted p-value columns
alpha = 0.05
# marker_sets = ['GO_Biological_Process_2021', 
#                'GO_Cellular_Component_2021', 
#                'KEGG_2021_Human']
# modality names; each one is substituted into a colocalization results file name
modalities = ['DAn-meta',
              'Bryois-ExN', 'Bryois-InN', 'Bryois-Micro', 
              'Bryois-Astro', 'Bryois-Oligo', 'Bryois-Endo', 'Bryois-Peri']
# minimum value of the H4 column for a colocalization result to be kept
# (presumably the coloc H4 posterior probability; confirm against the input files)
min_h4 = 0.5
# seconds to sleep between successive enrichment queries
PAUSE_AMT = 2

### format a background gene list
Here we use only the genes tested in the FOUNDIN-PD meta-DAn eQTL analysis.

In [None]:
# the background is every distinct trait present in the full eQTL results
tops_df = read_csv(full_qtl_tops_file)
print(f'tops_df shape is {tops_df.shape}')
background_list = list(tops_df['trait'].unique())
print(f'background gene list has {len(background_list)} genes in it')
if DEBUG:
    # peek at a few random rows and the start of the background list
    display(tops_df.sample(5))
    print(background_list[:5])

### format each modality's gene list

In [None]:
%%time
gene_sets = {}
for modality in modalities:
    print(modality)
    in_file = f'{results_dir}/{cohort}_daNA_{modality}_{dx}.coloc.pp.csv'
    if exists(in_file):
        this_df = read_csv(in_file)
        print(f'loaded {this_df.shape[0]} results')
        # subset to min H4
        this_df = this_df.loc[this_df.H4 >= min_h4]
        print(f'kept {this_df.shape[0]} results')
        # get the list of genes to add
        gene_list = this_df.feature.to_list()
        gene_sets[modality] = gene_list
if DEBUG:
    display(gene_sets)

### check the enrichments

In [None]:
# when debugging, print the library names gseapy returns for the Human organism
if DEBUG:
    gene_set_names = gseapy.get_library_name(organism='Human')
    print(gene_set_names)

In [None]:
def find_enrichment(name: str, genes: list, sets, 
                    background_genes: list, verbose: bool=False) -> DataFrame:
    """Run gseapy's Enrichr query for one gene list and label the results.

    Args:
        name: label written into the 'modality' column of the results.
        genes: gene list handed to gseapy.enrichr as gene_list.
        sets: gene-set library (or libraries) handed to gseapy.enrichr as gene_sets.
        background_genes: genes handed to gseapy.enrichr as background.
        verbose: when True, print the full and significant result shapes and
            display the rows whose 'Adjusted P-value' is at most the
            notebook-level alpha.

    Returns:
        The results table from gseapy with the added 'modality' column.
    """
    enrichment = gseapy.enrichr(gene_list=genes,
                                gene_sets=sets,
                                organism='Human',
                                background=background_genes,
                                cutoff=0.5)
    table = enrichment.results
    table['modality'] = name
    if not verbose:
        return table
    # verbose reporting relies on the notebook-level alpha threshold
    print(f'full {sets} results shape{table.shape}')
    significant = table.loc[table['Adjusted P-value'] <= alpha]
    print(f'significant {sets} results shape{significant.shape}')
    display(significant)
    return table

# B&H FDR adjustment for a collection of p-values
def compute_fdr(pvalues):
    """Return the second element of statsmodels' fdrcorrection output,
    i.e. the corrected p-values for the given p-values."""
    _, adjusted = smm.fdrcorrection(pvalues)
    return adjusted

In [None]:
# for gene_set in marker_sets:
#     print(gene_set)

In [None]:
# one enrichment query per modality, pausing between successive calls
results = []
for modality, gene_list in gene_sets.items():
    print(f'\n########### {modality} ###########')
    modality_results = find_enrichment(name=modality, genes=gene_list,
                                       sets=gene_set,
                                       background_genes=background_list,
                                       verbose=False)
    results.append(modality_results)
    sleep(PAUSE_AMT)

### convert full enrichment results into combined data frame

In [None]:
# stack the per-modality result tables into one long table; the 'modality'
# column added by find_enrichment identifies the source of each row
results_df = concat(results)
print(f'full results shape {results_df.shape}')
if DEBUG:
    display(results_df.sample(5))

### compute B&H FDR over combined enrichments

In [None]:
# B&H FDR over all modalities' terms together; missing p-values are treated as 1
filled_pvalues = results_df['P-value'].fillna(1)
results_df['bh_fdr'] = compute_fdr(filled_pvalues)
print(f'updated shape {results_df.shape}')
if DEBUG:
    display(results_df.sample(10))

### save the results

In [None]:
# write the combined enrichment table, including the bh_fdr column, to CSV
results_df.to_csv(results_file)

### how many are statistically significant

In [None]:
# terms passing the combined B&H FDR computed in this notebook
bh_sig_df = results_df.loc[results_df.bh_fdr <= alpha]
print(bh_sig_df.shape)
display(bh_sig_df.sort_values('bh_fdr').head())

# terms passing the adjusted p-value reported with each enrichment result
adj_sig_df = results_df.loc[results_df['Adjusted P-value'] <= alpha]
print(adj_sig_df.shape)
display(adj_sig_df.sort_values('Adjusted P-value').head())

### reshape the dataframe from long to wide

In [None]:
# select the terms where at least one modality passes the per-result adjusted
# p-value (use the 'bh_fdr' column here to select on the combined FDR instead)
temp_df = results_df.loc[results_df['Adjusted P-value'] <= alpha]
# heatmap values are the natural log of the odds ratio
results_df['log_odds'] = log(results_df['Odds Ratio'])
# long to wide: one row per selected term, one column per modality
wcoloc_df = pivot(results_df.loc[results_df.Term.isin(temp_df.Term)], 
                  index=['Term'], 
                  columns=['modality'], values='log_odds')
# set precision
wcoloc_df = wcoloc_df.round(2)
# drop rows that are all null
wcoloc_df = wcoloc_df.dropna(how='all')
# order the terms by the reference modality, but only when it has a column;
# it is absent when it has no rows for the selected terms or nothing is selected
sort_modality = 'DAn-meta'
if sort_modality in wcoloc_df.columns:
    wcoloc_df = wcoloc_df.sort_values(by=[sort_modality], ascending=False)
else:
    print(f'{sort_modality} not in the reshaped results, terms left unsorted')
print(f'shape of wide reformated results {wcoloc_df.shape}')
if DEBUG:
    display(wcoloc_df)

### visualize the reformatted data as a heatmap

In [None]:

# figure height grows with the number of terms once there are more than nine
n_terms = wcoloc_df.shape[0]
height = 9 + ceil(n_terms / 6) if n_terms > 9 else 9
print(height)
with rc_context({'figure.figsize': (11, height), 'figure.dpi': 50}):
    plt.style.use('seaborn-v0_8-bright')
    heatmap(data=wcoloc_df, cmap='Purples', linewidths=0.05, linecolor='grey')
    plt.title(f'{gene_set} GSEA Enrichr for {dx} colocalizations')
    # save before show so the written file contains the rendered figure
    plt.savefig(figure_file, dpi=dpi_value, transparent=True,
                bbox_inches='tight', pad_inches=1)
    plt.show()

### visualize as clustered heatmap

In [None]:
from seaborn import clustermap

# fill the missing
wcoloc_df = wcoloc_df.fillna(0)

# clustering both axes needs at least two rows and two columns
if min(wcoloc_df.shape) < 2:
    print(f'skipping clustered heatmap, too few terms or modalities {wcoloc_df.shape}')
else:
    with rc_context({'figure.dpi': 100}):
        plt.style.use('seaborn-v0_8-bright')
        # clustermap creates its own figure sized by its figsize argument, so the
        # size has to be passed here; rcParams 'figure.figsize' is not used
        clustermap(wcoloc_df, cmap='Purples', cbar_pos=None, linecolor='grey',
                   linewidths=0.05, figsize=(11, height))
        plt.show()

In [None]:
# print the current date/time at the end of the run
!date