In [None]:
import pandas as pd
import pathlib
import gseapy as gp
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Load the probe manifest from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this must only ever
# be pointed at a trusted file.
manifest_path = "E:/YandexDisk/pydnameth/datasets/GPL13534/manifest"
manifest_pkl = f"{manifest_path}/manifest.pkl"
# Replace every 'CHR' value by the same string without its first three
# characters (presumably a leading "chr" prefix - TODO confirm against the data).
manifest = pd.read_pickle(manifest_pkl).assign(
    CHR=lambda frame: frame['CHR'].str.slice(start=3)
)

In [None]:
# Names of all gene-set libraries that gseapy reports for "Human".
gsea_libs_all = pd.Series(gp.get_library_name("Human"))

# Target libraries: those whose name starts with "GO_".
is_go_library = gsea_libs_all.str.contains(r'^GO_.*$', regex=True)
gsea_libs_trgt = gsea_libs_all[is_go_library].values

# Columns of the enrichment table that are kept when it is filtered for plotting.
gsea_cols = [
    "Gene_set",
    "Term",
    "Overlap",
    "P-value",
    "Adjusted P-value",
    "Odds Ratio",
    "Combined Score",
]

In [None]:
# For every clock named in `clocks_cpgs`:
#   1. read its CpG list from "<clock>_cpgs.xlsx" (column 'cpgs');
#   2. map the CpGs to gene names through the manifest 'Gene' column
#      (the manifest rows are looked up by CpG label);
#   3. query Enrichr through gseapy for every library in `gsea_libs_trgt`;
#   4. save the complete result table (xlsx + pkl) and a bar plot of the
#      most significant terms (png + pdf).
path_cpg = "E:/YandexDisk/DNAm draft/epi_clocks/"
clocks_cpgs = {'Hannum': [], 'Horvath': [], 'SkinBlood': [], 'PhenoAge': [], 'DunedinPACE': []}

# Column that holds the plotted score; it doubles as the x-axis label.
# NOTE(review): the label reads "p-value" but the values are computed from
# the "Adjusted P-value" column - consider renaming the label.
neg_log_p_col = r'$ -\log_{10}(\mathrm{p-value})$'

# One long colour list so that each gene library can get its own colour.
terms_palette = list(px.colors.qualitative.Alphabet) + list(px.colors.qualitative.Dark24) + list(
    px.colors.qualitative.Light24)

# The list of available libraries does not depend on the clock, so it is
# fetched once instead of once per loop iteration.
libraries = gp.get_library_name("Human")
df_libraries = pd.DataFrame(index=libraries)

for key in clocks_cpgs:
    clock_dir = f"{path_cpg}/{key}"
    gsea_dir = f"{clock_dir}/GSEA"
    pathlib.Path(clock_dir).mkdir(parents=True, exist_ok=True)

    # --- CpGs -> genes -----------------------------------------------------
    cpg_xlsx = f"{path_cpg}/{key}_cpgs.xlsx"
    cpgs = pd.read_excel(cpg_xlsx)
    cpg_list = list(cpgs['cpgs'])

    # Look up only the CpGs that exist in the manifest: a plain
    # manifest.at[cpg, 'Gene'] raises KeyError on the first unknown CpG and
    # returns a Series (silently ignored) when a label is duplicated.
    known_cpgs = manifest.index.intersection(cpg_list)
    n_missing = len(set(cpg_list).difference(known_cpgs))
    if n_missing:
        print(f"{key}: {n_missing} CpG(s) are absent from the manifest and were skipped")

    genes_selected = set()
    for genes_raw in manifest.loc[known_cpgs, 'Gene']:
        # Non-string entries (e.g. NaN) carry no gene annotation.
        if isinstance(genes_raw, str):
            # One manifest entry may list several genes separated by ';'.
            genes_selected.update(gene.strip() for gene in genes_raw.split(';'))
    # Drop placeholders: the 'non-genic' marker and blank tokens
    # (a ' ' entry becomes '' after stripping).
    genes_selected -= {'non-genic', ''}
    # Sorted so that genes.xlsx is identical from run to run (set order is not).
    genes_selected = sorted(genes_selected)
    genes_df = pd.DataFrame({'gene': genes_selected})
    genes_df.to_excel(f"{clock_dir}/genes.xlsx", index=False)

    # --- Enrichment --------------------------------------------------------
    pathlib.Path(gsea_dir).mkdir(parents=True, exist_ok=True)
    df_libraries.to_excel(f"{gsea_dir}/libraries.xlsx", index=True)

    enrichr_frames = []
    for gsea_lib in gsea_libs_trgt:
        lib_dir = f"{clock_dir}/{gsea_lib}"
        pathlib.Path(lib_dir).mkdir(parents=True, exist_ok=True)
        df_enrichr = gp.enrichr(
            gene_list=genes_selected,
            gene_sets=gsea_lib,
            organism='Human',
            outdir=lib_dir,
            cutoff=1.00,
            verbose=True,
            no_plot=True
        )
        enrichr_frames.append(df_enrichr.results)
    enrichr_all = pd.concat(enrichr_frames)
    enrichr_all.to_excel(f"{gsea_dir}/results.xlsx", index=True)
    enrichr_all.to_pickle(f"{gsea_dir}/results.pkl")

    # --- Top significant terms ---------------------------------------------
    # Built as one non-inplace chain so that the result is an independent
    # frame (no SettingWithCopyWarning when a column is added below).
    dfs_enrichr = (
        enrichr_all.loc[enrichr_all["Adjusted P-value"] < 0.05, gsea_cols]
        .sort_values(["Adjusted P-value"], ascending=[True])
        .head(n=50)
        .reset_index(drop=True)
    )

    # --- Plot --------------------------------------------------------------
    if not dfs_enrichr.empty:
        dfs_enrichr[neg_log_p_col] = -np.log10(dfs_enrichr.loc[:, 'Adjusted P-value'].values)
        dfs_enrichr = dfs_enrichr.rename(columns={'Gene_set': 'Gene Library'})

        # Apply the theme before the figure is created so that the first
        # figure of the run is built under the same settings as the others.
        sns.set_theme(style='whitegrid', font_scale=2)
        # Figure height grows with the number of plotted terms.
        fig, ax = plt.subplots(figsize=(10, 0.5 * dfs_enrichr.shape[0]))
        sns.barplot(
            data=dfs_enrichr,
            hue="Gene Library",
            y=dfs_enrichr.index,
            x=neg_log_p_col,
            palette=terms_palette,
            edgecolor='black',
            orient="h",
            dodge=False,
            ax=ax
        )
        # Bars are positioned by row number; show the term names instead.
        ax.set_yticklabels(dfs_enrichr["Term"])
        sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
        fig.savefig(f"{clock_dir}/terms.png", bbox_inches='tight')
        fig.savefig(f"{clock_dir}/terms.pdf", bbox_inches='tight')
        # Close explicitly: one figure per clock would otherwise accumulate.
        plt.close(fig)