In [None]:
import numpy as np
import pandas as pd

def sort_filter_df(df, sort_by='padj', only_significant=True, pvalue_type='padj'):
    """
    Sort and optionally filter an enrichment-results DataFrame.

    The input DataFrame is never modified; all derived columns are added to a
    copy, which is what gets returned.

    Parameters:
    - df: DataFrame to process. Must contain an 'Overlap' column holding
      'k/n' strings and the p-value column selected by pvalue_type
      ('Adjusted P-value' or 'P-value'). Sorting by 'padj' / 'pval' also
      requires the corresponding column to be present.
    - sort_by: Criteria to sort the DataFrame. Options are 'padj', 'pval'
      (both ascending), 'percgenesinvolved', 'genesinvolved' (both descending).
    - only_significant: If True, keeps only rows whose selected p-value is <= 0.05.
    - pvalue_type: Type of p-value to consider. Options are 'padj'
      (Adjusted P-value) or 'pval' (P-value).

    Returns:
    - A new DataFrame, filtered and sorted, with these extra columns:
      '-log(<pvalue_type>)' (negative natural log of the selected p-value),
      'Genes involved (%)' (100 * k / n from 'Overlap'), and, only when
      sort_by is 'genesinvolved' or 'percgenesinvolved', 'genesinvolved' (k).

    Raises:
    - ValueError: if pvalue_type or sort_by is not one of the listed options.
    """

    # Map pvalue_type to DataFrame column name
    pvalue_columns = {'padj': 'Adjusted P-value', 'pval': 'P-value'}
    if pvalue_type not in pvalue_columns:
        raise ValueError("pvalue_type must be 'padj' or 'pval'.")
    pvalue_col = pvalue_columns[pvalue_type]

    # sort key -> (column to sort on, ascending?)
    sort_columns = {
        'percgenesinvolved': ('Genes involved (%)', False),
        'genesinvolved': ('genesinvolved', False),
        'padj': ('Adjusted P-value', True),
        'pval': ('P-value', True)
    }
    # Validate before doing any work so a bad argument fails fast.
    if sort_by not in sort_columns:
        raise ValueError("Invalid sort_by value. Choose from 'padj', 'pval', 'percgenesinvolved', 'genesinvolved'.")

    # Filter by significance if requested
    if only_significant:
        df = df[df[pvalue_col] <= 0.05]

    # Work on an explicit copy: without it the column assignments below would
    # either write into the caller's DataFrame (no filtering) or into a
    # boolean-mask slice (SettingWithCopyWarning).
    df = df.copy()

    # Parse each 'k/n' overlap string once into integer (k, n).
    overlap_parts = df['Overlap'].apply(lambda a: tuple(int(part) for part in a.split('/')[:2]))

    # Add calculated columns
    df[f'-log({pvalue_type})'] = -np.log(df[pvalue_col])
    df['Genes involved (%)'] = overlap_parts.apply(lambda kn: 100 * (kn[0] / kn[1]))

    # The raw gene count column is only materialised for the gene-based sorts.
    if sort_by in ('genesinvolved', 'percgenesinvolved'):
        df['genesinvolved'] = overlap_parts.apply(lambda kn: kn[0])

    sort_col, ascending = sort_columns[sort_by]
    return df.sort_values(by=sort_col, ascending=ascending)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
import numpy as np
import re

def plot_enrichr2_R_boxplot2(dModules, df_in, dfHubs, df_de=None, filter_go=True, convertname=None, ordered=None, verbose=0, nHubGenes=None, name_out=None, outfolder=None, title=None, pvalue_type='padj', figsize=(8,3), color=None, nterms=10, fontsize=10, exclude_go=False, sort_by='pvalue'):
    """
    Plots enrichment analysis results with both bar plots and box plots.

    One figure row is drawn per module that still has rows after
    sort_filter_df_R(..., only_significant=True). The left panel is a
    horizontal bar plot of 'Genes involved (%)' per 'Term' (top nterms rows);
    the right panel, drawn only when df_in is given, is a box plot of the
    module's hub-gene values grouped by tissue.

    Parameters:
    - dModules: Dictionary mapping module name -> enrichment DataFrame; each
      one is passed to sort_filter_df_R.
    - df_in: DataFrame indexed by gene (rows are selected with .loc on the hub
      gene list), or None to skip the box plots. Column labels are split on
      '_' and the second field is used as the tissue name.
    - dfHubs: Per-module hub genes, indexed as dfHubs[module].
    - df_de: Optional DataFrame whose index is intersected with term genes in
      the verbose report; not used for plotting.
    - filter_go: Forwarded to sort_filter_df_R.
    - convertname: Optional mapping module name -> display title.
    - ordered: Optional list of module names fixing the plotting order;
      defaults to the key order of dModules.
    - verbose: If > 0, prints per-term overlaps with hub genes and df_de.
    - nHubGenes, exclude_go: Accepted for compatibility; currently unused.
    - name_out, outfolder: If both are given, the figure is saved as
      <outfolder>/<name_out>.pdf (the folder is created if needed).
    - title: Optional figure-level title.
    - pvalue_type, sort_by: Forwarded to sort_filter_df_R.
      NOTE(review): the default sort_by='pvalue' is not one of the keys used
      by sort_filter_df in this file; confirm sort_filter_df_R accepts it.
    - figsize, color, nterms, fontsize: Plot customization options.

    Returns:
    - dRes_DE: a dict that is created empty and never populated here, so the
      return value is always {}.
    """

    dRes_DE = dict()
    lModules = ordered if ordered else list(dModules)

    # Run the filter once per module and keep the result, instead of
    # recomputing it for the significance check, the plot and the verbose
    # report. Stored as a list of pairs so duplicate names in `ordered`
    # still produce duplicate rows.
    significant = []
    for module in lModules:
        df = sort_filter_df_R(dModules[module], sort_by=sort_by, only_significant=True, pvalue_type=pvalue_type, filter_go=filter_go)
        if not df.empty:
            significant.append((module, df))

    # plt.subplots rejects nrows=0, so bail out before creating a figure.
    if not significant:
        print("No modules with significant enrichment terms; nothing to plot.")
        return dRes_DE

    # Setup plot dimensions based on significant modules
    nrows = len(significant)
    ncols = 2 if df_in is not None else 1
    # squeeze=False keeps axs two-dimensional even for a single row/column.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)

    # Iterate through significant modules for plotting
    for i, (module, df) in enumerate(significant):
        ax_bar = axs[i, 0]
        df_head = df.head(nterms)

        # Bar plot for Genes involved (%)
        sns.barplot(x='Genes involved (%)', y='Term', data=df_head, orient='h', color=color, ax=ax_bar)
        ax_bar.set_xlabel('Genes involved (%)', fontsize=fontsize)
        ax_bar.set_ylabel('')
        ax_bar.tick_params(axis='both', which='major', labelsize=fontsize)

        # Title for the plot: display name if one is mapped, else the raw key
        if convertname and module in convertname:
            ax_bar.set_title(convertname[module], fontsize=fontsize)
        else:
            ax_bar.set_title(module, fontsize=fontsize)

        # Boxplot section if df_in is provided
        if df_in is not None:
            ax_box = axs[i, 1]
            lGenes = dfHubs[module].tolist()
            # Transpose so rows are samples and columns are hub genes; .T
            # yields a new frame, so df_in itself is not modified below.
            df_temp_t = df_in.loc[lGenes].T
            df_temp_t['Tissue'] = [x.split('_')[1] for x in df_temp_t.index]
            df_temp_t_melt = pd.melt(df_temp_t, id_vars=['Tissue'], var_name='Gene', value_name='log2FoldChange')

            sns.boxplot(x='Tissue', y='log2FoldChange', data=df_temp_t_melt, palette="Set2", ax=ax_box)
            ax_box.tick_params(axis='both', which='major', labelsize=fontsize)
            ax_box.set_xlabel('')
            ax_box.set_ylabel('log2(Fold Change)', fontsize=fontsize)
            ax_box.set_xticklabels(ax_box.get_xticklabels(), rotation=45)
            sns.despine()

    if title:
        plt.suptitle(title, fontsize=fontsize)

    # Adjust the layout to accommodate the title and overall plot aesthetics
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Save the figure to a file if a name and output folder are provided
    if name_out and outfolder:
        # Ensure the output directory exists
        os.makedirs(outfolder, exist_ok=True)
        # Save the figure
        fig_path = os.path.join(outfolder, f"{name_out}.pdf")
        fig.savefig(fig_path, bbox_inches='tight')
        print(f"Figure saved to {fig_path}")

    # Optional verbose output for debugging or detailed analysis
    if verbose > 0:
        print("Detailed overlaps and module information:")
        # The DE gene set does not depend on the module; build it once.
        de_genes = set(df_de.index) if df_de is not None else set()
        for module, df in significant:
            print(f"Module: {module}")
            hub_genes = set(dfHubs[module])
            # Relies on the filtered frame having a ';'-separated 'Genes' column.
            for _, row in df.iterrows():
                genes = set(row['Genes'].split(';'))
                hub_overlap = genes.intersection(hub_genes)
                de_overlap = genes.intersection(de_genes)
                print(f"Term: {row['Term']}, Hub Genes Overlap: {len(hub_overlap)}, DEGs Overlap: {len(de_overlap)}")

    return dRes_DE

