In [83]:
# =============================================================================
# Cell 1: Configuration & Setup
# =============================================================================
# This cell contains all user settings and imports all necessary libraries.
# Modify the file paths and parameters below to match your analysis needs.
# =============================================================================

# --- 1. General Configuration ---

# 1a. File Paths
# INPUT_FILE is read by load_data() with pd.read_excel(index_col=0):
# metabolites as rows (first column = metabolite name), samples as columns.
INPUT_FILE = "/Imputed/input/file/path"
# OUTPUT_PATH_BASE is used as a filename *prefix*, not as a directory: the report
# is written to "<OUTPUT_PATH_BASE>_PCA_Analysis_<method>.pdf" and the tables to
# "<OUTPUT_PATH_BASE>_PCA_Results_<method>.xlsx".
OUTPUT_PATH_BASE = "/output/file/dir"
# Excel workbook mapping pathways to metabolites (layout described in section 3).
PATHWAY_FILE = "path/to/pathway/mapping.xlsx"
# To skip the pathway analysis entirely, use this instead:  PATHWAY_FILE = None

# 1b. Metabolites to Exclude
# Row labels dropped from the input table before analysis; labels that are not
# present in the table are ignored.
METABOLITES_TO_EXCLUDE = ['9-Methylanthracene']

# 1c. Pretreatment Method
# Options: 'pareto', 'auto', 'log', 'log+pareto', 'log+auto'
# Matching is case-insensitive. Any other value only prints a warning and the
# data are analysed without pretreatment.
PRETREATMENT_METHOD = 'pareto'


# --- 2. Sample & Timepoint Configuration ---

# 2a. Sample Naming Patterns
# Define how to identify conditions and timepoints from your sample names.
# Use a list of (regex, condition_name) tuples; the first pattern that matches
# a sample name (via re.search) decides its condition.
# The regex MUST include a named group `(?P<timepoint>\d+)` to capture the timepoint identifier.
# Samples matching no pattern are labelled ("Unknown", "Unknown").
SAMPLE_NAMING_PATTERNS = [
    (r'TM2A(?P<timepoint>\d+)_', '+ GFP'),  # Pattern for positive condition
    (r'TM2An(?P<timepoint>\d+)_', '- GFP')  # Pattern for negative condition
]

# 2b. Timepoint Mapping
# Map the captured timepoint identifier (from the regex) to a display name.
# Identifiers missing from this map are displayed as "ID:<identifier>".
TIMEPOINT_MAP = {
    '1': '0h',
    '2': '0.5h',
    '3': '2h',
    '4': '5h',
    '5': '10h'
}

# 2c. Timepoint Plotting Order
# List the display names in the order you want them to appear in plots and legends.
# Only the timepoints listed here are plotted and get a timepoint-specific analysis.
TIMEPOINT_PLOT_ORDER = ['0h', '0.5h', '2h', '5h', '10h']


# --- 3. Pathway Analysis Configuration ---

# 3a. Pathway File Structure
# Zero-based column positions. The pathway sheet is read with header=None, so
# every row (including any header row) is treated as a pathway entry.
PATHWAY_NAME_COLUMN = 0
METABOLITES_COLUMN = 2

# 3b. Metabolite Delimiter
# Separator between metabolite names inside one cell of METABOLITES_COLUMN.
METABOLITE_DELIMITER = ','

# 3c. Pathway Label Display Length
# Pathway names longer than this are truncated and suffixed with '...' in plots.
PATHWAY_LABEL_MAX_LENGTH = 70


# --- 4. Imports and Environment Setup ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import re
from matplotlib.patches import Ellipse
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.transforms as mtransforms
import warnings

# Suppress every warning category for all code that runs after this cell.
# NOTE(review): this also hides deprecation and runtime warnings raised by
# numpy/pandas/seaborn/matplotlib — consider filtering specific categories instead.
warnings.filterwarnings('ignore')
# Use matplotlib's default style sheet and seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

print("--- Configuration and Setup Complete ---")

--- Configuration and Setup Complete ---


In [84]:
# =============================================================================
# Cell 2: Analysis Pipeline
# =============================================================================
# This cell contains all functions and the execution logic for the PCA.
# Do not modify this cell. Run it after setting your parameters in Cell 1.
# =============================================================================

# --- A. Data Loading and Pretreatment Functions ---

def load_data(filepath, exclude_metabolites=None):
    """Read the metabolite table from an Excel workbook.

    The first spreadsheet column becomes the index (metabolite names) and the
    remaining columns are kept as samples.

    Args:
        filepath: Path of the Excel file to read.
        exclude_metabolites: Optional collection of index labels to remove.
            Labels that do not occur in the table are skipped without error.

    Returns:
        The loaded DataFrame, or None when the file does not exist.
    """
    print(f"Loading data from: {filepath}")

    try:
        df = pd.read_excel(filepath, index_col=0)
    except FileNotFoundError:
        print(f"ERROR: File not found at {filepath}. Please check the path in the configuration cell.")
        return None

    print(f"Initial data shape: {df.shape}")

    if exclude_metabolites:
        rows_before = len(df.index)
        df = df.drop(index=exclude_metabolites, errors='ignore')
        excluded_count = rows_before - len(df.index)
        print(f"Excluded {excluded_count} specified metabolite(s).")

    print(f"Final data shape: {df.shape}")
    return df

def log_transform(data):
    """Return log(1 + x) for every value, replacing NaN results with 0."""
    print("Applying log transformation...")
    logged = np.log1p(data)
    return logged.fillna(0)

def pareto_scaling(data):
    """Pareto-scale each row: subtract the row mean, divide by sqrt(row std).

    Rows with zero standard deviation are divided by 1; infinite or NaN
    results are replaced with 0.
    """
    print("Applying Pareto scaling...")
    row_mean = data.mean(axis=1)
    row_std = data.std(axis=1, ddof=1)
    # A zero spread would divide by zero; use 1 so those rows are only centred.
    row_std = row_std.mask(row_std == 0, 1)
    centred = data.sub(row_mean, axis=0)
    scaled = centred.div(np.sqrt(row_std), axis=0)
    return scaled.replace([np.inf, -np.inf], 0).fillna(0)

def auto_scaling(data):
    """Z-score each row: subtract the row mean, divide by the row std.

    Rows with zero standard deviation are divided by 1; infinite or NaN
    results are replaced with 0.
    """
    print("Applying Auto scaling...")
    row_mean = data.mean(axis=1)
    row_std = data.std(axis=1, ddof=1)
    # A zero spread would divide by zero; use 1 so those rows are only centred.
    row_std = row_std.mask(row_std == 0, 1)
    centred = data.sub(row_mean, axis=0)
    scaled = centred.div(row_std, axis=0)
    return scaled.replace([np.inf, -np.inf], 0).fillna(0)

def apply_pretreatment(data, method):
    """Run the pretreatment pipeline selected by `method` on `data`.

    `method` is matched case-insensitively against 'log', 'pareto', 'auto',
    'log+pareto' and 'log+auto'. Any other value prints a warning and the
    input is returned unchanged.
    """
    print(f"\n--- Starting Data Pretreatment: {method} ---")
    key = method.lower()

    # Lambdas keep the transform lookups lazy: nothing is resolved or executed
    # until the selected pipeline is actually called.
    pipelines = {
        'log': lambda d: log_transform(d),
        'pareto': lambda d: pareto_scaling(d),
        'auto': lambda d: auto_scaling(d),
        'log+pareto': lambda d: pareto_scaling(log_transform(d)),
        'log+auto': lambda d: auto_scaling(log_transform(d)),
    }

    pipeline = pipelines.get(key)
    if pipeline is None:
        print(f"Warning: Pretreatment method '{method}' not recognized. Returning original data.")
        return data
    return pipeline(data)

# --- B. Core Analysis and Plotting Functions ---

def get_groups(sample_name, patterns, timepoint_map):
    """Derive (timepoint, condition) for a sample from the first matching pattern.

    Args:
        sample_name: Sample identifier to parse.
        patterns: Iterable of (regex, condition_name) pairs, tried in order.
        timepoint_map: Maps the captured `timepoint` group to a display name.

    Returns:
        A (timepoint_name, condition) tuple. Unmapped identifiers become
        "ID:<identifier>"; a matching pattern without a `timepoint` group
        yields "Unknown Timepoint"; no match at all yields ("Unknown", "Unknown").
    """
    for pattern, condition in patterns:
        match = re.search(pattern, sample_name)
        if match is None:
            continue

        # Without the named group there is no timepoint identifier to look up.
        if 'timepoint' not in match.re.groupindex:
            print(f"Warning: Regex pattern '{pattern}' is missing the named group '(?P<timepoint>...)'.")
            return "Unknown Timepoint", condition

        timepoint_id = match.group('timepoint')
        return timepoint_map.get(timepoint_id, f"ID:{timepoint_id}"), condition

    return "Unknown", "Unknown"


def perform_pca(scaled_data, n_components=20):
    """Fit a PCA on the transposed table and return the model, scores and loadings.

    Args:
        scaled_data: DataFrame whose rows are transposed into PCA features and
            whose columns become the PCA observations.
        n_components: Upper bound on the number of components; it is reduced to
            the smaller dimension of the table when necessary.

    Returns:
        (pca, scores_df, loadings_df): the fitted model, a DataFrame of scores
        indexed like the columns of `scaled_data`, and a DataFrame of loadings
        indexed like its rows. Both frames use 'PC1', 'PC2', ... as columns.
    """
    print("\nPerforming PCA...")

    # PCA expects observations as rows, so flip the table first.
    data_for_pca = scaled_data.T
    n_components = min(n_components, *data_for_pca.shape)

    pca = PCA(n_components=n_components)
    score_matrix = pca.fit_transform(data_for_pca)
    pc_labels = [f'PC{i+1}' for i in range(n_components)]

    scores_df = pd.DataFrame(score_matrix, columns=pc_labels, index=data_for_pca.index)
    loadings_df = pd.DataFrame(pca.components_.T, columns=pc_labels, index=scaled_data.index)

    print(f"Explained variance ratio (Top 5): {pca.explained_variance_ratio_[:5]}")
    return pca, scores_df, loadings_df

def perform_pathway_enrichment(top_metabolites, total_top_metabolites, pathway_file, name_col, met_col, delimiter):
    """Count how many top-loading metabolites fall into each pathway.

    The pathway workbook is read without a header row. Rows that share a
    pathway name (compared case-insensitively) are merged, and each metabolite
    is counted at most once per pathway, so `Fraction` cannot exceed 1 when
    `total_top_metabolites` is the number of distinct top metabolites.

    Args:
        top_metabolites: Iterable of metabolite names selected from the loadings.
        total_top_metabolites: Denominator for the `Fraction` column.
        pathway_file: Path of the pathway workbook; a falsy value skips the analysis.
        name_col: Column label (position, since header=None) holding pathway names.
        met_col: Column label holding the delimited metabolite list.
        delimiter: Separator between metabolite names inside one cell.

    Returns:
        DataFrame with columns 'Pathway', 'Hits' and 'Fraction', sorted by
        descending hits, or None when the analysis is skipped, nothing
        matches, or reading/parsing the workbook fails.
    """
    if not pathway_file:
        return None
    print("\nPerforming pathway enrichment analysis...")
    try:
        pathway_df = pd.read_excel(pathway_file, header=None)
        # Normalise names the same way on both sides (trimmed, lower-case);
        # str() keeps non-string index labels from aborting the analysis.
        top_metabolites_set = {str(m).strip().lower() for m in top_metabolites}

        # lower-cased pathway name -> (display name of first occurrence, hit set)
        hits_by_pathway = {}
        for _, row in pathway_df.iterrows():
            pathway_name = row[name_col]
            metabolite_cell = row[met_col]
            # An empty metabolite cell would otherwise be parsed as the name "nan".
            if pd.isna(pathway_name) or pd.isna(metabolite_cell):
                continue

            pathway_metabolites = {
                m.strip().lower() for m in str(metabolite_cell).split(delimiter) if m.strip()
            }
            hits = top_metabolites_set.intersection(pathway_metabolites)
            if not hits:
                continue

            display_name = str(pathway_name).strip()
            _, merged_hits = hits_by_pathway.setdefault(display_name.lower(), (display_name, set()))
            # Union rather than sum: a metabolite listed in several rows of the
            # same pathway must only be counted once.
            merged_hits.update(hits)

        if not hits_by_pathway:
            print("No pathway matches found.")
            return None

        agg_df = pd.DataFrame([
            {'Pathway': hits_by_pathway[key][0], 'Hits': len(hits_by_pathway[key][1])}
            for key in sorted(hits_by_pathway)
        ])
        # Fraction of the top loadings that belong to the pathway (not pathway coverage).
        agg_df['Fraction'] = agg_df['Hits'] / total_top_metabolites

        return agg_df.sort_values(by=['Hits', 'Fraction'], ascending=[False, False])
    except Exception as e:
        # Best effort: a broken pathway file should not abort the whole PCA report.
        print(f"Error during pathway enrichment: {e}")
        return None

def plot_pathway_enrichment(pathway_counts, ax, title, max_len, total_top_metabolites, top_n=15):
    """Draw a horizontal bar chart of the fraction of top loadings per pathway.

    Args:
        pathway_counts: DataFrame with 'Pathway', 'Hits' and 'Fraction' columns
            (or None / empty, in which case a placeholder message is shown).
        ax: Matplotlib axes to draw on.
        title: Axes title.
        max_len: Pathway names longer than this are cut and suffixed with '...'.
        total_top_metabolites: Denominator shown in the "hits / total" bar labels.
        top_n: Number of leading rows of `pathway_counts` to plot.

    Returns:
        None. The axes are modified in place.
    """
    if pathway_counts is None or pathway_counts.empty:
        ax.text(0.5, 0.5, 'No Pathway Data Available', ha='center', va='center', fontsize=12)
        ax.axis('off')
        return

    data_to_plot = pathway_counts.head(top_n).copy().sort_values('Fraction', ascending=True)

    # Build the (possibly truncated) labels from the data instead of reading
    # them back from the axis: tick label text is not guaranteed to be
    # populated before the figure is drawn.
    tick_labels = [
        (name[:max_len - 3] + '...') if len(name) > max_len else name
        for name in data_to_plot['Pathway'].astype(str)
    ]
    # Numeric positions keep one bar per pathway even if two truncated labels
    # collide, and let the tick positions be fixed before labelling them.
    positions = np.arange(len(data_to_plot))

    bar_colors = plt.cm.viridis_r(np.linspace(0.1, 0.9, len(data_to_plot)))
    bars = ax.barh(y=positions, width=data_to_plot['Fraction'].to_numpy(), color=bar_colors)
    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels)

    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Fraction of Top PC Loadings in Pathway', fontsize=12)
    ax.set_ylabel(None)
    ax.grid(axis='x', linestyle='--', alpha=0.6)

    labels = [f" {hits} / {total_top_metabolites}" for hits in data_to_plot['Hits']]
    ax.bar_label(bars, labels=labels, padding=3, fontsize=10)

def add_confidence_ellipse(ax, x, y, n_std=2.0, facecolor='none', **kwargs):
    """Add a covariance ellipse for the points (x, y) to `ax`.

    The ellipse is built from the Pearson correlation of x and y, rotated by
    45 degrees, scaled by `n_std` standard deviations along each axis and
    centred on the mean of the points.

    Args:
        ax: Matplotlib axes that receives the patch.
        x, y: Equal-length sequences of coordinates.
        n_std: Number of standard deviations spanned by the ellipse radii.
        facecolor: Fill colour of the ellipse.
        **kwargs: Forwarded to `matplotlib.patches.Ellipse`.

    Returns:
        None. Nothing is drawn for fewer than three points or when either
        coordinate has zero or non-finite variance.
    """
    if len(x) < 3:
        return

    cov = np.cov(x, y)
    var_x, var_y = cov[0, 0], cov[1, 1]
    # With zero (or NaN) variance the correlation is undefined and the patch
    # would get NaN dimensions, so skip drawing instead.
    if not (np.isfinite(var_x) and np.isfinite(var_y)) or var_x <= 0 or var_y <= 0:
        return

    # Rounding can push |r| marginally above 1, which would make a radius NaN.
    pearson = float(np.clip(cov[0, 1] / np.sqrt(var_x * var_y), -1.0, 1.0))
    ell_radius_x, ell_radius_y = np.sqrt(1 + pearson), np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2, facecolor=facecolor, **kwargs)

    scale_x, mean_x = np.sqrt(var_x) * n_std, np.mean(x)
    scale_y, mean_y = np.sqrt(var_y) * n_std, np.mean(y)
    transf = mtransforms.Affine2D().rotate_deg(45).scale(scale_x, scale_y).translate(mean_x, mean_y)
    ellipse.set_transform(transf + ax.transData)
    ax.add_patch(ellipse)

def timepoint_specific_pca(data, metadata, timepoint, config):
    """Run pretreatment, PCA and pathway enrichment on one timepoint's samples.

    Args:
        data: Table whose columns are samples.
        metadata: Frame indexed like `data.columns` with a 'timepoint' column.
        timepoint: Timepoint display name selecting the columns to analyse.
        config: Mapping providing 'pretreatment_method', 'pathway_file',
            'name_col', 'met_col' and 'delimiter'.

    Returns:
        (pca, scores_df, loadings_df, pathway_results, top_metabolites), or a
        tuple of five None values when fewer than three samples are selected.
    """
    print(f"\nAnalyzing timepoint: {timepoint}")

    selected = metadata['timepoint'] == timepoint
    timepoint_data = data.loc[:, selected]
    if timepoint_data.shape[1] < 3:
        print(f"Skipping {timepoint}: not enough samples.")
        return None, None, None, None, None

    scaled_data = apply_pretreatment(timepoint_data, config['pretreatment_method'])
    pca, scores_df, loadings_df = perform_pca(scaled_data, n_components=20)

    # Union of the 20 largest absolute loadings on PC1 and on PC2.
    pc1_top = loadings_df['PC1'].abs().nlargest(20).index.tolist()
    pc2_top = loadings_df['PC2'].abs().nlargest(20).index.tolist()
    top_metabolites = list(set(pc1_top + pc2_top))

    pathway_results = perform_pathway_enrichment(
        top_metabolites,
        len(top_metabolites),
        config['pathway_file'],
        config['name_col'],
        config['met_col'],
        config['delimiter'],
    )
    return pca, scores_df, loadings_df, pathway_results, top_metabolites

def _report_condition_styles(patterns):
    """Pair every distinct condition name in `patterns` with a marker and a colour."""
    markers = ['o', 's', '^', 'D', 'v', 'P', 'X', '*']
    colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'teal', 'magenta']
    # dict.fromkeys removes duplicates while keeping the configured order.
    conditions = list(dict.fromkeys(condition for _, condition in patterns))
    return [
        (condition, markers[i % len(markers)], colors[i % len(colors)])
        for i, condition in enumerate(conditions)
    ]


def _report_scree_plot(ax, pca, max_components=10):
    """Draw explained variance bars plus the cumulative curve on a twin axis."""
    ratios = pca.explained_variance_ratio_[:max_components]
    # Take the x positions from the ratios that exist: a PCA fitted on few
    # samples has fewer than `max_components` components, and a fixed-length
    # x axis would make ax.bar fail on mismatched lengths.
    pc_nums = np.arange(1, len(ratios) + 1)
    ax.bar(pc_nums, ratios, color='steelblue')
    ax_twin = ax.twinx()
    ax_twin.plot(pc_nums, np.cumsum(ratios), 'r-o')
    ax_twin.set_ylabel('Cumulative Variance Ratio')
    ax.set_ylabel('Explained Variance Ratio')
    ax.set_xlabel('Principal Component')
    ax.set_title('Scree Plot', fontsize=16, fontweight='bold')
    ax.set_xticks(pc_nums)


def _report_loadings_and_biplot(ax_load, ax_bi, pca, scores_df, loadings_df, n_labels=15):
    """Draw the loading plot and overlay loading arrows on an existing score scatter."""
    magnitude = np.sqrt(loadings_df['PC1']**2 + loadings_df['PC2']**2)
    top_loadings = magnitude.nlargest(n_labels).index

    ax_load.scatter(loadings_df['PC1'], loadings_df['PC2'], alpha=0.6, c='gray')
    for met in top_loadings:
        ax_load.text(loadings_df.loc[met, 'PC1'], loadings_df.loc[met, 'PC2'], met, fontsize=8, ha='center',
                     bbox=dict(facecolor='white', alpha=0.5, boxstyle='round,pad=0.2'))
    ax_load.set_xlabel('PC1 Loadings')
    ax_load.set_ylabel('PC2 Loadings')
    ax_load.set_title('Loading Plot (PC1 vs PC2)', fontweight='bold')
    ax_load.axhline(0, c='grey', ls='--')
    ax_load.axvline(0, c='grey', ls='--')

    # Stretch the loading vectors so the longest reaches 60% of the score range.
    scale_factor = (0.6 * np.max(np.abs(scores_df[['PC1', 'PC2']].values))
                    / np.max(np.abs(loadings_df.loc[top_loadings, ['PC1', 'PC2']].values)))
    for met in top_loadings:
        ax_bi.arrow(0, 0, loadings_df.loc[met, 'PC1']*scale_factor, loadings_df.loc[met, 'PC2']*scale_factor,
                    color='r', head_width=0.2)
        ax_bi.text(loadings_df.loc[met, 'PC1']*scale_factor*1.15, loadings_df.loc[met, 'PC2']*scale_factor*1.15,
                   met, color='r', ha='center', va='center', fontsize=8)
    ax_bi.set_xlabel(f'PC1 Scores ({pca.explained_variance_ratio_[0]:.1%})')
    ax_bi.set_ylabel(f'PC2 Scores ({pca.explained_variance_ratio_[1]:.1%})')
    ax_bi.set_title('Biplot', fontweight='bold')
    ax_bi.axhline(0, c='grey', ls='--')
    ax_bi.axvline(0, c='grey', ls='--')


def _report_feature_importance_figure(pca, loadings_df, pathway_results, total_top_metabolites, max_label_len):
    """Build the figure with top-15 PC1/PC2 loadings and the pathway bar chart."""
    fig = plt.figure(figsize=(18, 14), constrained_layout=True)
    gs = fig.add_gridspec(2, 2)
    ax1, ax2, ax3 = fig.add_subplot(gs[0, 0]), fig.add_subplot(gs[0, 1]), fig.add_subplot(gs[1, :])
    for ax, pc, N in [(ax1, 'PC1', 0), (ax2, 'PC2', 1)]:
        top15 = loadings_df[pc].abs().nlargest(15).index
        sns.barplot(x=loadings_df.loc[top15, pc], y=top15, ax=ax, palette='coolwarm')
        ax.set_title(f'Top 15 {pc} Loadings ({pca.explained_variance_ratio_[N]:.1%})', fontsize=14, fontweight='bold')
    plot_pathway_enrichment(pathway_results, ax3, 'Pathway Representation in Top PC1 & PC2 Loadings',
                            max_len=max_label_len, total_top_metabolites=total_top_metabolites)
    return fig


def create_comprehensive_pca_report(data, config):
    """Create a multi-page PDF report of the PCA results plus an Excel export.

    Three pages (scores + scree, loadings + biplot, feature importance +
    pathways) are written for the overall data set and again for every
    timepoint in `config['tp_order']` that has at least three samples.
    Scores, loadings and explained variance of the overall PCA are written to
    an Excel workbook.

    Args:
        data: Table with metabolites as rows and samples as columns.
        config: Mapping with the keys 'patterns', 'tp_map', 'tp_order',
            'pretreatment_method', 'pathway_file', 'name_col', 'met_col',
            'delimiter', 'max_label_len' and 'output_path' (used as a filename
            prefix for both output files).

    Returns:
        None. The PDF and Excel files are written to disk.
    """
    metadata = pd.DataFrame(
        [get_groups(s, config['patterns'], config['tp_map']) for s in data.columns],
        columns=['timepoint', 'condition'], index=data.columns,
    )
    timepoint_order = config['tp_order']
    color_dict = dict(zip(timepoint_order, plt.cm.plasma_r(np.linspace(0.1, 0.9, len(timepoint_order)))))
    # Markers/colours follow the configured conditions rather than fixed names,
    # so renamed or additional conditions are still plotted.
    condition_styles = _report_condition_styles(config['patterns'])

    scaled_data_overall = apply_pretreatment(data, config['pretreatment_method'])
    pca, scores_df, loadings_df = perform_pca(scaled_data_overall, n_components=20)

    top_metabolites_overall = list(set(loadings_df['PC1'].abs().nlargest(20).index.tolist() + loadings_df['PC2'].abs().nlargest(20).index.tolist()))
    total_top_metabolites_overall = len(top_metabolites_overall)

    pathway_results_overall = perform_pathway_enrichment(
        top_metabolites_overall, total_top_metabolites_overall, config['pathway_file'], config['name_col'], config['met_col'], config['delimiter']
    )

    pdf_filename = f"{config['output_path']}_PCA_Analysis_{config['pretreatment_method'].replace('+', '_')}.pdf"
    title_pretreatment = config['pretreatment_method'].replace("+", " + ").title()

    with PdfPages(pdf_filename) as pdf:
        print("\nGenerating plots for Overall Analysis...")

        # --- Overall Page 1: Score plot (coloured by timepoint) & scree plot ---
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10), constrained_layout=True)
        for tp in timepoint_order:
            for cond, marker, _ in condition_styles:
                mask = (metadata['timepoint'] == tp) & (metadata['condition'] == cond)
                if mask.any():
                    ax1.scatter(scores_df.loc[mask, 'PC1'], scores_df.loc[mask, 'PC2'], color=color_dict.get(tp, 'gray'),
                                marker=marker, s=150, alpha=0.8, edgecolors='black', label=f'{tp} ({cond})')
        for tp in timepoint_order:
            tp_mask = metadata['timepoint'] == tp
            if tp_mask.sum() > 2:
                add_confidence_ellipse(ax1, scores_df.loc[tp_mask, 'PC1'], scores_df.loc[tp_mask, 'PC2'],
                                       edgecolor=color_dict.get(tp, 'gray'), linewidth=2)
        ax1.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
        ax1.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
        ax1.set_title('Score Plot: PC1 vs PC2', fontsize=16, fontweight='bold')
        ax1.grid(True, alpha=0.3)
        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        _report_scree_plot(ax2, pca)
        fig.suptitle(f'Overall PCA Analysis: {title_pretreatment}', fontsize=20, fontweight='bold')
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

        # --- Overall Page 2: Loading plot & biplot ---
        fig, (ax_load, ax_bi) = plt.subplots(1, 2, figsize=(22, 10), constrained_layout=True)
        for tp in timepoint_order:
            for cond, marker, _ in condition_styles:
                mask = (metadata['timepoint'] == tp) & (metadata['condition'] == cond)
                if mask.any():
                    ax_bi.scatter(scores_df.loc[mask, 'PC1'], scores_df.loc[mask, 'PC2'], color=color_dict.get(tp, 'gray'),
                                  marker=marker, s=50, alpha=0.5)
        _report_loadings_and_biplot(ax_load, ax_bi, pca, scores_df, loadings_df)
        fig.suptitle(f'Overall Loadings and Biplot: {title_pretreatment}', fontsize=20, fontweight='bold')
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

        # --- Overall Page 3: Top Loadings & Pathway Plots ---
        fig = _report_feature_importance_figure(pca, loadings_df, pathway_results_overall,
                                                total_top_metabolites_overall, config['max_label_len'])
        fig.suptitle(f'Overall Feature Importance & Pathways: {title_pretreatment}', fontsize=20, fontweight='bold')
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

        # --- Timepoint-Specific Analysis Pages ---
        for timepoint in timepoint_order:
            pca_tp, scores_df_tp, loadings_df_tp, pathway_results_tp, top_metabolites_tp = timepoint_specific_pca(data, metadata, timepoint, config)
            if pca_tp is None:
                continue
            total_top_metabolites_tp = len(top_metabolites_tp)

            print(f"Generating plots for Timepoint: {timepoint}...")

            # --- Timepoint Page 1: Score plot (coloured by condition) & scree plot ---
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10), constrained_layout=True)
            current_meta = metadata.loc[scores_df_tp.index]
            for cond, marker, color in condition_styles:
                mask = current_meta['condition'] == cond
                if mask.any():
                    ax1.scatter(scores_df_tp.loc[mask, 'PC1'], scores_df_tp.loc[mask, 'PC2'], c=color, marker=marker,
                                s=150, edgecolors='k', label=cond)
                    add_confidence_ellipse(ax1, scores_df_tp.loc[mask, 'PC1'], scores_df_tp.loc[mask, 'PC2'],
                                           edgecolor=color, linewidth=2)
            ax1.set_xlabel(f'PC1 ({pca_tp.explained_variance_ratio_[0]:.1%})')
            ax1.set_ylabel(f'PC2 ({pca_tp.explained_variance_ratio_[1]:.1%})')
            ax1.set_title('Score Plot: PC1 vs PC2', fontsize=16, fontweight='bold')
            ax1.grid(True, alpha=0.3)
            ax1.legend()
            _report_scree_plot(ax2, pca_tp)
            fig.suptitle(f'PCA for {timepoint}: {title_pretreatment}', fontsize=20, fontweight='bold')
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

            # --- Timepoint Page 2: Loading plot & biplot ---
            fig, (ax_load, ax_bi) = plt.subplots(1, 2, figsize=(22, 10), constrained_layout=True)
            for cond, marker, color in condition_styles:
                mask = current_meta['condition'] == cond
                if mask.any():
                    ax_bi.scatter(scores_df_tp.loc[mask, 'PC1'], scores_df_tp.loc[mask, 'PC2'], c=color, marker=marker,
                                  s=80, alpha=0.6, label=cond)
            ax_bi.legend()
            _report_loadings_and_biplot(ax_load, ax_bi, pca_tp, scores_df_tp, loadings_df_tp)
            fig.suptitle(f'Loadings and Biplot for {timepoint}: {title_pretreatment}', fontsize=20, fontweight='bold')
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

            # --- Timepoint Page 3: Top Loadings & Pathway Enrichment ---
            fig = _report_feature_importance_figure(pca_tp, loadings_df_tp, pathway_results_tp,
                                                    total_top_metabolites_tp, config['max_label_len'])
            fig.suptitle(f'Feature Importance for {timepoint}: {title_pretreatment}', fontsize=20, fontweight='bold')
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)

    print(f"\nPDF report saved to: {pdf_filename}")
    excel_filename = f"{config['output_path']}_PCA_Results_{config['pretreatment_method'].replace('+', '_')}.xlsx"
    with pd.ExcelWriter(excel_filename) as writer:
        pd.concat([metadata, scores_df], axis=1).to_excel(writer, sheet_name='Scores_Overall')
        loadings_df.to_excel(writer, sheet_name='Loadings_Overall')
        pd.DataFrame(
            {'Explained_Variance_Ratio': pca.explained_variance_ratio_,
             'Cumulative_Variance': np.cumsum(pca.explained_variance_ratio_)},
            index=[f'PC{i+1}' for i in range(pca.n_components_)],
        ).to_excel(writer, sheet_name='Variance_Overall')
    print(f"Excel results saved to: {excel_filename}")

# --- C. Main Execution Block ---
print("\n\n>>> INITIATING PCA ANALYSIS PIPELINE <<<")

# Gather the Cell 1 settings into the single mapping the pipeline functions read.
config = dict(
    output_path=OUTPUT_PATH_BASE,
    pathway_file=PATHWAY_FILE,
    pretreatment_method=PRETREATMENT_METHOD,
    patterns=SAMPLE_NAMING_PATTERNS,
    tp_map=TIMEPOINT_MAP,
    tp_order=TIMEPOINT_PLOT_ORDER,
    name_col=PATHWAY_NAME_COLUMN,
    met_col=METABOLITES_COLUMN,
    delimiter=METABOLITE_DELIMITER,
    max_label_len=PATHWAY_LABEL_MAX_LENGTH,
)

metabolomics_data = load_data(INPUT_FILE, METABOLITES_TO_EXCLUDE)

# load_data returns None when the input file is missing; stop in that case.
if metabolomics_data is None:
    print("\n\n--- Analysis Halted due to data loading error. ---")
else:
    create_comprehensive_pca_report(data=metabolomics_data, config=config)
    print("\n\n--- Analysis Complete! ---")



>>> INITIATING PCA ANALYSIS PIPELINE <<<
Loading data from: /Users/aranpurdy/desktop/CFPS/PCA/RF/MOD_RF_Imputed.xlsx
Initial data shape: (115, 50)
Excluded 1 specified metabolite(s).
Final data shape: (114, 50)

--- Starting Data Pretreatment: pareto ---
Applying Pareto scaling...

Performing PCA...
Explained variance ratio (Top 5): [0.3757615  0.17967533 0.09389609 0.089471   0.05766276]

Performing pathway enrichment analysis...

Generating plots for Overall Analysis...

Analyzing timepoint: 0h

--- Starting Data Pretreatment: pareto ---
Applying Pareto scaling...

Performing PCA...
Explained variance ratio (Top 5): [0.45880013 0.19786408 0.12004526 0.06779076 0.0508915 ]

Performing pathway enrichment analysis...
Generating plots for Timepoint: 0h...

Analyzing timepoint: 0.5h

--- Starting Data Pretreatment: pareto ---
Applying Pareto scaling...

Performing PCA...
Explained variance ratio (Top 5): [0.54483114 0.16928339 0.07604712 0.06533701 0.04805199]

Performing pathway enrich