In [None]:
# ============================================================
# Variable Clustering for Metabolomics Data (4 Scenarios)
# ============================================================
# This notebook performs variance filtering and variable clustering
# using VarClusHi on 4 metabolomics feature sets (scenarios).
# Scenario 1: metabolites only
# Scenario 2: metabolites + ratios
# Scenario 3: metabolites + inverse ratios
# Scenario 4: all combined
# ============================================================

# -------------------------
# Import Required Libraries
# -------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from varclushi import VarClusHi
import os

# -------------------------
# Define Scenario Settings
# -------------------------
# Per-scenario configuration, keyed by scenario label (insertion order is the
# processing order). Each entry carries:
#   filename           - Excel workbook read with pd.read_excel
#   variance_threshold - cutoff handed to VarianceThreshold, applied to the
#                        min-max-scaled features
scenarios = dict(
    Scenario_1=dict(
        filename="combat_data_metabolites_only.xlsx",
        variance_threshold=0.0,
    ),
    Scenario_2=dict(
        filename="combat_data_ratios.xlsx",
        variance_threshold=0.035,
    ),
    Scenario_3=dict(
        filename="combat_data_inverse_ratios.xlsx",
        variance_threshold=0.035,
    ),
    Scenario_4=dict(
        filename="combat_data_combined.xlsx",
        variance_threshold=0.035,
    ),
)

# -------------------------
# Process Each Scenario
# -------------------------
# Columns that are kept out of the clustering input
# (presumably covariates / sample metadata -- confirm against the workbooks).
columns_to_exclude = ['Sex', 'Age', 'BMI', 'Progressor', 'p1', 'BatchID']

# For every scenario: load its workbook, drop low-variance features, cluster
# the remaining variables with VarClusHi, keep one representative per cluster,
# and write two CSV files into the current working directory:
#   <name>_all_clusters.csv       - VarClusHi R-square summary table
#   <name>_selected_features.csv  - standardized values of the representatives
for name, settings in scenarios.items():
    print(f"\n=== Processing {name} ===")

    # Load data
    df = pd.read_excel(settings["filename"])
    # Replace '/' in column names with '__'. regex=False states explicitly
    # that this is a literal substitution: the implicit default differs
    # between pandas 1.x (regex, with a FutureWarning for single-character
    # patterns) and pandas 2.x (literal).
    df.columns = df.columns.str.replace('/', '__', regex=False)

    # Subset data: every column not listed in columns_to_exclude is clustered
    columns_to_cluster = [col for col in df.columns if col not in columns_to_exclude]
    df_for_clustering = df[columns_to_cluster]

    # Normalize to [0, 1] for variance threshold, so a single cutoff is
    # comparable across features. The row index is carried over so the scaled
    # frame stays aligned with df.
    scaler_minmax = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler_minmax.fit_transform(df_for_clustering),
        columns=df_for_clustering.columns,
        index=df_for_clustering.index,
    )

    # Apply Variance Threshold: keep only features whose variance on the
    # [0, 1] scale exceeds the scenario's cutoff
    selector = VarianceThreshold(threshold=settings["variance_threshold"])
    selector.fit(df_scaled)
    selected_features = df_scaled.columns[selector.get_support()].tolist()
    df_filtered = df_scaled[selected_features]

    print(f"Shape after variance thresholding: {df_filtered.shape}")

    # Standardize features for clustering (row index preserved as above)
    scaler_standard = StandardScaler()
    df_standardized = pd.DataFrame(
        scaler_standard.fit_transform(df_filtered),
        columns=df_filtered.columns,
        index=df_filtered.index,
    )

    # Run VarClusHi
    vc_model = VarClusHi(df_standardized, maxeigval2=1, maxclus=None)
    vc_model.varclus()

    # Save the R-square summary table (one row per variable; the columns used
    # below are "Cluster", "Variable" and "RS_Ratio")
    r2_df = pd.DataFrame(vc_model.rsquare)
    r2_df.to_csv(f"{name}_all_clusters.csv", index=False)

    # Select representative variables (lowest RS_Ratio per cluster)
    grouped = r2_df.groupby("Cluster")
    min_idx = grouped["RS_Ratio"].idxmin()
    selected_vars = r2_df.loc[min_idx, "Variable"].tolist()

    print(f"Selected variables in {name}: {len(selected_vars)} features")

    # Save the reduced dataframe; its header row is the list of selected
    # variables, and the values are the standardized ones
    selected_df = df_standardized[selected_vars]
    selected_df.to_csv(f"{name}_selected_features.csv", index=False)

print("\n✅ Variable clustering completed for all scenarios.")
