# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================
# Setup
# ============================================

# Seed NumPy's and TensorFlow's global random generators with the same value.
_SEED = 100
np.random.seed(_SEED)
tf.random.set_seed(_SEED)

# Cut-off value: the scenario loop compares both the raw `p1` column and the
# predicted probabilities against it.
threshold = 0.43
# Per-class weights handed to LogisticRegression and SVC; both classes weigh 1.
class_weight = dict.fromkeys((0, 1), 1)

# Dummy placeholder for top_features_dict — replace with actual top 10 features
# Maps scenario name -> list of feature column names used for that scenario.
# Every `[...]` below is a stand-in (a list holding Ellipsis) and has to be
# replaced with the real top 10 features before the script can run.
top_features_dict = {}
top_features_dict["Scenario 1"] = [...]  # Replace with top 10 features
top_features_dict["Scenario 2"] = [...]
top_features_dict["Scenario 3"] = [...]
top_features_dict["Scenario 4"] = [...]

# Define file paths for each scenario
# One row per scenario: (name, train file, validation file, TASOAC file, title).
_SCENARIO_FIELDS = ("train", "validation", "TASOAC", "title")
_SCENARIO_ROWS = (
    (
        "Scenario 1",
        "df_metabol.xlsx",
        "combat_validation_no_scaling2.xlsx",
        "combat_data_no_scaling2.xlsx",
        "Metabolomic features only",
    ),
    (
        "Scenario 2",
        "df_ratio2.xlsx",
        "combat_validation_no_scaling2_ratio.xlsx",
        "combat_data_no_scaling3_ratio.xlsx",
        "Metabolomic features + ratios",
    ),
    (
        "Scenario 3",
        "df_inverse_ratios2.xlsx",
        "combat_validation_no_scaling2_inverse_ratio.xlsx",
        "combat_data_no_scaling3_inverse_ratio.xlsx",
        "Metabolomic features + inverse ratios",
    ),
    (
        "Scenario 4",
        "df_allratios2.xlsx",
        "combat_validation_no_scaling2_allratio.xlsx",
        "combat_data_no_scaling3_all_ratios.xlsx",
        "Metabolomic features + all ratios",
    ),
)

# Scenario name -> {"train", "validation", "TASOAC", "title"} mapping.
scenarios = {
    scenario_name: dict(zip(_SCENARIO_FIELDS, field_values))
    for scenario_name, *field_values in _SCENARIO_ROWS
}

# ML/DL models
# Every spec carries a "type" tag. "ann" specs hold a "params" dict of
# hyperparameter candidate values; "ml" specs hold an estimator instance under
# "model" and the grid passed to GridSearchCV under "param_grid".
_ann_spec = dict(
    type="ann",
    params=dict(
        units=[8, 16, 32],
        dropout=[0.0, 0.2, 0.5],
        batch_size=[8, 16, 32],
        epochs=[20, 30, 50],
        learning_rate=[0.001, 0.0005, 0.01],
    ),
)

# Two sub-grids because `l1_ratio` is only listed for the elastic-net penalty.
_logreg_spec = dict(
    type="ml",
    model=LogisticRegression(random_state=123, class_weight=class_weight, max_iter=1000),
    param_grid=[
        dict(
            penalty=['l1', 'l2'],
            C=[0.01, 0.1, 1, 10, 100],
            solver=['saga'],
        ),
        dict(
            penalty=['elasticnet'],
            C=[0.01, 0.1, 1, 10, 100],
            solver=['saga'],
            l1_ratio=[0, 0.3, 0.7, 1],
        ),
    ],
)

# probability=True so that predict_proba is available on the fitted SVC.
_svm_spec = dict(
    type="ml",
    model=SVC(probability=True, class_weight=class_weight),
    param_grid=dict(
        C=[0.1, 1, 10],
        kernel=['rbf', 'linear'],
        gamma=['scale', 'auto'],
    ),
)

_random_forest_spec = dict(
    type="ml",
    model=RandomForestClassifier(class_weight='balanced'),
    param_grid=dict(
        n_estimators=[100],
        max_depth=[None, 10],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
)

_xgboost_spec = dict(
    type="ml",
    model=XGBClassifier(eval_metric='logloss', scale_pos_weight=3),
    param_grid=dict(
        n_estimators=[100],
        max_depth=[3, 5],
        learning_rate=[0.01, 0.1],
        subsample=[0.8],
        colsample_bytree=[0.8],
    ),
)

# Display name -> model spec, in the order the models are trained.
models = {
    "ANN": _ann_spec,
    "Logistic Regression": _logreg_spec,
    "SVM": _svm_spec,
    "Random Forest": _random_forest_spec,
    "XGBoost": _xgboost_spec,
}

# Calculate sensitivity and specificity
def calculate_sen_spe(y_true, y_prob, threshold=0.43):
    """Return ``(sensitivity, specificity)`` for binary labels at a probability cut-off.

    Args:
        y_true: Array-like of true labels, where 1 is the positive class and 0
            the negative class.
        y_prob: Array-like of predicted positive-class probabilities, same
            length as ``y_true``.
        threshold: A sample is predicted positive when its probability is
            greater than or equal to this value.

    Returns:
        A ``(sensitivity, specificity)`` tuple. A rate whose denominator is
        zero (no positives, or no negatives, in ``y_true``) is reported as 0.
    """
    # Count the four outcomes directly rather than unpacking a confusion
    # matrix: when only one class occurs in the data the matrix collapses to
    # 1x1 and a four-way unpack raises ValueError.
    y_true = np.asarray(y_true).ravel()
    # np.asarray lets plain Python lists be compared against the threshold.
    y_pred = (np.asarray(y_prob, dtype=float).ravel() >= threshold).astype(int)

    is_pos = y_true == 1
    is_neg = y_true == 0
    tp = int(np.sum(is_pos & (y_pred == 1)))
    fn = int(np.sum(is_pos & (y_pred == 0)))
    tn = int(np.sum(is_neg & (y_pred == 0)))
    fp = int(np.sum(is_neg & (y_pred == 1)))

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return sensitivity, specificity

# ============================================
# Loop over scenarios
# ============================================

results = []


def _ann_param_combinations(param_space):
    """Yield one dict of scalar hyperparameters per point of ``param_space``.

    A list/tuple value is treated as a set of candidates; any other value is
    treated as a single fixed candidate.
    """
    from itertools import product

    names = list(param_space)
    candidates = [
        list(param_space[name]) if isinstance(param_space[name], (list, tuple))
        else [param_space[name]]
        for name in names
    ]
    for combo in product(*candidates):
        yield dict(zip(names, combo))


def _fit_ann(X, y, params):
    """Build, compile and fit a one-hidden-layer sigmoid classifier with scalar ``params``."""
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(params['units'], activation='relu'),
        Dropout(params['dropout']),
        Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=params['learning_rate']),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    model.fit(X, np.asarray(y),
              epochs=params['epochs'], batch_size=params['batch_size'], verbose=0)
    return model


def _select_and_fit_ann(X, y, param_space):
    """Grid-search ``param_space`` by hold-out ROC AUC and refit the winner on all of ``X``.

    Raises:
        RuntimeError: if no candidate produced finite hold-out predictions.
    """
    combos = list(_ann_param_combinations(param_space))
    if len(combos) == 1:
        # Nothing to choose between: skip the hold-out split entirely.
        return _fit_ann(X, y, combos[0])

    # NOTE(review): the full grid is the product of every candidate list, and
    # each point trains a network — this is the slow part of the script.
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    best_auc, best_params = -np.inf, None
    for params in combos:
        # Drop the previous candidate's graph so memory does not grow per fit.
        tf.keras.backend.clear_session()
        candidate = _fit_ann(X_fit, y_fit, params)
        hold_prob = candidate.predict(X_hold, verbose=0).flatten()
        if not np.all(np.isfinite(hold_prob)):
            # Diverged training (NaN/inf outputs) cannot be scored; skip it.
            continue
        auc = roc_auc_score(y_hold, hold_prob)
        if auc > best_auc:
            best_auc, best_params = auc, params

    if best_params is None:
        raise RuntimeError("No ANN hyperparameter combination produced finite predictions")

    tf.keras.backend.clear_session()
    return _fit_ann(X, y, best_params)


for scen_key, scen_info in scenarios.items():
    print(f"\nProcessing {scen_key}...")

    # Load and clean: '/' in column names is replaced by '__'.
    train = pd.read_excel(scen_info["train"]).rename(columns=lambda x: x.replace('/', '__'))
    val = pd.read_excel(scen_info["validation"]).rename(columns=lambda x: x.replace('/', '__'))

    # Target assignment: binarise `p1` at `threshold`, then drop the raw column.
    train['Progressor'] = np.where(train['p1'] > threshold, 1, 0)
    val['Progressor'] = np.where(val['p1'] > threshold, 1, 0)
    train.drop(columns='p1', inplace=True)
    val.drop(columns='p1', inplace=True)

    # Feature selection
    features = top_features_dict[scen_key] + ['Progressor']
    train = train[features]
    val = val[features]

    X = train.drop(columns='Progressor')
    y = train['Progressor']
    X_val = val.drop(columns='Progressor')
    y_val = val['Progressor']

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # The scaler is fitted on the training split only and reused for test/validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    X_val_scaled = scaler.transform(X_val)

    # SMOTE: resample the training split to 1500 samples per class.
    # NOTE(review): hyperparameter selection below (GridSearchCV folds and the
    # ANN hold-out) is run on this already-resampled data, so synthetic
    # samples can land in the selection folds — confirm this is acceptable.
    sm = SMOTE(random_state=42, sampling_strategy={0: 1500, 1: 1500})
    X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

    for model_name, model_def in models.items():
        print(f" Training {model_name}...")

        # Metrics init: NaN is what gets reported if this model fails below.
        test_auc = val_auc = test_acc = val_acc = test_sen = test_spe = val_sen = val_spe = np.nan

        try:
            if model_def["type"] == "ann":
                # "params" holds candidate lists, so a concrete combination has
                # to be selected before any layer can be constructed.
                model = _select_and_fit_ann(X_train_res, y_train_res, model_def["params"])

                y_test_prob = model.predict(X_test_scaled).flatten()
                y_val_prob = model.predict(X_val_scaled).flatten()

            else:
                grid = GridSearchCV(model_def["model"], model_def["param_grid"], scoring='roc_auc', cv=3)
                grid.fit(X_train_res, y_train_res)
                best_model = grid.best_estimator_
                y_test_prob = best_model.predict_proba(X_test_scaled)[:, 1]
                y_val_prob = best_model.predict_proba(X_val_scaled)[:, 1]

            # Test set metrics
            y_test_pred = (y_test_prob >= threshold).astype(int)
            test_auc = roc_auc_score(y_test, y_test_prob)
            test_acc = accuracy_score(y_test, y_test_pred)
            # Pass the cut-off explicitly so accuracy and sen/spe cannot drift apart.
            test_sen, test_spe = calculate_sen_spe(y_test, y_test_prob, threshold)

            # Validation set metrics
            y_val_pred = (y_val_prob >= threshold).astype(int)
            val_auc = roc_auc_score(y_val, y_val_prob)
            val_acc = accuracy_score(y_val, y_val_pred)
            val_sen, val_spe = calculate_sen_spe(y_val, y_val_prob, threshold)

        except Exception as e:
            # Best-effort: one failing model must not abort the whole sweep;
            # whatever metrics were not computed stay NaN in the results.
            print(f"Error in {model_name}: {e}")

        for dataset, auc, acc, sen, spe in [
            ("Test", test_auc, test_acc, test_sen, test_spe),
            ("Validation", val_auc, val_acc, val_sen, val_spe)
        ]:
            results.append({
                "Scenario": scen_key,
                "Model": model_name,
                "Dataset": dataset,
                "AUC": auc,
                "Accuracy": acc,
                "Sensitivity": sen,
                "Specificity": spe
            })

# ============================================
# Save & Plot Results
# ============================================

# Gather the per-model metric rows into one table and write it to disk.
results_df = pd.DataFrame(results)
results_df.to_excel("model_results.xlsx", index=False)

# Echo the same table, rounded to three decimals, as a markdown grid.
rounded_report = results_df.round(3).to_markdown(index=False)
print("\nFinal Results:\n", rounded_report)
