In [None]:
!pip install tensorflow

In [None]:
# @title 1. Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# TensorFlow is aliased as `tf`. Aliasing it as `pd` would rebind the pandas
# alias to TensorFlow for the rest of this cell.
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Set seeds for reproducibility (Crucial for scientific reporting)
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries loaded successfully.")

In [None]:
# @title 2. Load Data and Initial Preprocessing
import os
import pandas as pd

# Location of the zipped CSV. The relative path is joined with os.path.join
# and checked before any read is attempted.
data_dir = './data'
file_name = 'asv_interpretability_dataset_modified.zip'
file_path = os.path.join(data_dir, file_name)

if os.path.exists(file_path):
    # The archive is handed straight to read_csv; PatientID is read as str.
    df = pd.read_csv(file_path, dtype={'PatientID': str})
    print(f"Successfully loaded data from {file_path}")
else:
    raise FileNotFoundError(f"Data file not found at {file_path}. Please ensure the 'data' folder contains the zipped dataset.")

# --- Helper: Handle NeutrophilCount with '<0.1' values ---
def parse_neutrophil(value):
    """Convert one raw NeutrophilCount entry to a float.

    Args:
        value: Raw cell value (number, numeric string, censored string such as
            '<0.1', or anything else).

    Returns:
        float: ``float(value)`` when that conversion succeeds; half of the
        stated limit for censored '<x' strings; ``np.nan`` for everything else,
        including censored strings whose limit is not numeric.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; anything else propagates.
        pass

    if isinstance(value, str) and "<" in value:
        try:
            threshold = float(value.replace("<", "").strip())
        except ValueError:
            # e.g. '<abc' -- treat as missing rather than aborting the whole apply()
            return np.nan
        return threshold / 2
    return np.nan

# --- Helper: Create Proxy Labels (Clinical Dysbiosis) ---
def label_dysbiosis(row):
    """Return 1 when a row satisfies all three proxy criteria, otherwise 0.

    The criteria are: MaxTemperature above 38.0, NeutrophilCount below 500,
    and the 'Consistency_liquid' indicator equal to 1 (treated as 0 when the
    key is absent). Comparisons against NaN are False, so rows with a missing
    temperature or neutrophil count are labelled 0.
    """
    fever = row['MaxTemperature'] > 38.0
    # NOTE(review): this column also carries censored entries like '<0.1';
    # confirm that the 500 cut-off is expressed in the same units as the data.
    neutropenic = row['NeutrophilCount'] < 500
    liquid_stool = row.get('Consistency_liquid', 0) == 1
    return 1 if (fever and neutropenic and liquid_stool) else 0

# Columns that together identify one sample row in the wide table
sample_keys = ['PatientID', 'SampleID', 'DayRelativeToNearestHCT']

# 1. Parse NeutrophilCount (including censored '<x' entries) into floats.
#    Unparseable values stay NaN; no median imputation is applied here.
df['NeutrophilCount'] = df['NeutrophilCount'].apply(parse_neutrophil)

# 2. One-hot encode stool consistency into Consistency_* indicator columns
df = pd.get_dummies(df, columns=['Consistency'])

# 3. log1p-transform the genus relative abundances.
#    log1p is defined at zero; negative inputs are NOT checked or clipped here.
df['RelativeAbundance'] = np.log1p(df['RelativeAbundance'].astype(float))

# 4. Row-wise proxy label
df['DysbiosisLabel'] = df.apply(label_dysbiosis, axis=1)

# 5. Long -> wide: one row per sample, one column per genus
consistency_cols = [col for col in df.columns if col.startswith('Consistency_')]
metadata_cols = sample_keys + ['MaxTemperature', 'NeutrophilCount'] + consistency_cols + ['DysbiosisLabel']

# pivot_table uses its default aggregation if a (sample, genus) pair repeats;
# genera absent from a sample are filled with 0.
genus_pivot = df.pivot_table(
    index=sample_keys,
    columns='Genus',
    values='RelativeAbundance',
    fill_value=0,
).reset_index()

# Per-sample metadata: keep the first row seen for each sample key, then attach it.
metadata = df[metadata_cols].drop_duplicates(subset=sample_keys)
merged_df = genus_pivot.merge(metadata, on=sample_keys, how='left')

print(f"Data Processed. Total Samples: {len(merged_df)}")
print(f"Total Unique Patients: {merged_df['PatientID'].nunique()}")

In [None]:
# @title 3. Patient-Level Splitting & Scaling (PREVENT DATA LEAKAGE)

# --- STEP A: Split Patients First ---
# We split the Patient IDs, NOT the sequences, so no patient contributes
# samples to more than one split.
unique_patients = merged_df['PatientID'].unique()

# Shuffle with a dedicated, freshly seeded generator instead of NumPy's global
# one. The global generator advances on every call, so re-running this cell
# used to yield a different split each time. RandomState(42) starts from the
# same state as np.random.seed(42), so a fresh Restart & Run All keeps its
# permutation as long as nothing drew from the global generator beforehand.
# TODO confirm this matches the split the saved model was trained on.
SPLIT_SEED = 42
np.random.RandomState(SPLIT_SEED).shuffle(unique_patients)

# 70 / 15 / 15 split by patient count; the test set takes the remainder.
n_total = len(unique_patients)
n_train = int(0.70 * n_total)
n_val = int(0.15 * n_total)

train_pids = unique_patients[:n_train]
val_pids = unique_patients[n_train : n_train + n_val]
test_pids = unique_patients[n_train + n_val:]

print(f"Patients in Train: {len(train_pids)}")
print(f"Patients in Val:   {len(val_pids)}")
print(f"Patients in Test:  {len(test_pids)}")

# Create separate DataFrames based on Patient ID (copies, so later in-place
# scaling does not touch merged_df)
df_train = merged_df[merged_df['PatientID'].isin(train_pids)].copy()
df_val = merged_df[merged_df['PatientID'].isin(val_pids)].copy()
df_test = merged_df[merged_df['PatientID'].isin(test_pids)].copy()

# --- STEP B: Feature Selection (Train Only) ---
# Genus columns are every pivot column except the three sample keys.
all_genus_cols = genus_pivot.columns.drop(['PatientID', 'SampleID', 'DayRelativeToNearestHCT']).tolist()

# Variance is measured on the training split only, so validation/test data
# never influence which features survive.
train_variances = df_train[all_genus_cols].var()
keep_mask = train_variances > 1e-6
non_zero_var_cols = train_variances.index[keep_mask.to_numpy()].tolist()

# Input features = microbiome genera ONLY.
# Temperature, neutrophil count and stool consistency all enter the label
# definition, so feeding any of them to the model would leak the label.
feature_cols = non_zero_var_cols

print(f"Selected {len(feature_cols)} features (Microbiome Genus Only).")

# --- STEP C: Scaling (Fit on Train Only) ---
scaler = MinMaxScaler()

# Learn the per-feature range from the training patients and scale them in one call.
df_train[feature_cols] = scaler.fit_transform(df_train[feature_cols])

# Validation and test reuse the training statistics; the scaler is never re-fitted.
for held_out_df in (df_val, df_test):
    held_out_df[feature_cols] = scaler.transform(held_out_df[feature_cols])

print("Scaling complete. Data leakage prevented.")

In [None]:
# @title 4. Sequence Generation (Sliding Window)

def build_sequences(df, feature_cols, label_col='DysbiosisLabel', seq_len=14):
    """Build fixed-length sliding windows of samples, one patient at a time.

    Rows are grouped by 'PatientID' and sorted by 'DayRelativeToNearestHCT';
    windows of ``seq_len`` consecutive rows (stride 1) are taken inside each
    group, so a window never spans two patients. Patients with fewer than
    ``seq_len`` rows contribute nothing.

    Args:
        df: DataFrame containing 'PatientID', 'DayRelativeToNearestHCT',
            the feature columns and the label column.
        feature_cols: List of column names used as model inputs.
        label_col: Column holding the per-row binary label.
        seq_len: Number of consecutive rows per window.

    Returns:
        (X, y): X has shape (n_windows, seq_len, len(feature_cols)); y has
        shape (n_windows,) and is 1 when ANY row in the window is labelled 1.
        When no window can be built, correctly shaped empty arrays are
        returned so downstream shape unpacking still works.
    """
    X_sequences = []
    y_labels = []

    # Group by patient to ensure a window never crosses patient boundaries
    for _, group in df.groupby('PatientID'):
        group = group.sort_values('DayRelativeToNearestHCT')

        values = group[feature_cols].values
        labels = group[label_col].values

        if len(values) < seq_len:
            continue

        for start in range(len(values) - seq_len + 1):
            stop = start + seq_len
            X_sequences.append(values[start:stop])
            # Window label = max over the window (use labels[stop - 1] instead
            # for a "last step" labelling scheme)
            y_labels.append(int(labels[start:stop].max()))

    if not X_sequences:
        # np.array([]) would have shape (0,), losing the time/feature axes
        return np.empty((0, seq_len, len(feature_cols))), np.empty((0,), dtype=int)

    return np.array(X_sequences), np.array(y_labels)

# Window length shared by all three splits
SEQ_LEN = 14

# Each split is windowed on its own, in train -> val -> test order, so no
# window mixes patients from different splits.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    build_sequences(split_df, feature_cols, seq_len=SEQ_LEN)
    for split_df in (df_train, df_val, df_test)
)

print(f"Training Sequences: {X_train.shape}")
print(f"Validation Sequences: {X_val.shape}")
print(f"Testing Sequences: {X_test.shape}")

In [None]:
import os
import tensorflow as tf

# Saved autoencoder, addressed relative to the 'models' folder
model_path = "./models/DynaBiome_PatientSplit_Model.keras"

if os.path.exists(model_path):
    # Load the already-trained model; nothing is trained in this notebook cell
    autoencoder = tf.keras.models.load_model(model_path)
    print(f"Model loaded successfully from {model_path}")
else:
    # Fail early with an actionable message instead of a loader traceback
    raise FileNotFoundError(f"Model file not found at {model_path}. Did you upload it to the 'models' folder?")

In [None]:
# @title 6. Generate Reconstruction Errors (Features)

import numpy as np

def get_mae_features(model, X_data):
    """Score each sequence by how badly the model reconstructs it.

    Args:
        model: Object exposing ``predict(X, verbose=0)`` that returns an array
            with the same shape as its input.
        X_data: Array indexed as (sample, timestep, feature).

    Returns:
        1-D array with one mean absolute error per sample, averaged over
        axes 1 and 2 (timesteps and features).
    """
    reconstructed = model.predict(X_data, verbose=0)
    abs_error = np.abs(X_data - reconstructed)
    return np.mean(abs_error, axis=(1, 2))

print("Generating features using the frozen LSTM Autoencoder...")

# One reconstruction-error value per sequence, computed split by split:
#   train -> used to fit the downstream classifiers
#   val   -> used to tune decision thresholds
#   test  -> used only for final reporting
train_mae, val_mae, test_mae = (
    get_mae_features(autoencoder, split_X) for split_X in (X_train, X_val, X_test)
)

# scikit-learn estimators expect 2-D input, so each error vector becomes a
# single-column matrix (n_samples x 1).
X_train_feat, X_val_feat, X_test_feat = (
    errors.reshape(-1, 1) for errors in (train_mae, val_mae, test_mae)
)

print("Feature Extraction Complete.")
print(f"Training Features Shape:   {X_train_feat.shape} (Labels: {len(y_train)})")
print(f"Validation Features Shape: {X_val_feat.shape}   (Labels: {len(y_val)})")
print(f"Test Features Shape:       {X_test_feat.shape}  (Labels: {len(y_test)})")

In [None]:
!pip install statsmodels
!pip install xgboost

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, average_precision_score, accuracy_score
import numpy as np
from sklearn.metrics import precision_recall_curve # Added for ensemble F1 tuning

# --- 1. DATA PREPARATION ---
# Rebuild the single-column feature matrices from the MAE vectors so this cell
# does not rely on the reshaped copies made earlier.
X_train_feat, X_val_feat, X_test_feat = (
    mae.reshape(-1, 1) for mae in (train_mae, val_mae, test_mae)
)

print(f"Training Data Shape: {X_train_feat.shape}")

# Ensemble row names, in the order they are tuned and reported later
ensemble_names_list = ['Averaged Ensemble', 'Weighted Ensemble', 'Stacked (LR)', 'Stacked (XGB)']

# --- 2. DEFINE PARAMETER GRIDS ---
# Each entry pairs an unfitted estimator ('model') with its search grid
# ('params'). Insertion order is preserved and determines the order of the
# tuned models downstream.
param_grids = {
    "Logistic Regression": dict(
        model=LogisticRegression(random_state=42, solver='liblinear'),
        params={'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},
    ),
    "KNN": dict(
        model=KNeighborsClassifier(),
        params={'n_neighbors': [5, 20, 50], 'weights': ['uniform', 'distance']},
    ),
    "Random Forest": dict(
        model=RandomForestClassifier(random_state=42),
        params={'n_estimators': [100, 200], 'max_depth': [3, 5, 10], 'min_samples_split': [5, 10]},
    ),
    "XGBoost": dict(
        model=XGBClassifier(eval_metric='logloss', random_state=42),
        params={'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]},
    ),
    "MLP": dict(
        model=MLPClassifier(max_iter=500, random_state=42),
        params={'hidden_layer_sizes': [(32, 16), (64, 32)], 'activation': ['relu'], 'alpha': [0.001, 0.01]},
    ),
}

# Tuned estimator per model name, filled in below
best_models = {}

print("--- STARTING HYPERPARAMETER TUNING (GridSearchCV) ---")
print("Note: Tuning is performed on Training Set with 5-Fold CV.\n")

# --- 3. SUPERVISED MODELS LOOP ---
# NOTE(review): cv=5 forms folds over sequences, not patients, so overlapping
# windows from one patient can land in different folds. The CV AUC may be
# optimistic -- consider a grouped splitter.
for name, config in param_grids.items():
    print(f"Tuning {name}...")
    grid = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    ).fit(X_train_feat, y_train)
    best_models[name] = grid.best_estimator_
    print(f"  Best CV AUC: {grid.best_score_:.4f}")

# --- 4. ONE-CLASS SVM SPECIAL HANDLING ---
print("\nTuning One-Class SVM (Special Unsupervised Handling)...")

# The one-class model is fitted on label-0 training sequences only
X_train_normal_feat = X_train_feat[y_train == 0]
best_ocsvm, best_ocsvm_f1 = None, -1

# nu is chosen by F1 on the validation split; ties keep the earlier (smaller) nu
for nu in (0.01, 0.05, 0.1, 0.2, 0.3):
    ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=nu).fit(X_train_normal_feat)
    preds = ocsvm.predict(X_val_feat)
    # predict() marks outliers as -1; recode them as the positive class
    binary_preds = (preds == -1).astype(int)
    f1 = f1_score(y_val, binary_preds)
    if f1 > best_ocsvm_f1:
        best_ocsvm, best_ocsvm_f1 = ocsvm, f1

best_models["One-Class SVM"] = best_ocsvm
print(f"  Best OCSVM Val F1: {best_ocsvm_f1:.4f}")

# --- 5. COLLECT PREDICTIONS FOR ENSEMBLING & TUNING THRESHOLDS ---
# Validation predictions train the meta-learners and tune thresholds;
# test predictions are kept for the final evaluation only.
val_probs_dict = {}
test_probs_dict = {}
val_aucs = {}  # validation ROC AUC per model, used as weights for the weighted ensemble
best_thresholds = {}

print("\n--- Generating Predictions for Ensembling & Tuning Threshholds ---")

for name, clf in best_models.items():
    if name == "One-Class SVM":
        continue  # no predict_proba; excluded from the probability ensembles

    # Positive-class probability on the validation split
    val_probs = clf.predict_proba(X_val_feat)[:, 1]
    val_probs_dict[name] = val_probs

    # Store Val AUC for weighting
    val_aucs[name] = roc_auc_score(y_val, val_probs)

    # Threshold tuning via Youden's J (maximise TPR - FPR) on validation
    fpr, tpr, thresholds = roc_curve(y_val, val_probs)
    optimal_idx = np.argmax(tpr - fpr)
    # roc_curve's first threshold is a sentinel above every score ("predict
    # nothing positive"). argmax lands there only when no real cut-off beats
    # chance; using it would label every sample negative, so fall back to 0.5.
    # fpr, tpr and thresholds have equal length, so no bounds check is needed.
    if optimal_idx == 0 or not np.isfinite(thresholds[optimal_idx]):
        best_thresholds[name] = 0.5
    else:
        best_thresholds[name] = thresholds[optimal_idx]

    # Positive-class probability on the test split
    test_probs = clf.predict_proba(X_test_feat)[:, 1]
    test_probs_dict[name] = test_probs

# ==========================================================
# --- 6. ADVANCED ENSEMBLE METHODS (CALCULATE PROBABILITIES) ---
# ==========================================================

# Base models are selected by NAME, in the order their validation predictions
# were recorded. This keeps validation and test columns aligned for stacking
# and does not depend on how many ensemble entries test_probs_dict holds.
base_model_names = list(val_probs_dict.keys())
base_test_probs = [test_probs_dict[model_name] for model_name in base_model_names]

# A. Averaged Ensemble - unweighted mean of the base-model test probabilities
avg_probs = np.mean(base_test_probs, axis=0)
test_probs_dict['Averaged Ensemble'] = avg_probs

# B. Weighted Ensemble - each base model weighted by its share of total validation AUC
total_auc = sum(val_aucs.values())
weights = {k: v / total_auc for k, v in val_aucs.items()}  # weights sum to 1
weighted_probs = np.zeros_like(avg_probs)
for model_name, weight in weights.items():
    weighted_probs += test_probs_dict[model_name] * weight
test_probs_dict['Weighted Ensemble'] = weighted_probs


# Stacking inputs: one column per base model.
# The meta-learners are fitted on VALIDATION predictions and applied to test.
X_meta_train_stack = np.column_stack([val_probs_dict[model_name] for model_name in base_model_names])
y_meta_train_stack = y_val
X_meta_test_stack = np.column_stack(base_test_probs)

# C. Stacked Ensemble (Logistic Regression Meta) - Train & Calc Test Probs
meta_lr = LogisticRegression(random_state=42)
meta_lr.fit(X_meta_train_stack, y_meta_train_stack)
stack_lr_probs = meta_lr.predict_proba(X_meta_test_stack)[:, 1]
test_probs_dict['Stacked (LR)'] = stack_lr_probs

# D. Stacked Ensemble (XGBoost Meta) - Train & Calc Test Probs
meta_xgb = XGBClassifier(eval_metric='logloss', random_state=42)
meta_xgb.fit(X_meta_train_stack, y_meta_train_stack)
stack_xgb_probs = meta_xgb.predict_proba(X_meta_test_stack)[:, 1]
test_probs_dict['Stacked (XGB)'] = stack_xgb_probs

# --- E. TUNE ENSEMBLE THRESHOLDS (MAX F1 on Validation Set) ---
print("\n--- Tuning Ensemble Thresholds for F1-score (on Validation Set) ---")

# Validation-side counterparts of the averaged and weighted ensembles
val_avg_probs = np.mean(list(val_probs_dict.values()), axis=0)
val_weighted_probs = np.zeros_like(val_avg_probs)
for name, prob in val_probs_dict.items():
    if name not in weights:
        continue
    val_weighted_probs += weights[name] * prob

# NOTE(review): the stacked meta-learners were fitted on these same validation
# predictions, so their thresholds are tuned on in-sample scores -- confirm
# this is intended (out-of-fold predictions would avoid it).
val_stack_lr_probs = meta_lr.predict_proba(X_meta_train_stack)[:, 1]
val_stack_xgb_probs = meta_xgb.predict_proba(X_meta_train_stack)[:, 1]

ensemble_val_probs_for_tuning = {
    'Averaged Ensemble': val_avg_probs,
    'Weighted Ensemble': val_weighted_probs,
    'Stacked (LR)': val_stack_lr_probs,
    'Stacked (XGB)': val_stack_xgb_probs,
}

for name in ensemble_names_list:
    probs = ensemble_val_probs_for_tuning[name]
    precision, recall, thresholds = precision_recall_curve(y_val, probs)

    # F1 at every operating point; points with precision + recall == 0 get 0
    pr_sum = precision + recall
    fscore = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(precision), where=(pr_sum != 0))

    # 0.5 is kept unless a usable F1 maximum maps to a real threshold.
    # (precision/recall have one more entry than thresholds, hence the bounds check.)
    chosen_threshold = 0.5
    if fscore.size > 0 and not np.all(np.isnan(fscore)):
        best_thresh_idx = np.nanargmax(fscore)
        if best_thresh_idx < len(thresholds):
            chosen_threshold = thresholds[best_thresh_idx]
    best_thresholds[name] = chosen_threshold
    print(f"  {name:<20}: Optimal F1 Threshold = {best_thresholds[name]:.4f}")


# ==========================================================
# --- FINAL RESULTS TABLE (Individual Models + Ensembles) ---
# ==========================================================
print("\n--- INDIVIDUAL & ENSEMBLE MODEL RESULTS ---")
print(f"{'Model':<20} | {'ROC AUC':<10} | {'PR AUC':<10} | {'F1-Score':<10} | {'Accuracy':<10} | {'W-F1':<10} | {'M-F1':<10} | {'Optimal Threshold':<19}")
print("-" * 118)

# Individual models first, then the ensembles -- one row per name.
# Every row is scored on the TEST split; thresholds come from validation.
for name in list(best_models.keys()) + ensemble_names_list:
    if name == "One-Class SVM":
        # No tuned cut-off: hard labels come from predict() (-1 = outlier),
        # ranking scores from the negated decision function.
        optimal_thresh_str = "N/A"
        scores = -best_models[name].decision_function(X_test_feat)
        test_preds = np.where(best_models[name].predict(X_test_feat) == -1, 1, 0)
        ranking_scores, hard_labels = scores, test_preds
    else:
        # Probabilistic models: binarise with the validation-tuned threshold
        probs = test_probs_dict[name]
        thresh = best_thresholds.get(name, 0.5)
        binary_preds = (probs >= thresh).astype(int)
        optimal_thresh_str = f"{thresh:.4f}"
        ranking_scores, hard_labels = probs, binary_preds

    # Threshold-free metrics use the scores; the rest use the hard labels
    roc = roc_auc_score(y_test, ranking_scores)
    pr = average_precision_score(y_test, ranking_scores)
    f1 = f1_score(y_test, hard_labels)
    acc = accuracy_score(y_test, hard_labels)
    f1_w = f1_score(y_test, hard_labels, average='weighted')
    f1_m = f1_score(y_test, hard_labels, average='macro')

    print(f"{name:<20} | {roc:.4f}     | {pr:.4f}     | {f1:.4f}     | {acc:.4f}     | {f1_w:.4f}     | {f1_m:.4f}     | {optimal_thresh_str:<19}")

print("-" * 118)

In [None]:
# @title 9. Generate Final ROC & PR Curves (All Models + Advanced Ensembles)
# This part of the cell creates the two-panel figure and draws the ROC panel
# (left). Curves are drawn on the TEST split in this order: raw reconstruction
# error baseline, each tuned classifier, then the four ensembles. The legend
# follows the same order.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# Setup Plot (1 Row, 2 Columns)
# NOTE(review): rcParams is updated AFTER the figure/axes are created; confirm
# that all text elements actually pick up the new font settings.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 9)) # Slightly larger figure for better spacing
plt.rcParams.update({'font.size': 14, 'font.family': 'serif'}) # Increased overall font size

# --- DEFINE COLORS & STYLES ---
# One fixed colour per model name; names not listed fall back to black via .get()
colors = {
    'Logistic Regression': '#1f77b4',  # Blue
    'Random Forest':       '#ff7f0e',  # Orange
    'XGBoost':             '#2ca02c',  # Green
    'MLP':                 '#d62728',  # Red
    'KNN':                 '#9467bd',  # Purple
    'One-Class SVM':       '#e377c2',  # Pink
    'Averaged Ensemble':   'black',
    'Weighted Ensemble':   '#8c564b',  # Brown
    'Stacked (LR)':        '#7f7f7f',  # Gray
    'Stacked (XGB)':       '#bcbd22'   # Olive
}

# Styles for differentiating Ensembles
styles = {
    'Averaged Ensemble': '-',
    'Weighted Ensemble': '--',
    'Stacked (LR)':      '-.',
    'Stacked (XGB)':     ':'
}

# ==========================================
# PLOT 1: ROC CURVES (Left Panel)
# ==========================================

# 1. Baseline (LSTM-AE): the raw per-sequence MAE is used directly as the score
fpr_ae, tpr_ae, _ = roc_curve(y_test, test_mae)
auc_ae = roc_auc_score(y_test, test_mae)
ax1.plot(fpr_ae, tpr_ae, label=f'LSTM-AE Baseline (AUC = {auc_ae:.3f})',
         linestyle='--', color='gray', linewidth=2.5, alpha=0.7) # Increased linewidth

# 2. Individual Tuned Classifiers
for name, clf in best_models.items():
    if name == "One-Class SVM":
        # OCSVM has no predict_proba; the decision function is negated so
        # that larger scores correspond to the positive (anomalous) class
        scores = -clf.decision_function(X_test_feat)
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_val = roc_auc_score(y_test, scores)
        ax1.plot(fpr, tpr, label=f'{name} (AUC = {roc_val:.3f})',
                 color=colors.get(name, 'black'), linewidth=1.5, linestyle=':', alpha=0.6) # Increased linewidth
        continue

    # Standard Models: positive-class probability, recomputed on the test features
    probs = clf.predict_proba(X_test_feat)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_val = roc_auc_score(y_test, probs)

    # Highlight RF/XGB slightly
    lw = 2.5 if name in ['Random Forest', 'XGBoost'] else 1.5 # Increased linewidth
    ax1.plot(fpr, tpr, label=f'{name} (AUC = {roc_val:.3f})',
             color=colors.get(name, 'black'), linewidth=lw, alpha=0.8)

# 3. Advanced Ensembles (From test_probs_dict)
# Ensembles missing from test_probs_dict are skipped silently.
ensemble_names = ['Averaged Ensemble', 'Weighted Ensemble', 'Stacked (LR)', 'Stacked (XGB)']

for name in ensemble_names:
    if name in test_probs_dict:
        probs = test_probs_dict[name]
        fpr, tpr, _ = roc_curve(y_test, probs)
        roc_val = roc_auc_score(y_test, probs)

        # Make Averaged Ensemble thickest; zorder=10 draws ensembles on top
        lw = 4 if name == 'Averaged Ensemble' else 3 # Increased linewidth
        ax1.plot(fpr, tpr, label=f'{name} (AUC = {roc_val:.3f})',
                 color=colors.get(name, 'black'), linestyle=styles.get(name, '-'),
                 linewidth=lw, zorder=10)

# Formatting ROC (the dotted diagonal is the chance line)
ax1.plot([0, 1], [0, 1], 'k:', alpha=0.4)
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.02])
ax1.set_xlabel('False Positive Rate (1 - Specificity)', fontweight='bold')
ax1.set_ylabel('True Positive Rate (Sensitivity)', fontweight='bold')
ax1.set_title('Receiver Operating Characteristic (ROC)', fontweight='bold', pad=10)
ax1.legend(loc="lower right", fontsize=11, ncol=1) # Increased legend fontsize
ax1.grid(True, alpha=0.3)


# ==========================================
# PLOT 2: PRECISION-RECALL CURVES (Right Panel)
# ==========================================
# Mirrors the ROC panel: same models, same draw order, same test split.
# The number in each legend entry is average_precision_score (AP).

# 1. Baseline: raw per-sequence MAE used directly as the score
precision_ae, recall_ae, _ = precision_recall_curve(y_test, test_mae)
pr_auc_ae = average_precision_score(y_test, test_mae)
ax2.plot(recall_ae, precision_ae, label=f'LSTM-AE Baseline (AP = {pr_auc_ae:.3f})',
         linestyle='--', color='gray', linewidth=2.5, alpha=0.7) # Increased linewidth

# 2. Individual Classifiers
for name, clf in best_models.items():
    if name == "One-Class SVM":
        # Negated decision function so larger scores mean "more anomalous"
        scores = -clf.decision_function(X_test_feat)
        precision, recall, _ = precision_recall_curve(y_test, scores)
        pr_val = average_precision_score(y_test, scores)
        ax2.plot(recall, precision, label=f'{name} (AP = {pr_val:.3f})',
                 color=colors.get(name, 'black'), linewidth=1.5, linestyle=':', alpha=0.6) # Increased linewidth
        continue

    # Positive-class probability, recomputed on the test features
    probs = clf.predict_proba(X_test_feat)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, probs)
    pr_val = average_precision_score(y_test, probs)

    lw = 2.5 if name in ['Random Forest', 'XGBoost'] else 1.5 # Increased linewidth
    ax2.plot(recall, precision, label=f'{name} (AP = {pr_val:.3f})',
             color=colors.get(name, 'black'), linewidth=lw, alpha=0.8)

# 3. Advanced Ensembles (skipped silently when absent from test_probs_dict)
for name in ensemble_names:
    if name in test_probs_dict:
        probs = test_probs_dict[name]
        precision, recall, _ = precision_recall_curve(y_test, probs)
        pr_val = average_precision_score(y_test, probs)

        lw = 4 if name == 'Averaged Ensemble' else 3 # Increased linewidth
        ax2.plot(recall, precision, label=f'{name} (AP = {pr_val:.3f})',
                 color=colors.get(name, 'black'), linestyle=styles.get(name, '-'),
                 linewidth=lw, zorder=10)

# Formatting PR
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.02])
ax2.set_xlabel('Recall (Sensitivity)', fontweight='bold')
ax2.set_ylabel('Precision (Positive Predictive Value)', fontweight='bold')
ax2.set_title('Precision-Recall (PR) Curves', fontweight='bold', pad=10)
ax2.legend(loc="lower left", fontsize=11, ncol=1) # Increased legend fontsize
ax2.grid(True, alpha=0.3)

# Write the combined two-panel figure to disk, then display it
plt.tight_layout()
plt.savefig('Figure3b_Complete_Ensemble_Analysis.pdf', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# @title 16. Benchmark Evaluation: Isolation Forest & LOF (Unsupervised)
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

print("--- RE-EVALUATING UNSUPERVISED BENCHMARKS (On Patient-Split Data) ---")

# 1. PREPARE DATA (flatten the time dimension)
# The sequence arrays are 3-D (samples, steps, features); the benchmark
# detectors take 2-D input, so each sequence becomes one long row.
n_samples_train, n_steps, n_feats = X_train.shape
n_samples_test = X_test.shape[0]

flat_width = n_steps * n_feats
X_train_flat = X_train.reshape(n_samples_train, flat_width)
X_test_flat = X_test.reshape(n_samples_test, flat_width)

# Both detectors are fitted on label-0 training sequences only
X_train_flat_normal = X_train_flat[y_train == 0]

print(f"Flattened Train Shape: {X_train_flat.shape}")
print(f"Flattened Test Shape:  {X_test_flat.shape}")

# One row per benchmarked model, appended by the sections below
benchmark_results = []

# ==========================================
# A. ISOLATION FOREST
# ==========================================
print("\nRunning Isolation Forest...")

# Fit on normal (label-0) training sequences only
iso_forest = IsolationForest(
    n_estimators=100, contamination='auto', random_state=42, n_jobs=-1
).fit(X_train_flat_normal)

# Continuous anomaly score: the decision function is negated so that larger
# values mean "more anomalous", matching the positive class in y_test
iso_scores = -iso_forest.decision_function(X_test_flat)

# Hard labels: predict() returns -1 for outliers and 1 for inliers; recode to 1 / 0
iso_preds_raw = iso_forest.predict(X_test_flat)
iso_preds = (iso_preds_raw == -1).astype(int)

# Threshold-free metrics from the scores
iso_roc = roc_auc_score(y_test, iso_scores)
iso_pr = average_precision_score(y_test, iso_scores)

# Label-based metrics from the model's own built-in cut-off
iso_acc = accuracy_score(y_test, iso_preds)
iso_f1_macro = f1_score(y_test, iso_preds, average='macro')
iso_f1_weighted = f1_score(y_test, iso_preds, average='weighted')

print(f"Isolation Forest | ROC: {iso_roc:.4f} | PR: {iso_pr:.4f} | Acc: {iso_acc:.4f}")
benchmark_results.append(['Isolation Forest', iso_acc, iso_f1_macro, iso_f1_weighted, iso_roc, iso_pr])


# ==========================================
# B. LOCAL OUTLIER FACTOR (LOF)
# ==========================================
print("Running Local Outlier Factor (LOF)...")

# novelty=True is what allows scoring unseen data after fitting;
# the model is fitted on normal (label-0) training sequences only
lof = LocalOutlierFactor(n_neighbors=20, novelty=True, n_jobs=-1).fit(X_train_flat_normal)

# Continuous anomaly score (negated so larger = more anomalous)
lof_scores = -lof.decision_function(X_test_flat)

# Hard labels: -1 (outlier) -> 1, 1 (inlier) -> 0
lof_preds_raw = lof.predict(X_test_flat)
lof_preds = (lof_preds_raw == -1).astype(int)

# Threshold-free metrics from the scores
lof_roc = roc_auc_score(y_test, lof_scores)
lof_pr = average_precision_score(y_test, lof_scores)

# Label-based metrics from the model's own built-in cut-off
lof_acc = accuracy_score(y_test, lof_preds)
lof_f1_macro = f1_score(y_test, lof_preds, average='macro')
lof_f1_weighted = f1_score(y_test, lof_preds, average='weighted')

print(f"Local Outlier Factor | ROC: {lof_roc:.4f} | PR: {lof_pr:.4f} | Acc: {lof_acc:.4f}")
benchmark_results.append(['Local Outlier Factor', lof_acc, lof_f1_macro, lof_f1_weighted, lof_roc, lof_pr])

# ==========================================
# COMPARE WITH DYNABIOME (Stacked LR)
# ==========================================
# The comparison row is optional: if the ensemble cell has not been run, the
# lookups raise NameError/KeyError and the row is skipped with a message.
try:
    dyna_probs = test_probs_dict['Stacked (LR)']
    dyna_thresh = best_thresholds['Stacked (LR)']

    # Threshold-free metrics from the probabilities
    dyna_roc = roc_auc_score(y_test, dyna_probs)
    dyna_pr = average_precision_score(y_test, dyna_probs)

    # Label-based metrics at the validation-tuned threshold
    dyna_preds = (dyna_probs >= dyna_thresh).astype(int)
    dyna_acc = accuracy_score(y_test, dyna_preds)
    dyna_f1_macro = f1_score(y_test, dyna_preds, average='macro')
    dyna_f1_weighted = f1_score(y_test, dyna_preds, average='weighted')

    benchmark_results.append(['DynaBiome (Stacked LR)', dyna_acc, dyna_f1_macro, dyna_f1_weighted, dyna_roc, dyna_pr])

except (NameError, KeyError) as e:
    print(f"DynaBiome scores not found (skipping comparison row): {e}")

# Summary table: one row per model, columns in the order the rows were built
benchmark_columns = ['Model', 'Accuracy', 'F1 (Macro)', 'F1 (Weighted)', 'ROC AUC', 'PR AUC']
df_bench = pd.DataFrame(benchmark_results, columns=benchmark_columns)
print("\n--- FINAL BENCHMARK COMPARISON ---")
print(df_bench)

In [None]:
# @title 17. Plot Benchmark Comparison
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The bars are built from df_bench (computed in the benchmark evaluation cell)
# rather than hand-copied numbers, so the figure always matches the current run.

# Short display labels for the model names used in df_bench
display_names = {
    'Local Outlier Factor': 'LOF',
    'DynaBiome (Stacked LR)': 'DynaBiome (Proposed)',
}
# Colours keyed by display name (grey baselines, red for DynaBiome), so the
# mapping stays correct even if a row is missing from df_bench.
model_palette = {
    'Isolation Forest': '#95a5a6',
    'LOF': '#7f8c8d',
    'DynaBiome (Proposed)': '#d62728',
}

df_plot = df_bench[['Model', 'ROC AUC', 'PR AUC']].copy()
df_plot['Model'] = df_plot['Model'].replace(display_names)

# Melt for Seaborn: one row per (model, metric) pair
df_melted = df_plot.melt(id_vars='Model', var_name='Metric', value_name='Score')

# Plot
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
ax = sns.barplot(x='Metric', y='Score', hue='Model', data=df_melted,
                 palette=model_palette)

# Annotate bars
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', padding=3, fontsize=10)

plt.title('Benchmarking Static vs. Temporal Anomaly Detection', fontsize=14, pad=20)
# Zoom in to show differences, but only when every score is inside the zoomed
# range; otherwise start at 0 so no bar disappears below the axis.
y_floor = 0.5 if df_melted['Score'].min() >= 0.5 else 0.0
plt.ylim(y_floor, 1.0)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('Figure_5_Benchmarks.pdf', dpi=600)
plt.show()

In [None]:
# @title 17. Generate Benchmark Comparison Figure (ROC & PR Curves)
# Creates a two-panel figure and draws the ROC panel (left) for the two
# unsupervised benchmarks and the stacked-LR ensemble, all on the test split.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, auc

# Setup Plot
# NOTE(review): rcParams is updated after the figure/axes are created; confirm
# that all text elements pick up the new font settings.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
plt.rcParams.update({'font.size': 11, 'font.family': 'serif'})

# Colors & Styles: three parallel lookup tables keyed by model name.
# They are indexed directly (no fallback), so every key in models_to_plot
# must appear in all three.
colors = {
    'Isolation Forest': '#7f7f7f',  # Gray
    'Local Outlier Factor': '#c7c7c7', # Light Gray
    'DynaBiome (Stacked LR)': 'black'   # Bold Black for your model
}
styles = {
    'Isolation Forest': '--',
    'Local Outlier Factor': ':',
    'DynaBiome (Stacked LR)': '-'
}
widths = {
    'Isolation Forest': 2,
    'Local Outlier Factor': 2,
    'DynaBiome (Stacked LR)': 3
}

# Define models to plot
# Maps model name -> per-sequence test scores (larger = more anomalous)
models_to_plot = {
    'Isolation Forest': iso_scores,
    'Local Outlier Factor': lof_scores,
    'DynaBiome (Stacked LR)': test_probs_dict['Stacked (LR)']
}

# ==========================================
# PLOT 1: ROC CURVES (Left)
# ==========================================
for name, scores in models_to_plot.items():
    fpr, tpr, _ = roc_curve(y_test, scores)
    # Area under the ROC curve, computed from the curve points
    roc_val = auc(fpr, tpr)

    ax1.plot(fpr, tpr, label=f'{name} (AUC = {roc_val:.3f})',
             color=colors[name], linestyle=styles[name],
             linewidth=widths[name], alpha=0.8)

# Dotted diagonal = chance line
ax1.plot([0, 1], [0, 1], 'k:', alpha=0.3)
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.02])
ax1.set_xlabel('False Positive Rate', fontweight='bold')
ax1.set_ylabel('True Positive Rate', fontweight='bold')
ax1.set_title('ROC Curves: Proposed vs. Benchmarks', fontweight='bold')
ax1.legend(loc="lower right", fontsize=10)
ax1.grid(True, alpha=0.3)

# ==========================================
# PLOT 2: PRECISION-RECALL CURVES (Right)
# ==========================================
for name, scores in models_to_plot.items():
    precision, recall, _ = precision_recall_curve(y_test, scores)
    pr_val = auc(recall, precision)

    ax2.plot(recall, precision, label=f'{name} (AP = {pr_val:.3f})',
             color=colors[name], linestyle=styles[name],
             linewidth=widths[name], alpha=0.8)

ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.02])
ax2.set_xlabel('Recall', fontweight='bold')
ax2.set_ylabel('Precision', fontweight='bold')
ax2.set_title('Precision-Recall Curves: Proposed vs. Benchmarks', fontweight='bold')
ax2.legend(loc="lower left", fontsize=10)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Figure_Benchmark_Comparison.pdf', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# @title 18. Train State-of-the-Art Baseline: CNN-LSTM with Self-Knowledge Distillation
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, LSTM, Dense,
                                     Dropout, GlobalAveragePooling1D, Lambda)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.utils import class_weight # Import class_weight

# Clear session to avoid clutter
tf.keras.backend.clear_session()

# ----- Custom Layer to compute KL divergence and attach loss -----
class KLDivergenceLayer(tf.keras.layers.Layer):
    """Pass-through layer that adds a weighted KL(main || branch) penalty.

    Both inputs are sigmoid probabilities, i.e. each one parameterises a
    Bernoulli distribution. The divergence therefore needs both the positive
    and the negative class term:

        KL(p || q) = p * log(p / q) + (1 - p) * log((1 - p) / (1 - q))

    Using only the first term (what a categorical KLD computes on a single
    column) is not a divergence and can become negative.

    Args:
        weight: Multiplier applied to the mean KL value before it is
            registered with ``add_loss``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, weight=0.1, **kwargs):
        super().__init__(**kwargs)
        self.weight = weight

    def call(self, inputs):
        """Register the KL penalty and return ``inputs`` unchanged."""
        main_pred, branch_pred = inputs
        # Clip on BOTH sides: log(p) and log(1 - p) must stay finite
        epsilon = 1e-7
        p = tf.clip_by_value(main_pred, epsilon, 1.0 - epsilon)
        q = tf.clip_by_value(branch_pred, epsilon, 1.0 - epsilon)

        # Bernoulli KL divergence per sample, averaged over the batch
        kl_per_sample = (p * tf.math.log(p / q)
                         + (1.0 - p) * tf.math.log((1.0 - p) / (1.0 - q)))
        kl_loss = tf.reduce_mean(kl_per_sample)
        self.add_loss(self.weight * kl_loss)
        return inputs

    def get_config(self):
        """Include ``weight`` so the layer can be re-created from its config."""
        config = super().get_config()
        config.update({'weight': self.weight})
        return config

# ----- Setup Input Shapes based on your Sequence Data -----
timesteps = X_train.shape[1]   # length of the time axis (original note: expected 14)
n_features = X_train.shape[2]  # features per timestep (original note: expected ~118)

# ----- Model Definition -----
inp = Input(shape=(timesteps, n_features), name='input_layer')

# --- CNN Module ---
cnn_out = Conv1D(filters=64,
                 kernel_size=3,
                 activation='relu',
                 padding='same',
                 name='conv1d')(inp)

# --- Self-Distillation Branch (Shallow Classifier) ---
branch_features = GlobalAveragePooling1D(name='global_avg_pool')(cnn_out)
branch_prob = Dense(1, activation='sigmoid', name='branch_prob')(branch_features)

# --- Main Pipeline (CNN + LSTM) ---
pool_out = MaxPooling1D(pool_size=2, padding='same', name='max_pool')(cnn_out)
lstm_out = LSTM(100, return_sequences=False, name='lstm')(pool_out)
dense_out = Dense(64, activation='relu', name='dense')(lstm_out)
drop = Dropout(0.3, name='dropout')(dense_out)
main_prob = Dense(1, activation='sigmoid', name='main_prob')(drop)

# --- Attach KL Divergence Loss ---
# A functional Model only contains layers that lie on a path from its inputs
# to its outputs. The KL layer must therefore FEED the model outputs; if its
# result is discarded, the layer is left out of the graph and the loss it
# registers via add_loss() never reaches training.
main_kl, branch_kl = KLDivergenceLayer(weight=0.1, name='kl_layer')([main_prob, branch_prob])

# Identity heads that give the two outputs their public names. The names are
# what compile(), the target dictionaries and 'val_main_output_loss' refer to.
main_output = tf.keras.layers.Activation('linear', name='main_output')(main_kl)
branch_output = tf.keras.layers.Activation('linear', name='branch_output')(branch_kl)

# Build Model
model_skd = Model(inputs=inp, outputs=[main_output, branch_output], name='CNN_LSTM_SKD')

# ----- Compile -----
# Dictionaries are keyed by output name so each head gets its own loss/metric
model_skd.compile(
    optimizer=Adam(learning_rate=0.001),
    loss={'main_output': 'binary_crossentropy', 'branch_output': 'binary_crossentropy'},
    loss_weights={'main_output': 1.0, 'branch_output': 0.5}, # Branch is an auxiliary loss
    metrics={'main_output': 'accuracy', 'branch_output': 'accuracy'}
)

model_skd.summary()

# ----- Training -----
# Balanced class weights for the imbalanced labels. They are NOT passed to the
# fit() call below (the original note says class_weight is unsupported for
# multi-output models); `class_weights_dict` is consumed by the single-output
# LSTM benchmarks in later cells.
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train
)
# Key by the actual class label rather than by position, so the mapping stays
# correct even when the labels are not exactly 0..k-1.
class_weights_dict = {int(label): weight for label, weight in zip(class_labels, class_weights)}

# Both heads are trained against the same ground-truth labels
y_train_targets = {'main_output': y_train, 'branch_output': y_train}
y_val_targets = {'main_output': y_val, 'branch_output': y_val}

# Stop on the validation loss of the main head only and roll back to the best epoch
early_stopping = EarlyStopping(monitor='val_main_output_loss', patience=10, restore_best_weights=True, mode='min')

print("\n--- Training CNN-LSTM SKD Model ---")
history_skd = model_skd.fit(
    X_train,
    y_train_targets,
    validation_data=(X_val, y_val_targets),
    epochs=100,  # upper bound; early stopping normally ends training sooner
    batch_size=64,
    callbacks=[early_stopping],
    # NOTE(review): no class weighting is applied to this model, unlike the
    # LSTM benchmarks that receive class_weights_dict - keep that in mind when
    # comparing results.
    verbose=1
)

# ----- Evaluation on Test Set -----
print("\n--- Evaluating on Test Set ---")
# The model has two outputs, so predict() yields [main, branch]; only the
# main head is scored.
preds_skd = model_skd.predict(X_test)
main_preds = preds_skd[0]

# Threshold-free metrics on the raw probabilities
roc_val = roc_auc_score(y_test, main_preds)
pr_val = average_precision_score(y_test, main_preds)

# Hard-label metrics at a fixed 0.5 cut-off
preds_binary = (main_preds > 0.5).astype(int)
acc = accuracy_score(y_test, preds_binary)
f1_macro, f1_weighted = (
    f1_score(y_test, preds_binary, average=averaging)
    for averaging in ('macro', 'weighted')
)

separator = "-" * 30
print(separator)
print(f"CNN-LSTM (SKD) Results:")
for metric_label, metric_value in (
    ("ROC AUC:       ", roc_val),
    ("PR AUC:        ", pr_val),
    ("Accuracy:      ", acc),
    ("F1 (Macro):    ", f1_macro),
    ("F1 (Weighted): ", f1_weighted),
):
    print(f"{metric_label}{metric_value:.4f}")
print(separator)


In [None]:
# @title Generate Final ROC & PR Curves (All Models + Advanced Ensembles)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# Font settings are applied BEFORE the figure is created so that every text
# element of the new figure is built under the same rcParams.
plt.rcParams.update({'font.size': 14, 'font.family': 'serif'})

# Setup Plot (1 Row, 2 Columns)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 9))

# --- DEFINE COLORS & STYLES ---
colors = {
    'LSTM-AE Baseline':    'gray',
    'Logistic Regression': '#1f77b4',  # Blue
    'Random Forest':       '#ff7f0e',  # Orange
    'XGBoost':             '#2ca02c',  # Green
    'MLP':                 '#d62728',  # Red
    'KNN':                 '#9467bd',  # Purple
    'One-Class SVM':       '#e377c2',  # Pink
    'Averaged Ensemble':   'black',
    'Weighted Ensemble':   '#8c564b',  # Brown
    'Stacked (LR)':        '#7f7f7f',  # Gray
    'Stacked (XGB)':       '#bcbd22',
    'CNN-LSTM (SKD)':      '#17becf'   # Teal for CNN-LSTM SKD
}

# Line styles for differentiating Ensembles and SKD
styles = {
    'LSTM-AE Baseline':    '--',
    'Logistic Regression': '-',
    'Random Forest':       '-',
    'XGBoost':             '-',
    'MLP':                 '-',
    'KNN':                 '-',
    'One-Class SVM':       ':',
    'Averaged Ensemble':   '-',
    'Weighted Ensemble':   '--',
    'Stacked (LR)':        '-.',
    'Stacked (XGB)':       ':',
    'CNN-LSTM (SKD)':      '-.'  # Dash-dot line for CNN-LSTM SKD
}

# Line widths for highlighting (models not listed fall back to 1.5)
line_widths = {
    'LSTM-AE Baseline':    2.5,
    'Averaged Ensemble':   4,
    'CNN-LSTM (SKD)':      3.5
}

# Register the CNN-LSTM (SKD) test predictions (`main_preds`, produced by the
# SKD evaluation cell) next to the other models' probabilities.
roc_skd = roc_auc_score(y_test, main_preds)
pr_skd = average_precision_score(y_test, main_preds)
test_probs_dict['CNN-LSTM (SKD)'] = main_preds.flatten()

# Plot order: baseline, individual classifiers, ensembles, CNN-LSTM (SKD)
all_models_to_plot = [
    'LSTM-AE Baseline'
] + list(best_models.keys()) + [
    'Averaged Ensemble', 'Weighted Ensemble', 'Stacked (LR)', 'Stacked (XGB)',
    'CNN-LSTM (SKD)'
]


def _test_scores_for(name):
    """Return the test-set score vector used to draw both curves for `name`.

    The lookup order mirrors the model families in this notebook:
    reconstruction error for the LSTM-AE baseline, negated decision function
    for the One-Class SVM, stored probabilities for CNN-LSTM (SKD),
    ``predict_proba`` for the remaining fitted classifiers, and stored
    probabilities for everything else (the ensembles).
    """
    if name == 'LSTM-AE Baseline':
        return test_mae
    if name == 'One-Class SVM':
        # decision_function is negated so that a larger score means "more anomalous"
        return -best_models[name].decision_function(X_test_feat)
    if name == 'CNN-LSTM (SKD)':
        return test_probs_dict[name]
    if name in best_models:
        return best_models[name].predict_proba(X_test_feat)[:, 1]
    return test_probs_dict[name]


# Score every model once and reuse the result in both panels
curve_scores = {name: _test_scores_for(name) for name in all_models_to_plot}

# ==========================================
# PLOT 1: ROC CURVES (Left Panel)
# ==========================================
for name in all_models_to_plot:
    fpr, tpr, _ = roc_curve(y_test, curve_scores[name])
    auc_val = roc_auc_score(y_test, curve_scores[name])

    lw = line_widths.get(name, 1.5)  # Default linewidth
    ax1.plot(fpr, tpr, label=f'{name} (AUC = {auc_val:.3f})',
             color=colors.get(name, 'black'), linestyle=styles.get(name, '-'),
             linewidth=lw, alpha=0.8)

# Formatting ROC
ax1.plot([0, 1], [0, 1], 'k:', alpha=0.4)  # chance diagonal
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.02])
ax1.set_xlabel('False Positive Rate (1 - Specificity)', fontweight='bold')
ax1.set_ylabel('True Positive Rate (Sensitivity)', fontweight='bold')
ax1.set_title('Receiver Operating Characteristic (ROC)', fontweight='bold', pad=10)
ax1.legend(loc="lower right", fontsize=11, ncol=1)
ax1.grid(True, alpha=0.3)

# ==========================================
# PLOT 2: PRECISION-RECALL CURVES (Right Panel)
# ==========================================
for name in all_models_to_plot:
    precision, recall, _ = precision_recall_curve(y_test, curve_scores[name])
    pr_auc_val = average_precision_score(y_test, curve_scores[name])

    lw = line_widths.get(name, 1.5)
    ax2.plot(recall, precision, label=f'{name} (AP = {pr_auc_val:.3f})',
             color=colors.get(name, 'black'), linestyle=styles.get(name, '-'),
             linewidth=lw, alpha=0.8)

# Formatting PR
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.02])
ax2.set_xlabel('Recall (Sensitivity)', fontweight='bold')
ax2.set_ylabel('Precision (Positive Predictive Value)', fontweight='bold')
ax2.set_title('Precision-Recall (PR) Curves', fontweight='bold', pad=10)
ax2.legend(loc="lower left", fontsize=11, ncol=1)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Figure_Combined_ROCPR_Analysis.pdf', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import regularizers, backend as K
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
import pandas as pd

# Start from a clean Keras session
K.clear_session()

print("--- BENCHMARKING: Sparse AE + LSTM & SelectKBest + LSTM ---")

# --- CONFIGURATION ---
# Sequence geometry is read from the training tensor (axis 1 = time, axis 2 = features)
n_timesteps, n_features = X_train.shape[1:3]
encoding_dim, k_best = 64, 50   # sparse-layer width / number of features kept by SelectKBest
rho, beta = 0.01, 3.0           # sparsity target / weight of the sparsity penalty

# One EarlyStopping instance is passed to every fit() call in this cell
es = EarlyStopping(restore_best_weights=True, patience=10, monitor='val_loss')

# ==============================================================================
# 1. Sparse Autoencoder + LSTM
# ==============================================================================
print("\n--- 1. Running Sparse Autoencoder + LSTM ---")

# Custom KL Divergence Regularizer
class KLDivergenceRegularizer(regularizers.Regularizer):
    """Sparsity penalty ``beta * sum_j KL(rho || rho_hat_j)``.

    ``rho_hat_j`` is the mean of unit ``j``'s activation over axis 0 of the
    tensor handed to the regularizer, clipped to ``[1e-6, 1 - 1e-6]``.

    Args:
        rho: Target mean activation.
        beta: Weight applied to the summed divergence.
    """

    def __init__(self, rho=0.01, beta=3.0):
        self.rho = rho
        self.beta = beta

    def __call__(self, x):
        epsilon = 1e-6
        # Clipping keeps both logarithms below finite
        mean_activation = K.clip(K.mean(x, axis=0), epsilon, 1.0 - epsilon)
        active_term = self.rho * K.log(self.rho / mean_activation)
        inactive_term = (1 - self.rho) * K.log((1 - self.rho) / (1 - mean_activation))
        return self.beta * K.sum(active_term + inactive_term)

    def get_config(self):
        return dict(rho=self.rho, beta=self.beta)

# A. Prepare Flattened Data for AE
# Every timestep becomes one independent sample of width n_features,
# i.e. (N, T, F) -> (N*T, F); the autoencoder sees no temporal context.
X_train_flat = X_train.reshape(-1, n_features)
X_val_flat = X_val.reshape(-1, n_features)

# B. Build & Train Sparse AE
# Single hidden layer whose activations carry the KL sparsity penalty.
# NOTE(review): the hidden layer is ReLU (unbounded above) while the
# regularizer clips the mean activation below 1 - confirm this is intended.
# NOTE(review): the sigmoid decoder can only output values in (0, 1);
# presumably the inputs are min-max scaled - verify against preprocessing.
input_layer = Input(shape=(n_features,))
sparse_encoded = Dense(encoding_dim, activation='relu',
                       activity_regularizer=KLDivergenceRegularizer(rho, beta),
                       name='sparse_layer')(input_layer)
decoded = Dense(n_features, activation='sigmoid')(sparse_encoded)

autoencoder_sparse = Model(input_layer, decoded)
autoencoder_sparse.compile(optimizer='adam', loss='mse')

# Reconstruction training (input == target), early-stopped on val_loss via `es`
print("Training Sparse Autoencoder...")
autoencoder_sparse.fit(X_train_flat, X_train_flat,
                       epochs=50, batch_size=256,
                       validation_data=(X_val_flat, X_val_flat),
                       callbacks=[es], verbose=0)

# C. Extract Features
# The encoder shares the trained weights: it maps the autoencoder input to
# the output of 'sparse_layer'.
encoder_model = Model(inputs=autoencoder_sparse.input, outputs=autoencoder_sparse.get_layer('sparse_layer').output)

def encode_sequence(data, enc_model):
    """Encode every timestep of a 3-D array and restore the sequence layout.

    Args:
        data: Array of shape (N, T, F).
        enc_model: Object whose ``predict(rows, verbose=0)`` maps an
            (N*T, F) array to an (N*T, D) array.

    Returns:
        Array of shape (N, T, D) holding the per-timestep encodings.
    """
    n_sequences, n_steps, n_inputs = data.shape
    # Collapse (sequence, step) into one row axis so each timestep is encoded independently
    step_rows = np.reshape(data, (-1, n_inputs))
    step_codes = enc_model.predict(step_rows, verbose=0)
    return step_codes.reshape((n_sequences, n_steps, -1))

# Replace the raw features of every timestep with their sparse encoding;
# the (N, T, ...) sequence layout is restored by encode_sequence.
X_train_enc = encode_sequence(X_train, encoder_model)
X_val_enc = encode_sequence(X_val, encoder_model)
X_test_enc = encode_sequence(X_test, encoder_model)

# D. Train LSTM on Sparse Features
# Binary classifier: LSTM(64) -> Dense(32, relu) -> Dropout(0.3) -> sigmoid
lstm_input = Input(shape=(n_timesteps, encoding_dim))
l = LSTM(64, return_sequences=False)(lstm_input)
l = Dense(32, activation='relu')(l)
l = Dropout(0.3)(l)
out = Dense(1, activation='sigmoid')(l)

lstm_sparse = Model(lstm_input, out)
lstm_sparse.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# `class_weights_dict` is defined in the CNN-LSTM (SKD) cell, so that cell
# must have been run first; `es` is the EarlyStopping instance shared by all
# fit() calls of this cell.
print("Training LSTM on Sparse Features...")
lstm_sparse.fit(X_train_enc, y_train,
                validation_data=(X_val_enc, y_val),
                epochs=50, batch_size=64,
                callbacks=[es], class_weight=class_weights_dict, verbose=0)

# Evaluate
# ROC/PR use the raw probabilities; accuracy and F1 use a fixed 0.5 cut-off.
pred_sparse = lstm_sparse.predict(X_test_enc, verbose=0).ravel()
roc_sparse = roc_auc_score(y_test, pred_sparse)
pr_sparse = average_precision_score(y_test, pred_sparse)
f1_sparse = f1_score(y_test, (pred_sparse > 0.5).astype(int), average='macro')
f1_sparse_weighted = f1_score(y_test, (pred_sparse > 0.5).astype(int), average='weighted')
acc_sparse = accuracy_score(y_test, (pred_sparse > 0.5).astype(int))

print(f"Result: ROC={roc_sparse:.4f} | PR={pr_sparse:.4f}")


# ==============================================================================
# 2. SelectKBest + LSTM
# ==============================================================================
print("\n--- 2. Running SelectKBest + LSTM ---")


def _seeded_mutual_info(X, y):
    """Mutual information scores with a fixed random_state.

    mutual_info_classif is randomised; pinning the seed (42, the value used
    for the other seeds in this notebook) makes the selected feature set
    reproducible regardless of how much of NumPy's global random stream was
    consumed before this cell.
    """
    return mutual_info_classif(X, y, random_state=42)


# A. Feature Selection (on Time-Averaged Data)
# Each sequence is collapsed to its per-feature mean over time so that every
# feature gets exactly one relevance score. Fitted on the training split only.
X_train_avg = np.mean(X_train, axis=1)
selector = SelectKBest(score_func=_seeded_mutual_info, k=k_best)
selector.fit(X_train_avg, y_train)
selected_indices = selector.get_support(indices=True)

# B. Subset Data (same feature columns for every split)
X_train_sel = X_train[:, :, selected_indices]
X_val_sel = X_val[:, :, selected_indices]
X_test_sel = X_test[:, :, selected_indices]

# C. Train LSTM on Selected Features
# Input width follows the number of features actually selected
lstm_sel_input = Input(shape=(n_timesteps, len(selected_indices)))
l2 = LSTM(64, return_sequences=False)(lstm_sel_input)
l2 = Dense(32, activation='relu')(l2)
l2 = Dropout(0.3)(l2)
out2 = Dense(1, activation='sigmoid')(l2)

lstm_select = Model(lstm_sel_input, out2)
lstm_select.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("Training LSTM on SelectKBest Features...")
lstm_select.fit(X_train_sel, y_train,
                validation_data=(X_val_sel, y_val),
                epochs=50, batch_size=64,
                callbacks=[es], class_weight=class_weights_dict, verbose=0)

# Evaluate
# ROC/PR use the raw probabilities; the hard labels at the fixed 0.5 cut-off
# are computed once and shared by accuracy and both F1 variants.
pred_sel = lstm_select.predict(X_test_sel, verbose=0).ravel()
pred_sel_binary = (pred_sel > 0.5).astype(int)
roc_sel = roc_auc_score(y_test, pred_sel)
pr_sel = average_precision_score(y_test, pred_sel)
f1_sel = f1_score(y_test, pred_sel_binary, average='macro')
f1_sel_weighted = f1_score(y_test, pred_sel_binary, average='weighted')
acc_sel = accuracy_score(y_test, pred_sel_binary)

print(f"Result: ROC={roc_sel:.4f} | PR={pr_sel:.4f}")

# ==============================================================================
# 3. Final Comparison Table
# ==============================================================================
# One row per benchmark; the value order matches `feat_bench_columns`
feat_bench_columns = ['Model', 'ROC AUC', 'PR AUC', 'Accuracy', 'F1 (Macro)', 'F1 (Weighted)']
results_data = [
    ['Sparse AE + LSTM', roc_sparse, pr_sparse, acc_sparse, f1_sparse, f1_sparse_weighted],
    ['SelectKBest + LSTM', roc_sel, pr_sel, acc_sel, f1_sel, f1_sel_weighted],
]
df_feat_bench = pd.DataFrame(data=results_data, columns=feat_bench_columns)
print("\n--- FEATURE SELECTION BENCHMARKS ---", df_feat_bench.to_string(), sep="\n")

In [None]:
# Register the additional benchmark scores next to the existing models.
# The values are stored exactly as they were computed; nothing is inverted here.
# NOTE(review): downstream metrics treat a higher score as "more likely
# dysbiosis (label 1)" - confirm that iso_scores / lof_scores were already
# oriented that way where they were created.
test_probs_dict.update({
    'Sparse AE + LSTM': pred_sparse,
    'SelectKBest + LSTM': pred_sel,
    'Isolation Forest': iso_scores,
    'Local Outlier Factor': lof_scores,
})

In [None]:
# @title 15. Feature Importance Analysis (Reconstruction Attribution)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. CALCULATE ERROR PER FEATURE ---
# Reconstruct the test sequences and average the absolute error over the
# time axis only, leaving one error value per (sample, feature).
reconstructions = autoencoder.predict(X_test, verbose=0)
absolute_error = np.abs(X_test - reconstructions)
mae_per_feature = absolute_error.mean(axis=1)

# --- 2. IDENTIFY DYSBIOTIC SAMPLES ---
# Restrict to sequences whose true label is 1, then average their errors
# feature by feature.
dysbiosis_indices = np.nonzero(y_test == 1)[0]
dysbiosis_errors = mae_per_feature[dysbiosis_indices]
mean_error_per_feature = dysbiosis_errors.mean(axis=0)

# --- 3. MAP TO NAMES ---
# `feature_cols` is defined in an earlier cell. Keep the 20 features with the
# largest mean error.
importance_df = (
    pd.DataFrame({'Feature': feature_cols, 'Mean_MAE': mean_error_per_feature})
    .sort_values(by='Mean_MAE', ascending=False)
    .head(20)
)

# --- 4. PLOT ---
plt.figure(figsize=(10, 8))
plt.rcParams.update({'font.size': 12, 'font.family': 'serif'})

importance_ax = sns.barplot(data=importance_df, x='Mean_MAE', y='Feature', palette='viridis')

importance_ax.set_title('Top 20 Bacteria Driving Dysbiosis Classification\n(Based on Reconstruction Error Contribution)', fontweight='bold')
importance_ax.set_xlabel('Mean Reconstruction Error (MAE)', fontweight='bold')
importance_ax.set_ylabel('Bacterial Genus', fontweight='bold')
importance_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('Figure4_Feature_Importance.pdf', dpi=600)
plt.show()

print("Top 5 Drivers:", importance_df[['Feature', 'Mean_MAE']].head(5), sep="\n")

In [None]:
!pip install shap
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ==========================================
# 1. SETUP: Define the "Score Function"
# ==========================================
# SHAP explains a function that maps a 2-D batch to one number per row.
# Here that number is the autoencoder's reconstruction MSE of each sample.

def model_loss_function(data):
    """Return the per-sample reconstruction MSE for a flattened batch.

    Args:
        data: 2-D array of shape (samples, SEQ_LEN * features), i.e. each
            row is one flattened sequence.

    Returns:
        1-D array with one mean-squared reconstruction error per row.
    """
    # NOTE(review): this score is an MSE, while the anomaly score used by the
    # other cells is named `test_mae` - confirm the mismatch is acceptable.
    n_steps = SEQ_LEN  # globally defined sequence length
    features_per_step = data.shape[1] // n_steps

    # Restore the (samples, timesteps, features) layout the autoencoder expects
    sequences = data.reshape(-1, n_steps, features_per_step)

    # Squared residual, averaged over both time and feature axes
    residual = sequences - autoencoder.predict(sequences, verbose=0)
    return np.mean(np.square(residual), axis=(1, 2))

# ==========================================
# 2. PREPARE DATA FOR SHAP
# ==========================================
# KernelExplainer works on 2-D inputs, so every sequence is flattened to
# (timesteps * features). Attributions are measured against a background
# sample drawn from the training set.
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

# Background set: 50 rows sampled from the flattened training data
background = shap.sample(X_train_flat, 50)

# ==========================================
# 3. RUN SHAP (KernelExplainer)
# ==========================================
print("Computing SHAP values... (This may take a few minutes)")

# Model-agnostic explainer around the reconstruction-error function
explainer = shap.KernelExplainer(model_loss_function, background)

# Only the first 50 test sequences are explained (no selection by label or
# score); widen the slice to explain more.
shap_values = explainer.shap_values(X_test_flat[:50])

# ==========================================
# 4. HANDLING FEATURE NAMES
# ==========================================
# `feature_cols` (defined in an earlier cell) holds one name per feature column.
feature_names = feature_cols

# AGGREGATE SHAP VALUES ACROSS TIMESTEPS
# shap_values has one attribution per (timestep, feature) input column:
#   (samples, timesteps * features)  ->  (samples, features)
num_features = X_train.shape[2]
timesteps = SEQ_LEN  # globally defined sequence length

shap_values_reshaped = np.asarray(shap_values).reshape(-1, timesteps, num_features)

# SUM (not mean) over time: SHAP values are additive, so a feature's total
# contribution is the sum of its per-timestep contributions. Only then does
# expected_value + sum(attributions) still equal the model output, which the
# waterfall plot below relies on. Feature rankings are unaffected by the
# choice; the magnitudes are now on the scale of the anomaly score.
shap_values_aggr = np.sum(shap_values_reshaped, axis=1)

# Mean feature value over time for the same samples (used only to colour the plots)
X_test_aggr = np.mean(X_test_flat[0:50].reshape(-1, timesteps, num_features), axis=1)

# ==========================================
# 5. PLOTTING
# ==========================================

# A + B. Global views: beeswarm summary first, then the bar chart.
# Both show the top 20 features and differ only in plot type, title and file.
for summary_kwargs, summary_title, summary_path in (
    ({}, "Impact of Microbial Features on Anomaly Score", "Fig4_SHAP_Beeswarm.pdf"),
    ({'plot_type': "bar"}, "Top 20 Drivers of Dysbiosis (Global Importance)", "Fig5_SHAP_Bar.pdf"),
):
    plt.figure()
    shap.summary_plot(
        shap_values_aggr,
        X_test_aggr,
        feature_names=feature_names,
        max_display=20,
        show=False,  # keep the figure open so it can be titled and saved
        **summary_kwargs
    )
    plt.title(summary_title)
    plt.savefig(summary_path, dpi=600, bbox_inches='tight')
    plt.show()

# C. LOCAL EXPLANATION (Waterfall) for a single explained sample.
# Index 0 is simply the first explained test sequence; it is not selected by
# anomaly score.
plt.figure()
local_explanation = shap.Explanation(
    values=shap_values_aggr[0],
    base_values=explainer.expected_value,
    data=X_test_aggr[0],
    feature_names=feature_names
)
shap.plots.waterfall(local_explanation, max_display=10, show=False)
plt.title("Local Explanation: Why was Patient 0 flagged?")
plt.savefig("Fig6_Local_Explanation.pdf", dpi=600, bbox_inches='tight')
plt.show()

# Task
Extract clinical markers (average `MaxTemperature`, average `NeutrophilCount`, and mode of `Consistency_liquid`) for each sequence in `df_test` (using a sequence length of `SEQ_LEN`), and align these markers with the corresponding `test_mae` anomaly scores and `y_test` labels. Then calculate and display the Pearson correlation coefficients between `test_mae` and these clinical markers, and generate scatter plots to visualize the relationship of `test_mae` with `MaxTemperature` and with `NeutrophilCount`.

## Extract clinical markers for test sequences

### Subtask:
Create a new function that iterates through the `df_test` DataFrame grouped by `PatientID` and, for each sequence (of length `SEQ_LEN`), extracts the average `MaxTemperature`, the average `NeutrophilCount`, and the mode of `Consistency_liquid`.


**Reasoning**:
The subtask requires defining a new function `extract_clinical_markers` to process `df_test` and extract clinical features using a sliding window. This function will calculate the mean for 'MaxTemperature' and 'NeutrophilCount' and the mode for 'Consistency_liquid' for each sequence.



In [None]:
def extract_clinical_markers(df, clinical_cols, seq_len):
    """Summarise clinical markers over per-patient sliding windows.

    Rows are grouped by 'PatientID' (so a window never crosses a patient
    boundary), sorted by 'DayRelativeToNearestHCT', and scanned with a window
    of ``seq_len`` consecutive rows and a stride of 1. Patients with fewer
    than ``seq_len`` rows contribute no windows.

    Args:
        df: DataFrame containing 'PatientID', 'DayRelativeToNearestHCT',
            'DysbiosisLabel' and every column named in ``clinical_cols``.
        clinical_cols: List of column names; must include 'MaxTemperature',
            'NeutrophilCount' and 'Consistency_liquid'.
        seq_len: Number of consecutive rows per window.

    Returns:
        DataFrame with one row per window and the columns 'PatientID',
        'SequenceIndex', 'AvgMaxTemperature', 'AvgNeutrophilCount',
        'ModeConsistencyLiquid' and 'DysbiosisLabel' (the maximum label in
        the window). The columns are present even when no window exists.
    """
    output_columns = ['PatientID', 'SequenceIndex', 'AvgMaxTemperature',
                      'AvgNeutrophilCount', 'ModeConsistencyLiquid', 'DysbiosisLabel']

    # Column positions inside a window do not change, so look them up once
    temp_idx = clinical_cols.index('MaxTemperature')
    neutro_idx = clinical_cols.index('NeutrophilCount')
    consistency_idx = clinical_cols.index('Consistency_liquid')
    selected_cols = list(clinical_cols) + ['DysbiosisLabel']

    records = []

    # Group by patient to ensure a window never crosses patient boundaries
    for pid, group in df.groupby('PatientID'):
        # Sort by time
        group = group.sort_values('DayRelativeToNearestHCT')

        # Cast to one float array so mixed column dtypes (e.g. bool dummies
        # next to float measurements) cannot yield an object array.
        # DysbiosisLabel is the last column.
        values = group[selected_cols].astype(float).to_numpy()

        # range() is empty when the patient has fewer than seq_len rows
        for i in range(len(values) - seq_len + 1):
            window = values[i:i + seq_len]

            # mode() ignores NaN; an all-NaN window has no mode, so report NaN
            # instead of failing. With several modes the smallest one is used.
            modes = pd.Series(window[:, consistency_idx]).mode()
            mode_consistency = modes.iloc[0] if len(modes) else np.nan

            records.append({
                'PatientID': pid,
                'SequenceIndex': i,
                'AvgMaxTemperature': np.mean(window[:, temp_idx]),
                'AvgNeutrophilCount': np.mean(window[:, neutro_idx]),
                'ModeConsistencyLiquid': mode_consistency,
                # A window is labelled positive if any of its rows is positive
                'DysbiosisLabel': int(np.max(window[:, -1])),
            })

    return pd.DataFrame(records, columns=output_columns)

# Clinical markers to summarise for every test-set window
clinical_columns = ['MaxTemperature', 'NeutrophilCount', 'Consistency_liquid']

# Sliding-window summary of the test split, using the global sequence length
df_test_clinical = extract_clinical_markers(
    df=df_test,
    clinical_cols=clinical_columns,
    seq_len=SEQ_LEN,
)

n_test_sequences = len(df_test_clinical)
print(f"Extracted clinical features for {n_test_sequences} test sequences.",
      df_test_clinical.head(), sep="\n")

**Reasoning**:
The next step is to align the extracted clinical markers (`df_test_clinical`) with the anomaly scores (`test_mae`) and true labels (`y_test`). Then, I will combine them into a single DataFrame, calculate Pearson correlation coefficients, and generate scatter plots as requested by the subtask.



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Work on local copies of the scores and labels: `test_mae` and `y_test` are
# shared with other cells and must not be shortened as a side effect of this one.
aligned_mae = np.asarray(test_mae).ravel()
aligned_labels = np.asarray(y_test).ravel()

# Ensure lengths match
if len(df_test_clinical) == len(aligned_mae) == len(aligned_labels):
    print("Lengths of df_test_clinical, test_mae, and y_test match.")
else:
    print("Lengths do not match. Re-check sequence generation logic.")
    # Best-effort fallback: truncate to the shortest length.
    # NOTE(review): after truncation the rows are only aligned if all three
    # were generated in the same order - treat the results below with caution.
    min_len = min(len(df_test_clinical), len(aligned_mae), len(aligned_labels))
    # .copy() so the column assignments below do not write into a slice
    df_test_clinical = df_test_clinical.head(min_len).copy()
    aligned_mae = aligned_mae[:min_len]
    aligned_labels = aligned_labels[:min_len]
    print(f"Adjusted all to minimum length: {min_len}")

# Add anomaly scores and true labels to the clinical features DataFrame
df_test_clinical['AnomalyScore_MAE'] = aligned_mae
df_test_clinical['TrueLabel_Dysbiosis'] = aligned_labels

# --- Calculate Pearson Correlation Coefficients ---
print("\nPearson Correlation Coefficients with AnomalyScore_MAE:")
correlation_temp = df_test_clinical['AnomalyScore_MAE'].corr(df_test_clinical['AvgMaxTemperature'])
correlation_neutro = df_test_clinical['AnomalyScore_MAE'].corr(df_test_clinical['AvgNeutrophilCount'])

print(f"  AnomalyScore_MAE vs AvgMaxTemperature: {correlation_temp:.4f}")
print(f"  AnomalyScore_MAE vs AvgNeutrophilCount: {correlation_neutro:.4f}")

# --- Generate Scatter Plots ---
plt.figure(figsize=(15, 6))
sns.set_style("whitegrid")

# One panel per clinical marker: (column, axis label, legend position)
scatter_specs = [
    ('AvgMaxTemperature', 'Average Max Temperature', 'upper left'),
    ('AvgNeutrophilCount', 'Average Neutrophil Count', 'upper right'),
]
for panel_number, (marker_column, marker_label, legend_loc) in enumerate(scatter_specs, start=1):
    plt.subplot(1, 2, panel_number)
    sns.scatterplot(x=marker_column, y='AnomalyScore_MAE', hue='TrueLabel_Dysbiosis',
                    data=df_test_clinical, alpha=0.6, palette='viridis')
    plt.title(f'Anomaly Score (MAE) vs. {marker_label}', fontsize=14)
    plt.xlabel(marker_label, fontsize=12)
    plt.ylabel('Anomaly Score (MAE)', fontsize=12)
    plt.legend(title='True Dysbiosis', loc=legend_loc)

plt.tight_layout()
plt.savefig('Pearson Correlation Coefficients with AnomalyScore_MAE.pdf', dpi=600)
plt.show()

print("\nCombined DataFrame head:")
print(df_test_clinical.head())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os

# 1. Pearson correlation between the anomaly score and the liquid-consistency marker
anomaly_series = df_test_clinical['AnomalyScore_MAE']
correlation_consistency = anomaly_series.corr(df_test_clinical['ModeConsistencyLiquid'])

# 2. Report the coefficient
print(f"AnomalyScore_MAE vs ModeConsistencyLiquid: {correlation_consistency:.4f}")

# 3. Pairwise correlation matrix over the anomaly score and the three clinical markers
correlation_cols = [
    'AnomalyScore_MAE',
    'AvgMaxTemperature',
    'AvgNeutrophilCount',
    'ModeConsistencyLiquid'
]
correlation_matrix = df_test_clinical.loc[:, correlation_cols].corr()

print("\nCorrelation Matrix:")
print(correlation_matrix)

# 4. Heatmap of the matrix
sns.set_theme(style="white")  # clean style
plt.figure(figsize=(9, 7))

heatmap_options = dict(
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.7,
    linecolor="white",
    center=0,
    square=True,
    annot_kws={"size": 12, "weight": "bold"},
)
sns.heatmap(correlation_matrix, **heatmap_options)

# Rotate the x tick labels so the long column names stay readable
plt.xticks(rotation=45, ha="right", fontsize=12)
plt.yticks(rotation=0, fontsize=12)

plt.title(
    'Pearson Correlation Matrix of Anomaly Score and Clinical Markers',
    fontsize=16, fontweight='bold', pad=20
)

plt.tight_layout()

# Write the figure to disk, then close it; this cell never calls plt.show()
output_file = "Pearson_Correlation_Heatmap.pdf"
plt.savefig(output_file, dpi=600, bbox_inches='tight')
plt.close()

print(f"Figure saved successfully at: {os.path.abspath(output_file)}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Set theme for aesthetics
sns.set_theme(style="white")

plt.figure(figsize=(9, 7))

# Draw the heatmap exactly once. The earlier version called sns.heatmap twice on the
# same axes (once with the matrix, once with its transpose), which overlaid the
# annotations and added a second colorbar to the figure.
# The transposed form is kept to match the "flipped" output name; correlation_matrix
# is produced by DataFrame.corr() and is therefore symmetric, so the transpose shows
# the same values.
sns.heatmap(
    correlation_matrix.T,   # transpose flips x/y
    annot=True,
    cmap="mako",
    fmt=".2f",
    linewidths=0.7,
    linecolor="white",
    center=0,
    square=True,
    annot_kws={"size":12, "weight":"bold"}
)

# Make x-axis labels readable
plt.xticks(rotation=45, ha="right", fontsize=12)
plt.yticks(rotation=0, fontsize=12)

# Title
plt.title('Pearson Correlation Matrix\nAnomaly Score vs Clinical Markers',
          fontsize=16, fontweight='bold', pad=20)

plt.tight_layout()
sns.despine()
plt.savefig('Pearson_Correlation_Heatmap_Flipped.pdf', dpi=600)
plt.show()

In [None]:
# Derive squared and pairwise-product features from the three clinical markers.
# The base columns are read once; only new columns are written back to the frame.
avg_temperature = df_test_clinical['AvgMaxTemperature']
avg_neutrophils = df_test_clinical['AvgNeutrophilCount']
liquid_consistency = df_test_clinical['ModeConsistencyLiquid']

# Squared terms
df_test_clinical['AvgMaxTemperature_sq'] = avg_temperature ** 2
df_test_clinical['AvgNeutrophilCount_sq'] = avg_neutrophils ** 2

# Pairwise interaction terms
df_test_clinical['Temp_x_Neutro'] = avg_temperature * avg_neutrophils
df_test_clinical['Temp_x_Consistency'] = avg_temperature * liquid_consistency
df_test_clinical['Neutro_x_Consistency'] = avg_neutrophils * liquid_consistency

print("Updated df_test_clinical with polynomial and interaction features:")
print(df_test_clinical.head())

In [None]:
# Engineered columns whose correlation with the anomaly score is reported below
new_features = [
    'AvgMaxTemperature_sq',
    'AvgNeutrophilCount_sq',
    'Temp_x_Neutro',
    'Temp_x_Consistency',
    'Neutro_x_Consistency'
]

print("\nPearson Correlation Coefficients with AnomalyScore_MAE (New Features):")
anomaly_series = df_test_clinical['AnomalyScore_MAE']
for feature_name in new_features:
    # Series.corr defaults to the Pearson coefficient
    pearson_r = anomaly_series.corr(df_test_clinical[feature_name])
    print(f"  AnomalyScore_MAE vs {feature_name}: {pearson_r:.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# The two interaction features plotted here were picked for having the highest absolute
# correlation in the previous step. NOTE: the coefficients below were copied from an
# earlier run and are not recomputed in this cell.
#   AnomalyScore_MAE vs Temp_x_Consistency: 0.2125
#   AnomalyScore_MAE vs Temp_x_Neutro: -0.1961

# The figure is created before the seaborn style is switched, and the axes after,
# so the order of these two statements is kept deliberately.
interaction_fig = plt.figure(figsize=(15, 6))
sns.set_style("whitegrid")

# (x column, panel title, x-axis label)
interaction_panels = [
    ('Temp_x_Consistency',
     'Anomaly Score (MAE) vs. Temp x Consistency (Interaction)',
     'Average Max Temperature x Mode Consistency Liquid'),
    ('Temp_x_Neutro',
     'Anomaly Score (MAE) vs. Temp x Neutrophil (Interaction)',
     'Average Max Temperature x Average Neutrophil Count'),
]

for panel_no, (x_column, panel_title, x_label) in enumerate(interaction_panels, start=1):
    panel_ax = interaction_fig.add_subplot(1, 2, panel_no)
    sns.scatterplot(
        x=x_column,
        y='AnomalyScore_MAE',
        hue='TrueLabel_Dysbiosis',
        data=df_test_clinical,
        alpha=0.6,
        palette='viridis',
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_ylabel('Anomaly Score (MAE)', fontsize=12)
    panel_ax.legend(title='True Dysbiosis', loc='upper left')

plt.tight_layout()
plt.savefig('AnomalyScore_ClinicalInteractions_Scatter.pdf', dpi=600)
plt.show()

In [None]:
# @title 19. Generate Publication-Ready ROC & PR Curves (Enhanced)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# --- PLOT SETUP ---
# Serif typography and enlarged text sizes for the figure. plt.rcParams is matplotlib's
# global configuration, so these values stay in effect after this cell.
publication_rc = {
    'font.family': 'serif',
    'font.size': 14,
    'axes.labelsize': 16,
    'axes.titlesize': 18,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 12,
    'lines.linewidth': 2,
}
plt.rcParams.update(publication_rc)

# One row, two panels: ax1 receives the ROC curves, ax2 the PR curves
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 8), dpi=600)

# --- STYLE CONFIGURATION ---
# One row per model: (name, color, line style, line width, z-order).
# Heavier lines and higher z-order put the proposed model and its main competitor on top;
# baselines and individual classifiers are drawn thinner and underneath.
_style_fields = ('color', 'ls', 'lw', 'zorder')
_style_rows = [
    # PROPOSED MODEL -> bold, solid, red
    ('Stacked (LR)',        '#D62728', '-',  4.0, 100),
    # SOTA COMPETITOR -> bold, solid, blue
    ('CNN-LSTM (SKD)',      '#1F77B4', '-',  3.0, 90),
    # STRONG ENSEMBLES -> medium, dashed (black / green)
    ('Averaged Ensemble',   'black',   '--', 2.5, 80),
    ('Weighted Ensemble',   '#2CA02C', '--', 2.0, 70),
    # BASELINES -> dotted (gray / pink)
    ('LSTM-AE Baseline',    'gray',    ':',  2.5, 60),
    ('One-Class SVM',       '#E377C2', ':',  2.0, 50),
    # INDIVIDUAL CLASSIFIERS -> thin lines (gray, orange, brown, purple, olive, cyan)
    ('Logistic Regression', '#7F7F7F', '-',  1.0, 40),
    ('Random Forest',       '#FF7F0E', '-',  1.5, 45),
    ('XGBoost',             '#8C564B', '-',  1.0, 40),
    ('MLP',                 '#9467BD', '-',  1.0, 40),
    ('KNN',                 '#BCBD22', '-',  1.0, 40),
    ('Stacked (XGB)',       '#17BECF', ':',  1.5, 45),
]
model_styles = {row[0]: dict(zip(_style_fields, row[1:])) for row in _style_rows}

# Fallback style for any model without an entry above
default_style = dict(color='gray', ls='-', lw=1.0, zorder=10)

# Plot order, which is also the legend order (top performers first)
all_models_ordered = [
    'Stacked (LR)',          # Your Best
    'CNN-LSTM (SKD)',        # Benchmark
    'Averaged Ensemble',
    'Random Forest',         # Best Single
    'LSTM-AE Baseline',      # Unsupervised Baseline
    'One-Class SVM',
    'Logistic Regression',
    'XGBoost',
    'MLP',
    'KNN',
    'Weighted Ensemble',
    'Stacked (XGB)',
]

# Keep only models that have a fitted estimator or stored scores; the LSTM-AE
# baseline is always kept.
valid_models = [
    candidate
    for candidate in all_models_ordered
    if candidate in best_models or candidate in test_probs_dict or candidate == 'LSTM-AE Baseline'
]


# --- PLOT LOOP ---
# For every available model: resolve its score vector, compute ROC/PR metrics against
# y_test, and draw one line on each panel.
for name in valid_models:
    style = model_styles.get(name, default_style)

    # GET SCORES
    if name == 'LSTM-AE Baseline':
        # Reconstruction error is used directly as the anomaly score
        scores = test_mae
    elif name == 'One-Class SVM' and name in best_models:
        # valid_models admits a name that is present in EITHER best_models or
        # test_probs_dict, so the estimator may be absent here; without the
        # membership guard this branch raised KeyError instead of falling through
        # to the precomputed scores below.
        # The sign is flipped so that a larger score means "more anomalous" --
        # presumably the estimator's decision_function is larger for inliers
        # (TODO confirm against the fitted model).
        scores = -best_models[name].decision_function(X_test_feat)
    elif name in test_probs_dict:
        scores = test_probs_dict[name]
    elif name in best_models:
        scores = best_models[name].predict_proba(X_test_feat)[:, 1]
    else:
        continue # Skip if data missing

    # CALCULATE METRICS
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_val = roc_auc_score(y_test, scores)

    precision, recall, _ = precision_recall_curve(y_test, scores)
    pr_val = average_precision_score(y_test, scores)

    # PLOT ROC
    ax1.plot(fpr, tpr,
             label=f'{name} (AUC={roc_val:.3f})',
             color=style['color'],
             linestyle=style['ls'],
             linewidth=style['lw'],
             zorder=style['zorder'],
             alpha=0.85)

    # PLOT PR
    ax2.plot(recall, precision,
             label=f'{name} (AP={pr_val:.3f})',
             color=style['color'],
             linestyle=style['ls'],
             linewidth=style['lw'],
             zorder=style['zorder'],
             alpha=0.85)

# --- FORMATTING (ROC on ax1, PR on ax2) ---
# Unlabelled chance diagonal, ROC panel only
ax1.plot([0, 1], [0, 1], 'k--', alpha=0.3, linewidth=1.5)

# (axes, x label, y label, title, legend corner); both panels share limits, grid and legend styling
panel_layout = (
    (ax1,
     'False Positive Rate (1 - Specificity)',
     'True Positive Rate (Sensitivity)',
     'A. Receiver Operating Characteristic (ROC)',
     "lower right"),
    (ax2,
     'Recall (Sensitivity)',
     'Precision (PPV)',
     'B. Precision-Recall (PR) Curves',
     "lower left"),
)
for panel, x_text, y_text, heading, legend_corner in panel_layout:
    # Slight padding beyond [0, 1] so curves touching the border stay visible
    panel.set_xlim([-0.01, 1.01])
    panel.set_ylim([-0.01, 1.02])
    panel.set_xlabel(x_text, fontweight='bold')
    panel.set_ylabel(y_text, fontweight='bold')
    panel.set_title(heading, fontweight='bold', pad=15)
    panel.grid(True, alpha=0.2, linestyle='--')
    panel.legend(loc=legend_corner, frameon=True, framealpha=0.9, edgecolor='gray', fontsize=11)

# --- SAVE & SHOW ---
plt.tight_layout()
plt.savefig('Figure_Combined_ROCPR_Analysis_Enhanced.pdf', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# @title 20. Generate Targeted ROC & PR Comparisons (Supervised vs. Unsupervised)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score

# --- SHARED PLOTTING FUNCTION ---
def plot_rocp_comparison(model_list, title_suffix, filename_suffix, ax_roc, ax_pr):
    """
    Draw ROC and precision-recall curves for the named models on two given axes.

    Scores are looked up by model name in the module-level ``test_probs_dict`` and
    evaluated against the module-level ``y_test``; neither is passed in as an argument.
    A model with no stored scores is reported with a warning and skipped.

    Parameters
    ----------
    model_list : iterable of str
        Model names to plot, in drawing/legend order.
    title_suffix : str
        Text appended to the "ROC: " and "PR: " panel titles.
    filename_suffix : str
        Accepted but not used by this function.
    ax_roc, ax_pr : matplotlib axes
        Target axes for the ROC and PR panels respectively.

    Returns nothing; the axes are modified in place.
    """
    # Per-model line appearance; names missing from the table use `fallback_look`
    fallback_look = {'color': 'gray', 'ls': '-', 'lw': 1.0, 'zorder': 10}
    look_table = {
        # Proposed model (red)
        'Stacked (LR)':        {'color': '#D62728', 'ls': '-',  'lw': 3.5, 'zorder': 100},
        # Supervised baselines (shades of blue)
        'CNN-LSTM (SKD)':      {'color': '#1F77B4', 'ls': '-',  'lw': 2.5, 'zorder': 90},
        'SelectKBest + LSTM':  {'color': '#6BAED6', 'ls': '--', 'lw': 2.0, 'zorder': 80},
        'Sparse AE + LSTM':    {'color': '#9ECAE1', 'ls': ':',  'lw': 2.0, 'zorder': 70},
        # Unsupervised baselines (shades of gray)
        'Local Outlier Factor':{'color': '#7F7F7F', 'ls': '-',  'lw': 2.5, 'zorder': 90},
        'Isolation Forest':    {'color': '#C7C7C7', 'ls': '--', 'lw': 2.0, 'zorder': 80},
    }

    def _decorate(ax, x_text, y_text, heading, legend_corner):
        # Shared cosmetics: bold labels/title, legend, no top/right spines, faint grid
        ax.set_xlabel(x_text, fontweight='bold')
        ax.set_ylabel(y_text, fontweight='bold')
        ax.set_title(heading, fontweight='bold', pad=10)
        ax.legend(loc=legend_corner, frameon=True, framealpha=0.9, edgecolor='white')
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.grid(True, alpha=0.2, linestyle='--')

    for name in model_list:
        # All scores are expected to live in test_probs_dict
        scores = test_probs_dict.get(name)
        if scores is None:
            print(f"Warning: Scores for {name} not found.")
            continue

        look = look_table.get(name, fallback_look)
        line_kwargs = dict(
            color=look['color'], ls=look['ls'], lw=look['lw'],
            zorder=look['zorder'], alpha=0.9,
        )

        # All four metrics are computed before anything is drawn for this model
        fpr, tpr, _ = roc_curve(y_test, scores)
        auc_value = roc_auc_score(y_test, scores)
        precision, recall, _ = precision_recall_curve(y_test, scores)
        ap_value = average_precision_score(y_test, scores)

        ax_roc.plot(fpr, tpr, label=f'{name} (AUC={auc_value:.3f})', **line_kwargs)
        ax_pr.plot(recall, precision, label=f'{name} (AP={ap_value:.3f})', **line_kwargs)

    # ROC panel: dashed chance diagonal, then shared cosmetics
    ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Chance')
    _decorate(ax_roc, 'False Positive Rate', 'True Positive Rate',
              f'ROC: {title_suffix}', "lower right")

    # PR panel: dashed horizontal line at the positive-class fraction of y_test
    prevalence = np.sum(y_test) / len(y_test)
    ax_pr.plot([0, 1], [prevalence, prevalence], 'k--', alpha=0.3, label=f'Base ({prevalence:.2f})')
    _decorate(ax_pr, 'Recall', 'Precision', f'PR: {title_suffix}', "lower left")

# --- CONFIGURATION ---
# Only font family and base size are set here; other rcParams keep their current values
plt.rcParams.update({'font.family': 'serif', 'font.size': 12})

# ==========================================
# FIGURE 1: DYNABIOME vs. SUPERVISED BASELINES
# ==========================================
supervised_list = [
    'Stacked (LR)',          # Proposed
    'CNN-LSTM (SKD)',        # SOTA (named consistently)
    'SelectKBest + LSTM',
    'Sparse AE + LSTM',
]
fig1, (ax1a, ax1b) = plt.subplots(nrows=1, ncols=2, figsize=(16, 7), dpi=600)
plot_rocp_comparison(
    model_list=supervised_list,
    title_suffix="Dynabiome (Stacked LR) vs. Supervised Models",
    filename_suffix="Supervised",
    ax_roc=ax1a,
    ax_pr=ax1b,
)
fig1.tight_layout()
fig1.savefig('Figure_Comparison_Supervised.pdf', bbox_inches='tight')
plt.show()

# ==========================================
# FIGURE 2: DYNABIOME vs. UNSUPERVISED BASELINES
# ==========================================
unsupervised_list = [
    'Stacked (LR)',          # Proposed
    'Local Outlier Factor',  # Best Unsupervised
    'Isolation Forest',
]
fig2, (ax2a, ax2b) = plt.subplots(nrows=1, ncols=2, figsize=(16, 7), dpi=600)
plot_rocp_comparison(
    model_list=unsupervised_list,
    title_suffix="Dynabiome (Stacked LR) vs. Unsupervised Models",
    filename_suffix="Unsupervised",
    ax_roc=ax2a,
    ax_pr=ax2b,
)
fig2.tight_layout()
fig2.savefig('Figure_Comparison_Unsupervised.pdf', bbox_inches='tight')
plt.show()