In [None]:
# Install pinned versions of the explainability (lime, shap), audio I/O
# (librosa, soundfile), scikit-learn, matplotlib and torchsummary packages.
# NOTE(review): xgboost and tqdm are imported by the next cell but are not
# installed here - presumably they come preinstalled in the runtime; confirm.
# NOTE(review): torchsummary is not imported in any cell visible here.
!pip install -q lime==0.2.0.1 shap==0.46.0 \
               librosa==0.10.2.post1 soundfile==0.12.1 \
               scikit-learn==1.6.0 matplotlib==3.9.0 torchsummary==1.5.1


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m543.9/543.9 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.1/260.1 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m38.5 M

In [None]:
# Alzheimer's Detection (100-per-class train) + Explainability + AUC-ROC + Training Metrics
import os, random, warnings
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt

from tqdm import tqdm
from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lime.lime_text import LimeTextExplainer
import shap

warnings.filterwarnings("ignore")

# Make Google Drive visible inside the runtime before touching any data path
drive.mount('/content/drive')

# Dataset root with one sub-folder per class label
data_dir = "/content/drive/MyDrive/Alzheimers_Organized"
control_path, dementia_path = (
    os.path.join(data_dir, class_folder)
    for class_folder in ("control", "dementia")
)


# Audio Feature Extraction Utilities
def extract_audio_features(audio_path, sr=22050, n_mfcc=20):
    """Summarise one audio file as a fixed-length vector of time-averaged features.

    The vector is the concatenation of, in order: mean MFCCs, mean chroma,
    mean spectral contrast, and five scalar means (spectral centroid,
    bandwidth, roll-off, RMS energy, zero-crossing rate).

    Args:
        audio_path: Path of the audio file to load.
        sr: Sample rate passed to ``librosa.load`` (the audio is resampled).
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        1-D float array of length ``n_mfcc + 24`` (44 with the defaults).
        If loading or feature extraction fails for any reason, an all-zero
        vector of the same length is returned so that batch extraction can
        continue; the failure is reported on stderr instead of being hidden.
    """
    # Rows produced with librosa's default settings: 12 chroma bins,
    # 7 spectral-contrast rows, plus the 5 scalar descriptors below.
    n_features = n_mfcc + 12 + 7 + 5
    try:
        y, sr = librosa.load(audio_path, sr=sr)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).T, axis=0)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
        contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
        centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
        rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        rms = np.mean(librosa.feature.rms(y=y))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=y))
        return np.hstack([mfccs, chroma, contrast, [centroid, bandwidth, rolloff, rms, zcr]])
    except Exception as exc:
        # Deliberate best-effort: keep the pipeline running, but make the
        # failure visible. print() is used because the notebook silences the
        # warnings module globally.
        import sys
        print(
            f"[extract_audio_features] failed for {audio_path!r}: {exc!r}; "
            "using an all-zero feature vector",
            file=sys.stderr,
        )
        return np.zeros(n_features, dtype=float)


def collect_wavs_and_texts(root_dir):
    """Collect ``(wav_path, transcript)`` pairs from the sub-folders of a directory.

    Every immediate sub-directory of ``root_dir`` is scanned for files whose
    name ends in ``.wav`` (case-insensitive). For each one, a sibling file
    with the same stem and a ``.txt`` extension is read as its transcript.

    Args:
        root_dir: Directory whose immediate sub-directories hold the recordings.

    Returns:
        List of ``(wav_path, text)`` tuples. ``text`` is the lower-cased
        transcript, or ``""`` when the transcript is missing or unreadable.
        Entries are in sorted order (by sub-directory, then file name) so that
        a seeded shuffle of the result is reproducible across runs and
        filesystems; ``os.listdir`` alone returns entries in arbitrary order.
    """
    pairs = []
    for subj in sorted(os.listdir(root_dir)):
        subj_dir = os.path.join(root_dir, subj)
        if not os.path.isdir(subj_dir):
            continue
        for fname in sorted(os.listdir(subj_dir)):
            if not fname.lower().endswith(".wav"):
                continue
            wav_path = os.path.join(subj_dir, fname)
            # Swap the 4-character ".wav" suffix for ".txt".
            txt_path = wav_path[:-4] + ".txt"
            try:
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    text = f.read().lower()
            except OSError:
                # Missing or unreadable transcript: keep the audio, use empty text.
                text = ""
            pairs.append((wav_path, text))
    return pairs


def build_features(pairs):
    """Extract audio features for every ``(wav_path, text)`` pair.

    Args:
        pairs: Iterable of ``(wav_path, text)`` tuples.

    Returns:
        Tuple ``(features, texts)``: ``features`` is a 2-D array with one row
        per pair (as produced by ``extract_audio_features``) and ``texts`` is
        the list of transcripts in the same order.
    """
    feats, texts = [], []
    for wav_path, text in tqdm(pairs, desc="Extracting audio features"):
        feats.append(extract_audio_features(wav_path))
        texts.append(text)
    if not feats:
        # np.array([]) is 1-D with shape (0,), which cannot be vstack-ed with
        # the (n, 44) matrix of the other class. Return an empty matrix with
        # the 44 columns extract_audio_features yields by default instead.
        return np.empty((0, 44), dtype=float), texts
    return np.array(feats), texts


def save_spectrogram(audio_path, save_path, title):
    """Render the mel spectrogram of an audio file and save it as an image.

    Args:
        audio_path: Audio file to load (at its native sample rate, ``sr=None``).
        save_path: Destination path of the image written by ``plt.savefig``.
        title: Title drawn above the plot.
    """
    y, sr = librosa.load(audio_path, sr=None)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    # Convert power to decibels relative to the loudest bin.
    S_db = librosa.power_to_db(S, ref=np.max)
    fig = plt.figure(figsize=(8, 3.5))
    try:
        librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format="%+2.0f dB")
        plt.title(title)
        plt.tight_layout()
        plt.savefig(save_path, dpi=150)
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)


# Dataset Preparation
# Seed Python's RNG so the shuffles below draw the same sequence every run.
random.seed(42)

control_pairs = collect_wavs_and_texts(control_path)
dementia_pairs = collect_wavs_and_texts(dementia_path)

print(f"Control files:  {len(control_pairs)}")
print(f"Dementia files: {len(dementia_pairs)}")

random.shuffle(control_pairs)
random.shuffle(dementia_pairs)

# First K shuffled recordings per class are used for training, the rest are
# held out. Slicing clamps at the list length, so a class with fewer than K
# files simply ends up with an empty held-out list.
K = 100
control_train, control_unseen = control_pairs[:K], control_pairs[K:]
dementia_train, dementia_unseen = dementia_pairs[:K], dementia_pairs[K:]

print(f"Control: train={len(control_train)}, unseen={len(control_unseen)}")
print(f"Dementia: train={len(dementia_train)}, unseen={len(dementia_unseen)}")

# Audio features + transcripts for each (class, split) subset
X_audio_train_control, texts_train_control = build_features(control_train)
X_audio_train_dementia, texts_train_dementia = build_features(dementia_train)
X_audio_unseen_control, texts_unseen_control = build_features(control_unseen)
X_audio_unseen_dementia, texts_unseen_dementia = build_features(dementia_unseen)

# Stack control rows first, then dementia rows; labels are 0 = control, 1 = dementia
X_audio_train = np.vstack([X_audio_train_control, X_audio_train_dementia])
y_train = np.repeat([0, 1], [len(X_audio_train_control), len(X_audio_train_dementia)])
texts_train = texts_train_control + texts_train_dementia

X_audio_unseen = np.vstack([X_audio_unseen_control, X_audio_unseen_dementia])
y_unseen = np.repeat([0, 1], [len(X_audio_unseen_control), len(X_audio_unseen_dementia)])
texts_unseen = texts_unseen_control + texts_unseen_dementia

print(f"Train: {X_audio_train.shape}, Unseen: {X_audio_unseen.shape}")

# TF-IDF text features: vocabulary learned on the training transcripts only
# NOTE(review): the vectorizer and the scaler below are fit on all training
# rows, i.e. before the train/validation split performed in the model loop.
vectorizer = TfidfVectorizer(max_features=300)
X_text_train = vectorizer.fit_transform(texts_train).toarray()
X_text_unseen = vectorizer.transform(texts_unseen).toarray()

# Audio columns followed by text columns, then standardised with statistics
# estimated on the training matrix
scaler = StandardScaler()
X_train = scaler.fit_transform(np.hstack([X_audio_train, X_text_train]))
X_unseen = scaler.transform(np.hstack([X_audio_unseen, X_text_unseen]))

# Model Training & Evaluation (No LightGBM)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # random_state seeds the internal shuffling used for SVC's probability
    # estimates, so predict_proba (and therefore the AUC) is reproducible.
    "SVM (RBF)": SVC(kernel='rbf', probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Output folder for every figure and report written by this notebook
explain_dir = "/content/drive/MyDrive/Alzheimers_Explainability"
os.makedirs(explain_dir, exist_ok=True)

# Per-model training/validation accuracy, consumed by the bar chart below
history = {}

# The split is seeded and identical for every model, so compute it once
# instead of once per loop iteration.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# ROC/AUC are only defined when the held-out set contains both classes.
has_both_classes = len(np.unique(y_unseen)) > 1

for name, model in models.items():
    print("\nModel:", name)
    model.fit(X_tr, y_tr)

    train_preds = model.predict(X_tr)
    val_preds = model.predict(X_val)
    test_preds = model.predict(X_unseen)
    # Probability of the positive class (column 1); all-zero scores are the
    # fallback for estimators that expose no predict_proba.
    test_probs = model.predict_proba(X_unseen)[:,1] if hasattr(model, "predict_proba") else np.zeros(len(test_preds))

    train_acc = accuracy_score(y_tr, train_preds)
    val_acc = accuracy_score(y_val, val_preds)
    test_acc = accuracy_score(y_unseen, test_preds)
    auc = roc_auc_score(y_unseen, test_probs) if has_both_classes else np.nan

    print(f"Training Accuracy: {train_acc:.3f}")
    print(f"Validation Accuracy: {val_acc:.3f}")
    print(f"Testing Accuracy: {test_acc:.3f}")
    print(f"AUC-ROC: {auc:.3f}")

    if has_both_classes:
        fpr, tpr, _ = roc_curve(y_unseen, test_probs)
        plt.figure()
        plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"AUC-ROC Curve: {name}")
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(explain_dir, f"auc_{name.replace(' ', '_')}.png"))
        plt.close()

    history[name] = {"train_acc": train_acc, "val_acc": val_acc}

# Accuracy Visualization
# Grouped bars: one (training, validation) pair per model, in insertion order
model_labels = list(history)
train_scores = [history[label]["train_acc"] for label in model_labels]
val_scores = [history[label]["val_acc"] for label in model_labels]
bar_width = 0.4
x_pos = np.arange(len(model_labels))

plt.figure(figsize=(8, 4))
plt.bar(x_pos, train_scores, width=bar_width, label="Training Accuracy")
plt.bar(x_pos + bar_width, val_scores, width=bar_width, label="Validation Accuracy")
# Centre each tick between the two bars of its group
plt.xticks(x_pos + bar_width / 2, model_labels, rotation=45, ha="right")
plt.ylabel("Accuracy")
plt.title("Training vs Validation Accuracy")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(explain_dir, "train_val_accuracy.png"))
plt.close()

# Explainability Section (LIME + SHAP + Spectrogram)
# One example recording per class, preferring a held-out file when available
ctrl_example = control_unseen[0][0] if control_unseen else control_train[0][0]
dem_example = dementia_unseen[0][0] if dementia_unseen else dementia_train[0][0]

if ctrl_example:
    save_spectrogram(ctrl_example, os.path.join(explain_dir, "spectrogram_control.png"), "Mel Spectrogram (Control)")
if dem_example:
    save_spectrogram(dem_example, os.path.join(explain_dir, "spectrogram_dementia.png"), "Mel Spectrogram (Dementia)")

# LIME for Text
# A separate text-only pipeline (TF-IDF + logistic regression) is explained
# here; it is independent of the combined audio+text models trained above.
text_pipeline = make_pipeline(TfidfVectorizer(max_features=300), LogisticRegression(max_iter=1000))
text_pipeline.fit(texts_train, y_train)

lime_explainer = LimeTextExplainer(class_names=["Control", "Dementia"])
# Explain the first transcript that actually contains text (held-out first,
# then training); a recording without a transcript yields "" and has no
# words to attribute. Falls back to the first available entry otherwise.
sample_text = next(
    (t for t in list(texts_unseen) + list(texts_train) if t.strip()),
    texts_unseen[0] if len(texts_unseen) > 0 else texts_train[0],
)
lime_exp = lime_explainer.explain_instance(sample_text, text_pipeline.predict_proba, num_features=10)
lime_exp.save_to_file(os.path.join(explain_dir, "lime_text_explanation.html"))

# SHAP for Text
tfidf = text_pipeline.named_steps["tfidfvectorizer"]
logreg = text_pipeline.named_steps["logisticregression"]
# texts_train is ordered control-first, so its first 100 entries are all
# control transcripts. Draw a seeded random sample of the same size instead
# so the SHAP background reflects both classes.
bg_rng = np.random.RandomState(42)
bg_idx = bg_rng.choice(len(texts_train), size=min(100, len(texts_train)), replace=False)
X_bg = tfidf.transform([texts_train[i] for i in bg_idx])
explainer = shap.LinearExplainer(logreg, X_bg, feature_perturbation="interventional")

X_sample = tfidf.transform([sample_text])
shap_values = explainer.shap_values(X_sample)
# Some SHAP versions return one array per class; keep the positive class.
vals = shap_values[1] if isinstance(shap_values, list) else shap_values
vals = np.array(vals).ravel()

# 15 features with the largest absolute contribution, most important first
feature_names = tfidf.get_feature_names_out()
idx = np.argsort(np.abs(vals))[-15:][::-1]
top_features = [feature_names[i] for i in idx]
top_values = vals[idx]

# Descending y positions put the most important word at the top of the chart
# while keeping every bar aligned with its own label.
y_pos = np.arange(len(top_features))[::-1]

plt.figure(figsize=(8, 4))
plt.barh(y_pos, top_values)
plt.yticks(y_pos, top_features)
plt.xlabel("SHAP value (word contribution)")
plt.title("Top word contributions (SHAP: LogisticRegression + TF-IDF)")
plt.tight_layout()
plt.savefig(os.path.join(explain_dir, "shap_text_bar.png"), dpi=150)
plt.close()

print("Explainability visualizations and evaluation results saved in Google Drive.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Control files:  237
Dementia files: 131
Control: train=100, unseen=137
Dementia: train=100, unseen=31


Extracting audio features: 100%|██████████| 100/100 [02:01<00:00,  1.21s/it]
Extracting audio features: 100%|██████████| 100/100 [01:52<00:00,  1.13s/it]
Extracting audio features: 100%|██████████| 137/137 [02:32<00:00,  1.11s/it]
Extracting audio features: 100%|██████████| 31/31 [00:46<00:00,  1.49s/it]


Train: (200, 44), Unseen: (168, 44)

Model: Logistic Regression
Training Accuracy: 1.000
Validation Accuracy: 0.450
Testing Accuracy: 0.506
AUC-ROC: 0.518

Model: SVM (RBF)
Training Accuracy: 0.994
Validation Accuracy: 0.475
Testing Accuracy: 0.625
AUC-ROC: 0.655

Model: Random Forest
Training Accuracy: 1.000
Validation Accuracy: 0.525
Testing Accuracy: 0.601
AUC-ROC: 0.700

Model: KNN
Training Accuracy: 0.556
Validation Accuracy: 0.500
Testing Accuracy: 0.202
AUC-ROC: 0.444

Model: Gradient Boosting
Training Accuracy: 1.000
Validation Accuracy: 0.525
Testing Accuracy: 0.518
AUC-ROC: 0.578

Model: XGBoost
Training Accuracy: 1.000
Validation Accuracy: 0.575
Testing Accuracy: 0.673
AUC-ROC: 0.670
Explainability visualizations and evaluation results saved in Google Drive.


In [None]:
from sklearn.metrics import log_loss

# Calculate training and validation losses for all trained models
train_losses = []
val_losses = []
model_names = list(models.keys())

# Recreate the exact split the models were fitted on (same arguments and
# seed as the training cell). It is identical for every model, so it is
# computed once rather than inside the loop.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

for name, model in models.items():
    # Check if model supports probability output
    if hasattr(model, "predict_proba"):
        train_probs = model.predict_proba(X_tr)
        val_probs = model.predict_proba(X_val)
        # predict_proba columns follow model.classes_; passing them keeps the
        # loss well-defined even if a split happens to contain one class only.
        tr_loss = log_loss(y_tr, train_probs, labels=model.classes_)
        va_loss = log_loss(y_val, val_probs, labels=model.classes_)
    else:
        # Fallback for estimators without predict_proba: use the
        # misclassification rate as a proxy (not comparable to cross-entropy).
        tr_preds = model.predict(X_tr)
        va_preds = model.predict(X_val)
        tr_loss = np.mean(tr_preds != y_tr)
        va_loss = np.mean(va_preds != y_val)

    train_losses.append(tr_loss)
    val_losses.append(va_loss)
    print(f"{name}: Training Loss = {tr_loss:.4f}, Validation Loss = {va_loss:.4f}")

# Plot the Training vs Validation Loss
plt.figure(figsize=(8, 5))
plt.plot(model_names, train_losses, marker='o', label='Training Loss')
plt.plot(model_names, val_losses, marker='s', label='Validation Loss')
plt.xlabel("Models")
plt.ylabel("Loss (Cross-Entropy / Proxy)")
plt.title("Training vs Validation Loss Comparison")
plt.legend()
plt.grid(True, linestyle='--', linewidth=0.5)
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig(os.path.join(explain_dir, "train_val_loss.png"), dpi=150)
plt.close()

print("Training and validation loss plot saved as 'train_val_loss.png' in the Explainability folder.")


Logistic Regression: Training Loss = 0.0214, Validation Loss = 1.8780
SVM (RBF): Training Loss = 0.2753, Validation Loss = 0.6829
Random Forest: Training Loss = 0.2018, Validation Loss = 0.6886
KNN: Training Loss = 0.5820, Validation Loss = 0.9006
Gradient Boosting: Training Loss = 0.0744, Validation Loss = 0.7839
XGBoost: Training Loss = 0.0201, Validation Loss = 0.9382
Training and validation loss plot saved as 'train_val_loss.png' in the Explainability folder.
