<a href="https://colab.research.google.com/github/DMadhumita2904/Multimodal-Deep-Learning-System-for-Early-Alzheimer-s-Detection/blob/main/Alzheimers_Detection_Using_Speech_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [117]:
import pandas as pd

df = pd.read_csv("/content/addetector_dataset (1).csv")

print("Dataset shape:", df.shape)
print(df.head())
print(df.columns)

Dataset shape: (1010, 66)
   duration_sec  chunk_count    mfcc_1    mfcc_2    mfcc_3    mfcc_4  \
0     10.043262         35.0  0.170832  1.260207  0.293262 -1.183015   
1     22.114289         19.0  0.454972  0.455061  0.760773  1.664440   
2      7.196703         13.0  0.217214 -0.346633 -1.924812 -0.078618   
3      8.470619         27.0  0.955154  2.110354 -1.168544  1.280734   
4      5.067772         23.0  0.079000  1.079544  1.425937  1.308224   

     mfcc_5    mfcc_6    mfcc_7    mfcc_8  ...  linguistic_feat_42  \
0 -0.521902  0.564814  0.710332  0.855109  ...           -0.071778   
1  1.561906 -0.594606 -2.097895 -0.678529  ...           -0.328909   
2  3.254647 -1.244458 -1.814873  1.936389  ...            0.196236   
3  0.381839  0.163061 -0.007729 -1.384647  ...           -0.346869   
4 -0.528279  1.014105  0.211641  1.725908  ...           -0.010732   

   linguistic_feat_43  linguistic_feat_44  linguistic_feat_45  \
0           -0.404672            0.450733            0.

In [118]:
target_col = "label"

df[target_col] = df[target_col].astype(str)
df[target_col] = df[target_col].replace({
    "Control": 0,
    "Healthy": 0,
    "ProbableAD": 1,
    "PossibleAD": 1,
    "Dementia": 1
})

X = df.select_dtypes(include=["number"]).drop(columns=[target_col], errors="ignore")
y = df[target_col]

print("Features:", X.shape)
print("Classes:", y.value_counts())

Features: (1010, 65)
Classes: label
0    560
1    450
Name: count, dtype: int64


In [119]:
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[target_col])
df[target_col] = df[target_col].astype(int)

In [120]:
non_features = [
    "File","Language","Corpus","Code","Group",
    "Sex","Race","Role","Custom_field"
]

df = df.drop(columns=non_features, errors="ignore")

In [121]:
from sklearn.impute import SimpleImputer

X = df.select_dtypes(include=["number"]).drop(columns=[target_col])
y = df[target_col]

imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

In [122]:
print(y.value_counts())

label
0    560
1    450
Name: count, dtype: int64


In [123]:
from imblearn.over_sampling import SMOTE
X, y = SMOTE(random_state=42).fit_resample(X, y)

In [124]:
print(y.value_counts())

label
1    560
0    560
Name: count, dtype: int64


**MACHINE LEARNING ALGORITHMS**

In [125]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=30)
X = selector.fit_transform(X, y)

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [127]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [128]:
from sklearn.svm import SVC

svm = SVC(kernel="rbf", probability=True)
svm.fit(X_train, y_train)

In [129]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

In [130]:
from sklearn.metrics import accuracy_score, classification_report

models = {
    "Logistic Regression": lr,
    "SVM": svm,
    "Random Forest": rf
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print("\n==============================")
    print(name)
    print("Accuracy:", round(acc, 4))
    print(classification_report(y_test, y_pred))


Logistic Regression
Accuracy: 0.5804
              precision    recall  f1-score   support

           0       0.59      0.51      0.55       112
           1       0.57      0.65      0.61       112

    accuracy                           0.58       224
   macro avg       0.58      0.58      0.58       224
weighted avg       0.58      0.58      0.58       224


SVM
Accuracy: 0.6027
              precision    recall  f1-score   support

           0       0.60      0.62      0.61       112
           1       0.61      0.58      0.59       112

    accuracy                           0.60       224
   macro avg       0.60      0.60      0.60       224
weighted avg       0.60      0.60      0.60       224


Random Forest
Accuracy: 0.5938
              precision    recall  f1-score   support

           0       0.59      0.61      0.60       112
           1       0.60      0.58      0.59       112

    accuracy                           0.59       224
   macro avg       0.59      0.59   

In [131]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.03,
    max_depth=3,
    subsample=0.9,
    random_state=42
)

gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Gradient Boosting Accuracy: 0.5357142857142857
              precision    recall  f1-score   support

           0       0.54      0.46      0.50       112
           1       0.53      0.62      0.57       112

    accuracy                           0.54       224
   macro avg       0.54      0.54      0.53       224
weighted avg       0.54      0.54      0.53       224



**DATA AUGUMENTATION**

In [132]:
import numpy as np

noise = np.random.normal(0, 0.01, X.shape)
X_aug = np.vstack([X, X + noise])
y_aug = np.hstack([y, y])

print("Augmented size:", X_aug.shape)

Augmented size: (2240, 30)


In [133]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_aug, y_aug,
    test_size=0.2,
    random_state=42,
    stratify=y_aug
)

In [134]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression(max_iter=2000)
svm = SVC(kernel="rbf", probability=True)
rf = RandomForestClassifier(n_estimators=300, random_state=42)

lr.fit(X_train, y_train)
svm.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [136]:
from sklearn.metrics import accuracy_score, classification_report

models = {
    "Logistic Regression": lr,
    "SVM": svm,
    "Random Forest": rf
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print("\n====================")
    print(name)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


Logistic Regression
Accuracy: 0.5915178571428571
              precision    recall  f1-score   support

           0       0.59      0.58      0.59       224
           1       0.59      0.61      0.60       224

    accuracy                           0.59       448
   macro avg       0.59      0.59      0.59       448
weighted avg       0.59      0.59      0.59       448


SVM
Accuracy: 0.8125
              precision    recall  f1-score   support

           0       0.80      0.83      0.82       224
           1       0.82      0.79      0.81       224

    accuracy                           0.81       448
   macro avg       0.81      0.81      0.81       448
weighted avg       0.81      0.81      0.81       448


Random Forest
Accuracy: 0.9084821428571429
              precision    recall  f1-score   support

           0       0.94      0.87      0.90       224
           1       0.88      0.95      0.91       224

    accuracy                           0.91       448
   macro avg

**DEEP LEARNING ALGORITHMS**


In [202]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_aug, y_aug,
    test_size=0.2,
    random_state=42,
    stratify=y_aug
)

In [203]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {0: weights[0], 1: weights[1]}

In [194]:
# import numpy as np
# import tensorflow as tf
# import random
# import os

# # 1. Set a fixed seed for Python random
# random.seed(42)

# # 2. Set a fixed seed for NumPy
# np.random.seed(42)

# # 3. Set a fixed seed for TensorFlow/Keras
# tf.random.set_seed(42)



In [204]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [205]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

In [206]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(64, activation='relu'),

    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [207]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

In [208]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=150,
    batch_size=16,
    callbacks=[early_stop],
    class_weight=class_weights,
    verbose=1
)

Epoch 1/150
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.5094 - auc_9: 0.5234 - loss: 0.8147 - val_accuracy: 0.6156 - val_auc_9: 0.7028 - val_loss: 0.6563
Epoch 2/150
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8547 - auc_9: 0.9273 - loss: 0.4313 - val_accuracy: 0.7298 - val_auc_9: 0.7880 - val_loss: 0.5934
Epoch 3/150
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9326 - auc_9: 0.9840 - loss: 0.3094 - val_accuracy: 0.7716 - val_auc_9: 0.8390 - val_loss: 0.5227
Epoch 4/150
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9736 - auc_9: 0.9967 - loss: 0.2135 - val_accuracy: 0.7967 - val_auc_9: 0.8674 - val_loss: 0.4674
Epoch 5/150
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9922 - auc_9: 0.9999 - loss: 0.1370 - val_accuracy: 0.8022 - val_auc_9: 0.8829 - val_loss: 0.4369
Epoch 6/150
[1

In [209]:
loss, acc, auc = model.evaluate(X_test, y_test, verbose=0)
print("Deep Learning Accuracy:", round(acc * 100, 2), "%")

Deep Learning Accuracy: 76.34 %


In [210]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

print("\n======================")
print("Deep Learning Model")
print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Deep Learning Model
Accuracy: 76.34 %

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.69      0.75       224
           1       0.73      0.83      0.78       224

    accuracy                           0.76       448
   macro avg       0.77      0.76      0.76       448
weighted avg       0.77      0.76      0.76       448

Confusion Matrix:
[[155  69]
 [ 37 187]]


**DEEP LEARNING ALGORITHM AND ENSEMBLE METHODS**

In [233]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

selector_model = RandomForestClassifier(n_estimators=300)
selector_model.fit(X, y)

selector = SelectFromModel(selector_model, threshold="median")
X_selected = selector.fit_transform(X, y)

In [234]:
import numpy as np
import tensorflow as tf
import random
import os

# 1. Set a fixed seed for Python random
random.seed(42)

# 2. Set a fixed seed for NumPy
np.random.seed(42)

# 3. Set a fixed seed for TensorFlow/Keras
tf.random.set_seed(42)

# 4. Optional: Force deterministic algorithms (may be slower)
os.environ['TF_DETERMINISTIC_OPS'] = '1'


In [236]:
import tensorflow as tf
from tensorflow.keras import layers

model = tf.keras.Sequential([
    layers.Dense(256, activation="relu", input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(128, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation="relu"),
    layers.BatchNormalization(),
    layers.Dropout(0.2),

    layers.Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=5)
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.4968 - loss: 0.9740 - val_accuracy: 0.5469 - val_loss: 0.6864 - learning_rate: 5.0000e-04
Epoch 2/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5908 - loss: 0.7475 - val_accuracy: 0.5915 - val_loss: 0.6725 - learning_rate: 5.0000e-04
Epoch 3/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6396 - loss: 0.6727 - val_accuracy: 0.6518 - val_loss: 0.6495 - learning_rate: 5.0000e-04
Epoch 4/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6716 - loss: 0.6255 - val_accuracy: 0.6629 - val_loss: 0.6265 - learning_rate: 5.0000e-04
Epoch 5/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6586 - loss: 0.6239 - val_accuracy: 0.6674 - val_loss: 0.6067 - learning_rate: 5.0000e-04
Epoch 6/100
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━

In [237]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict probabilities
y_prob = model.predict(X_test).flatten()

# Convert to class labels (threshold = 0.5)
y_pred = (y_prob > 0.5).astype(int)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Deep Learning Accuracy:", round(acc * 100, 2), "%")

# Detailed report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  
Deep Learning Accuracy: 91.52 %

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       224
           1       0.90      0.93      0.92       224

    accuracy                           0.92       448
   macro avg       0.92      0.92      0.92       448
weighted avg       0.92      0.92      0.92       448


Confusion Matrix:
[[201  23]
 [ 15 209]]


In [238]:
# Deep predictions
dl_probs = model.predict(X_test).flatten()

# Retrain Random Forest with the current X_train (40 features)
rf_ensemble = RandomForestClassifier(n_estimators=300, random_state=42)
rf_ensemble.fit(X_train, y_train)

# Random Forest predictions
rf_probs = rf_ensemble.predict_proba(X_test)[:,1]

# Average ensemble
final_probs = (dl_probs + rf_probs) / 2
final_preds = (final_probs > 0.5).astype(int)

from sklearn.metrics import accuracy_score
print("Ensemble Accuracy:", accuracy_score(y_test, final_preds))

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Ensemble Accuracy: 0.9241071428571429


In [240]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

results = []

# Retrain classical ML models on the current 40-feature X_train
# This ensures consistency with the X_test derived from the 40-feature X_aug
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

lr_retrained = LogisticRegression(max_iter=2000)
lr_retrained.fit(X_train, y_train)

svm_retrained = SVC(kernel="rbf", probability=True)
svm_retrained.fit(X_train, y_train)

rf_retrained = RandomForestClassifier(n_estimators=300, random_state=42)
rf_retrained.fit(X_train, y_train)

ml_models = {
    "Logistic Regression": lr_retrained,
    "SVM": svm_retrained,
    "Random Forest": rf_retrained
}

for name, m in ml_models.items():
    y_pred = m.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

# ===== DEEP LEARNING MODEL =====
y_prob_dl = model.predict(X_test).flatten()
y_pred_dl = (y_prob_dl > 0.5).astype(int)

results.append({
    "Model": "Deep Learning",
    "Accuracy": accuracy_score(y_test, y_pred_dl),
    "Precision": precision_score(y_test, y_pred_dl),
    "Recall": recall_score(y_test, y_pred_dl),
    "F1 Score": f1_score(y_test, y_pred_dl)
})

# ===== ENSEMBLE (AVERAGE VOTING) ====
y_prob_rf = rf_retrained.predict_proba(X_test)[:,1]

ensemble_prob = (y_prob_rf + y_prob_dl) / 2
y_pred_ens = (ensemble_prob > 0.5).astype(int)

results.append({
    "Model": "Ensemble Voting",
    "Accuracy": accuracy_score(y_test, y_pred_ens),
    "Precision": precision_score(y_test, y_pred_ens),
    "Recall": recall_score(y_test, y_pred_ens),
    "F1 Score": f1_score(y_test, y_pred_ens)
})

# ===== SHOW TABLE =====
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Accuracy", ascending=False)

print("\n🏆 MODEL COMPARISON")
print(results_df.round(4))

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 

🏆 MODEL COMPARISON
                 Model  Accuracy  Precision  Recall  F1 Score
4      Ensemble Voting    0.9241     0.9060  0.9464    0.9258
3        Deep Learning    0.9152     0.9009  0.9330    0.9167
2        Random Forest    0.9085     0.8797  0.9464    0.9118
1                  SVM    0.8125     0.8241  0.7946    0.8091
0  Logistic Regression    0.5915     0.5887  0.6071    0.5978
