# PHASE 4: MODELING

## MODEL#1 RANDOM FOREST

#### This model was chosen due to its stability and because it handles non linear data well

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

data = pd.read_csv("alzheimers_disease_data.csv")
data.drop(['PatientID', 'DoctorInCharge'], axis=1, inplace=True)
# -----------------------------
# 2. Separate features + target
# -----------------------------
X = data.drop("Diagnosis", axis=1)   # all features
y = data["Diagnosis"]                # target

# If Diagnosis is categorical, convert to numeric
if y.dtype == 'object':
    y = y.astype('category').cat.codes

# -----------------------------
# 3. Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# -----------------------------
# 4. Build Random Forest model
# -----------------------------
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42
)

# Train the model
rf.fit(X_train, y_train)

# -----------------------------
# 5. Predictions
# -----------------------------
y_pred = rf.predict(X_test)

# -----------------------------
# 6. Evaluation
# -----------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9441860465116279

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       278
           1       0.94      0.89      0.92       152

    accuracy                           0.94       430
   macro avg       0.94      0.93      0.94       430
weighted avg       0.94      0.94      0.94       430


Confusion Matrix:
[[270   8]
 [ 16 136]]


## MODEL#2 XG BOOST

#### This model was chosen because it is what is commonly used in smaller medical datasets since it handles mixed numerical and categorical data well

In [41]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------------------------------
# 1. Prepare data (using 'data', not df)
# -----------------------------------------------------

# Identify target column
target_col = "Diagnosis"   # change if your target has a different name

# Encode target labels if they are non-numeric
le = LabelEncoder()
y = le.fit_transform(data[target_col])

# Features = everything except the target
X = data.drop(columns=[target_col])

# If categorical features exist, convert them to numeric using one-hot encoding
X = pd.get_dummies(X)

# -----------------------------------------------------
# 2. Train-test split
# -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------------------------------
# 3. XGBoost Model
# -----------------------------------------------------
model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42
)

# Train
model.fit(X_train, y_train)

# -----------------------------------------------------
# 4. Predictions
# -----------------------------------------------------
y_pred = model.predict(X_test)

# -----------------------------------------------------
# 5. Evaluation
# -----------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.9534883720930233

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       277
           1       0.96      0.91      0.93       153

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430


Confusion Matrix:
[[271   6]
 [ 14 139]]


## MODEL#3 LOGISTIC REGRESSION

#### This model was chosen for its simpleness and interpretability and is also used in medical datasets

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------------------------------
# 1. Prepare data (using 'data')
# -----------------------------------------------------

target_col = "Diagnosis"   # change if your target has another name

# Encode target if it is categorical
le = LabelEncoder()
y = le.fit_transform(data[target_col])

# Features = everything except the target
X = data.drop(columns=[target_col])

# Convert categorical predictors to numeric (one-hot encoding)
X = pd.get_dummies(X)

# -----------------------------------------------------
# 2. Train-test split
# -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------------------------------
# 3. Logistic Regression Model
# -----------------------------------------------------
log_reg = LogisticRegression(
    max_iter=500,
    solver='lbfgs'
)

# Train model
log_reg.fit(X_train, y_train)

# -----------------------------------------------------
# 4. Predictions
# -----------------------------------------------------
y_pred = log_reg.predict(X_test)

# -----------------------------------------------------
# 5. Evaluation
# -----------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.8302325581395349

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       277
           1       0.78      0.73      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.81      0.81       430
weighted avg       0.83      0.83      0.83       430


Confusion Matrix:
[[246  31]
 [ 42 111]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## MODEL#4 SVM

#### This model was chosen because its good for small datasets and effective with scaled features

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------------------------------
# 1. Prepare the data
# -----------------------------------------------------

target_col = "Diagnosis"   # Change if your target has another name

# Encode target labels if non-numeric
le = LabelEncoder()
y = le.fit_transform(data[target_col])

# Features (except target)
X = data.drop(columns=[target_col])

# One-hot encode categorical features
X = pd.get_dummies(X)

# -----------------------------------------------------
# 2. Train-test split
# -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------------------------------
# 3. Scaling (VERY important for SVM)
# -----------------------------------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------------------------------
# 4. SVM Model (RBF kernel)
# -----------------------------------------------------
svm_clf = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True
)

# Train the model
svm_clf.fit(X_train, y_train)

# -----------------------------------------------------
# 5. Predictions
# -----------------------------------------------------
y_pred = svm_clf.predict(X_test)

# -----------------------------------------------------
# 6. Evaluation
# -----------------------------------------------------
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)


Accuracy: 0.8325581395348837

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.87       277
           1       0.80      0.71      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.80      0.81       430
weighted avg       0.83      0.83      0.83       430


Confusion Matrix:
[[250  27]
 [ 45 108]]


#### From the above 4 models' comparison, it can be seen that model 2, XG BOOST offers the best predictive capabilities with an accuracy of 95% predicting 271 true negatives 6 false positives which is a false alarm but medically less dangerous than getting a false negative, 14 false negatives and 139 true posistives. 

# PHASE 5: EVALUATION

####  MODEL             ACCURACY     RECALL(0)     RECALL(1)     CONFUSION MATRIX
#### RANDOM FOREST        94.42%        0.97          0.89          [270,  8]
####      >                                                         [16 ,136]
#### XG BOOST             95.35%        0.98          0.91          [271,  6]
####                                                                [14 ,139]
#### LOGISTIC REGRESSION  83.02%        0.89          0.73          [246, 31]
####                                                                [42 ,111]
#### SVM                  83.26%        0.90          0.71          [250, 27]
####                                                                [45 ,108]


# PHASE 6: DEPLOYMENT

## XG BOOST MODEL FOR TRAINING AND TESTING

#### This train-test split used 80% train since that is standard and usually enough for a model to learn patterns and 20% test since that is a sufficient unknown portion which can be utilised to test the model 

In [51]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

# SELECT FEATURES
FEATURE_COLUMNS = [
    "MMSE",
    "FunctionalAssessment",
    "MemoryComplaints",
    "BehavioralProblems",
    "ADL"
]

TARGET_COL = "Diagnosis"

le = LabelEncoder()
y = le.fit_transform(data[TARGET_COL])


X = data[FEATURE_COLUMNS].copy()

for col in X.columns:
    X[col] = X[col].fillna(X[col].median())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train, y_train)

joblib.dump(model, "alz_model.joblib")
joblib.dump(le, "alz_label_encoder.joblib")
joblib.dump(FEATURE_COLUMNS, "alz_features.joblib")

print("Model trained and saved!")


Model trained and saved!


## ALZEIMERS' PREDICTION TEST 

In [75]:
import pandas as pd
import joblib

def predict_alzheimers_interactive():

    model = joblib.load("alz_model.joblib")
    le = joblib.load("alz_label_encoder.joblib")
    features = joblib.load("alz_features.joblib")

    print("\n========== INPUT SCALE ==========")
    print("MMSE (Mini-Mental State Examination): 0–30")
    print("Functional Assessment: 0–5 ")
    print("Memory Complaints: 0 = No, 1 = Yes")
    print("Behavioral Problems: 0 = No, 1 = Yes")
    print("ADL (Activities of Daily Living): 0–6 ")
    print("=================================\n")

    print("Enter the following values:")
    mmse = float(input("MMSE score: "))
    functional_assessment = float(input("Functional Assessment score: "))
    memory_complaints = float(input("Memory Complaints (0 = No, 1 = Yes): "))
    behavioral_problems = float(input("Behavioral Problems (0 = No, 1 = Yes): "))
    adl = float(input("ADL score: "))

    row = pd.DataFrame([[
        mmse,
        functional_assessment,
        memory_complaints,
        behavioral_problems,
        adl
    ]], columns=features)

    pred_num = int(model.predict(row)[0])

    label_map = {
        0: "Not Alzheimer's",
        1: "Alzheimer's"
    }
    pred_label = label_map.get(pred_num, "Unknown")

    prob = model.predict_proba(row)[0].max()

    print("\n-----------------------------------")
    print("Prediction:", pred_label)
    print(f"Confidence: {prob:.2f}")
    print("-----------------------------------")

predict_alzheimers_interactive()



MMSE (Mini-Mental State Examination): 0–30
Functional Assessment: 0–5 
Memory Complaints: 0 = No, 1 = Yes
Behavioral Problems: 0 = No, 1 = Yes
ADL (Activities of Daily Living): 0–6 

Enter the following values:


MMSE score:  23
Functional Assessment score:  4
Memory Complaints (0 = No, 1 = Yes):  1
Behavioral Problems (0 = No, 1 = Yes):  1
ADL score:  3



-----------------------------------
Prediction: Alzheimer's
Confidence: 1.00
-----------------------------------
