In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

from xgboost import XGBClassifier


In [2]:
# Load the raw dataset; the file is read as ISO-8859-1 (Latin-1) rather than
# the pandas default of UTF-8.
data = pd.read_csv(
    filepath_or_buffer="alzheimers_disease_data.csv",
    encoding="ISO-8859-1",
)

# Preview the first five rows.
data.head(n=5)


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [3]:
# Columns removed before modelling (an identifier and the doctor field).
DROP_COLS = ["PatientID", "DoctorInCharge"]

# errors="ignore" skips any listed column that is already absent, so this cell
# can be re-run safely after the columns have been dropped once.
data = data.drop(columns=DROP_COLS, errors="ignore")


In [4]:
# Name of the label column to predict.
TARGET = "Diagnosis"

# Split the frame into the label vector and the feature matrix.
y = data[TARGET]
X = data.drop(columns=[TARGET])


In [5]:
# Fit the encoder on the raw labels, then replace y with its integer-encoded
# form. The fitted encoder is kept so predictions can be mapped back later.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)


In [6]:
# Feature typing
# --------------
# The flags below are stored as 0/1 integers in the CSV, so dtype-based
# discovery (X.select_dtypes) would classify them as numeric. They are therefore
# listed explicitly so that they can be one-hot encoded.
categorical_features = [
    "Gender",
    "Smoking",
    "FamilyHistoryAlzheimers",
    "CardiovascularDisease",
    "Diabetes",
    "Depression",
    "HeadInjury",
    "Hypertension",
    "MemoryComplaints",
    "BehavioralProblems"
]

# Cast the flags to strings FIRST so the encoder is fitted on string categories
# (e.g. "0" / "1"). A plain astype(str) would also turn missing values into the
# literal category "nan", which hides them from any downstream imputer, so the
# missing cells are masked back to NaN after the cast.
_categorical_missing = X[categorical_features].isna()
X[categorical_features] = (
    X[categorical_features].astype(str).mask(_categorical_missing)
)

# THEN derive the numerical features: every column not declared categorical.
numerical_features = [c for c in X.columns if c not in categorical_features]


In [7]:
# Numeric branch: fill gaps with the column median, then standardise.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(num_steps)


In [8]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" encodes categories unseen during fit as all
# zeros instead of raising.
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(cat_steps)


In [9]:
# Route each column group through its own preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_features),
        ("cat", cat_pipeline, categorical_features),
    ]
)


In [10]:
# Gradient-boosted tree classifier; hyperparameters are collected in one place
# so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "eval_metric": "logloss",
    "random_state": 42,  # fixed seed for reproducible training
}
model = XGBClassifier(**xgb_params)


In [11]:
# End-to-end estimator: preprocessing followed by the classifier, so the same
# transformations are applied at fit time and at prediction time.
ml_pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])


In [12]:
# Hold out 20% of the rows for evaluation. The split is stratified on y so both
# parts keep the same class proportions, and seeded so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [13]:
# Fit the preprocessing steps and the classifier on the training split only.
ml_pipeline.fit(X=X_train, y=y_train)



In [14]:
# Score the held-out split.
y_pred = ml_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(test_report)


Accuracy: 0.9441860465116279

Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       278
           1       0.94      0.90      0.92       152

    accuracy                           0.94       430
   macro avg       0.94      0.93      0.94       430
weighted avg       0.94      0.94      0.94       430



In [15]:
# Persist the fitted pipeline and the label encoder next to the notebook.
artifacts_to_save = (
    (ml_pipeline, "alzheimers_pipeline.joblib"),
    (label_encoder, "label_encoder.joblib"),
)
for artifact, filename in artifacts_to_save:
    joblib.dump(artifact, filename)

print("Pipeline saved successfully")


Pipeline saved successfully


In [17]:
# Show the input column names the fitted pipeline recorded during fit; any
# frame passed to predict() should provide these columns.
ml_pipeline.feature_names_in_


array(['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension',
       'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL',
       'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE',
       'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems',
       'ADL', 'Confusion', 'Disorientation', 'PersonalityChanges',
       'DifficultyCompletingTasks', 'Forgetfulness'], dtype=object)

In [19]:
# Example patient for a smoke-test prediction.
#
# The categorical flags MUST use the same representation the encoder was fitted
# on: the CSV's integer codes cast to strings ("0" / "1"). Human-readable labels
# such as "Yes" / "No" / "Male" are unknown categories, which
# OneHotEncoder(handle_unknown="ignore") silently encodes as all zeros, so the
# model would effectively ignore those inputs.
NO, YES = "0", "1"
# NOTE(review): assumes the dataset codes Gender as 0 = Male, 1 = Female.
# Confirm against the dataset's data dictionary.
MALE = "0"

# Keys are listed in the same order as the training columns for readability.
new_patient_dataframe = pd.DataFrame([{
    "Age": 72,
    "Gender": MALE,
    "Ethnicity": 1,
    "EducationLevel": 3,
    "BMI": 25.4,
    "Smoking": NO,
    "AlcoholConsumption": 1.2,
    "PhysicalActivity": 2.5,
    "DietQuality": 6.8,
    "SleepQuality": 5.9,
    "FamilyHistoryAlzheimers": YES,
    "CardiovascularDisease": NO,
    "Diabetes": NO,
    "Depression": NO,
    "HeadInjury": NO,
    "Hypertension": YES,
    "SystolicBP": 140,
    "DiastolicBP": 90,
    "CholesterolTotal": 220,
    "CholesterolLDL": 140,
    "CholesterolHDL": 45,
    "CholesterolTriglycerides": 180,
    "MMSE": 23,
    "FunctionalAssessment": 6.2,
    "MemoryComplaints": YES,
    "BehavioralProblems": NO,
    "ADL": 7.5,
    "Confusion": 1,
    "Disorientation": 0,
    "PersonalityChanges": 1,
    "DifficultyCompletingTasks": 1,
    "Forgetfulness": 0
}])


In [23]:
# Run the in-memory pipeline on the single example patient and display the
# encoded class it predicts.
prediction = ml_pipeline.predict(X=new_patient_dataframe)
prediction


array([1])

In [25]:
# Round-trip check: reload the pipeline saved earlier and predict on the same
# example patient.
# NOTE: joblib.load is pickle-based and can execute arbitrary code, so only load
# artifacts produced by this notebook or another trusted source.
pipeline = joblib.load("alzheimers_pipeline.joblib")

prediction = pipeline.predict(new_patient_dataframe)

# The reloaded artifact must reproduce the in-memory model's output; otherwise
# the saved file does not match what was trained above.
assert np.array_equal(prediction, ml_pipeline.predict(new_patient_dataframe)), (
    "Reloaded pipeline disagrees with the in-memory pipeline"
)

# Display the result so the check is visible in the notebook.
prediction
