# Dementia Risk Prediction

## Imports & Setup

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# ML utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay

import joblib


## Load Dataset

In [None]:
# Load the raw hackathon extract; the path is relative to the notebook's folder.
df = pd.read_csv("../data/raw/nacc_hackathon.csv")

# Only the final expression of a cell is rendered automatically, so the
# preview and the shape are printed explicitly instead of being left as bare
# expressions whose values would be thrown away.
print(df.head())
print(df.shape)

# info() writes its own column / dtype / non-null summary to stdout.
df.info()


## Target Variable

In [None]:
# Name of the label column; reused by every later cell that needs the target.
TARGET = "Dementia"

# Row count per distinct label value, to check the class balance.
df.loc[:, TARGET].value_counts()


## Select a Valid Feature Set

In [None]:
# Columns that are deliberately kept out of the predictor set. The
# feature-selection cell filters df.columns against this list.
medical_features = [
    "MMSE", "CDR", "ADAS13",
    "Diagnosis", "ClinicalScore", "BrainVolume",
]

In [None]:
# Everything that is neither a medical column nor the label becomes a predictor.
# The exclusion set is built once up front; the previous form concatenated a
# fresh list for every column it tested.
# NOTE(review): this keeps *every* remaining column, including any identifier
# or free-text columns the CSV may contain - confirm that is intended.
_excluded_columns = {*medical_features, TARGET}
non_medical_features = [
    col for col in df.columns
    if col not in _excluded_columns
]


In [None]:
# Number of predictor columns that remain after the exclusions above.
len(non_medical_features)


## Exploratory Data Analysis (EDA)

### Target distribution

In [None]:
# Bar chart of how many rows fall into each label value.
ax = sns.countplot(data=df, x=TARGET)
ax.set_title("Dementia vs Non-Dementia Distribution")
plt.show()


### Missing values

In [None]:
# Fraction of missing entries per predictor, worst first; show the top ten.
missing_share = df[non_medical_features].isna().mean()
missing_share.sort_values(ascending=False).head(10)


### Numerical distributions

In [None]:
# One histogram per numeric predictor, drawn on a shared 15x10 figure.
numeric_view = df[non_medical_features].select_dtypes(include="number")
numeric_view.hist(bins=20, figsize=(15, 10))
plt.show()


### Categorical analysis

In [None]:
# Distinct-value count for each object-typed predictor.
object_view = df[non_medical_features].select_dtypes(include="object")
object_view.nunique()


## Feature Engineering & Preprocessing

In [None]:
# Feature matrix (predictor columns only) and label vector.
X, y = df[non_medical_features], df[TARGET]


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


### Preprocessing pipelines

In [None]:
# Route columns to a preprocessing branch by dtype.
# NOTE(review): only int64/float64 and object columns are listed; a column of
# any other dtype (e.g. bool) ends up in neither list - confirm none exist.
numeric_dtypes = ["int64", "float64"]
num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(include=["object"]).columns


In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)


In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" lets categories unseen during fit pass
# through transform without raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)


In [None]:
# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)


## Build Binary Classification Models

### Logistic Regression (baseline)

In [None]:
# Baseline: shared preprocessing followed by logistic regression.
# max_iter is raised to 1000 to give the solver more iterations to converge.
baseline_classifier = LogisticRegression(max_iter=1000)
log_model = Pipeline(
    steps=[("preprocess", preprocessor), ("model", baseline_classifier)]
)

log_model.fit(X_train, y_train)


In [None]:
# Score the held-out rows with the baseline. Column 1 of predict_proba is
# presumably the positive (dementia) class - confirm against the label encoding.
log_probs = log_model.predict_proba(X_test)[:, 1]
log_auc = roc_auc_score(y_test, log_probs)
print("ROC-AUC:", log_auc)


### Random Forest

In [None]:
# Same preprocessing as the baseline, followed by a 200-tree random forest
# with depth capped at 10 and a fixed seed for repeatable fits.
forest = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model = Pipeline(steps=[("preprocess", preprocessor), ("model", forest)])

rf_model.fit(X_train, y_train)


## Evaluate and Improve the Model

In [None]:
# Held-out evaluation of the random forest: per-class precision/recall/F1
# from the hard predictions, ROC-AUC from the column-1 probabilities.
rf_probs = rf_model.predict_proba(X_test)[:, 1]
rf_labels = rf_model.predict(X_test)

print(classification_report(y_test, rf_labels))
print("ROC-AUC:", roc_auc_score(y_test, rf_probs))


In [None]:
# ROC curve of the random-forest pipeline on the held-out rows.
RocCurveDisplay.from_estimator(estimator=rf_model, X=X_test, y=y_test)
plt.show()


## Hyperparameter Tuning

In [None]:
# Search space for the forest. The "model__" prefix addresses the pipeline
# step named "model". 2 x 3 = 6 candidate settings.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[5, 10, None],
)


In [None]:
# Exhaustive search over param_grid, scored by ROC-AUC with 5-fold
# cross-validation on the training split, using all available cores.
grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Pipeline with the best-scoring parameter combination.
best_model = grid.best_estimator_


## Explain What the Model Has Learned

In [None]:
# Pull the transformed feature names from the preprocessing step and the
# matching importance scores from the forest step of the tuned pipeline.
tuned_steps = best_model.named_steps
feature_names = tuned_steps["preprocess"].get_feature_names_out()
importances = tuned_steps["model"].feature_importances_


In [None]:
# Pair every transformed feature with its importance and rank them, highest
# first; the final expression shows the ten strongest.
importance_table = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
)
importance_df = importance_table.sort_values(by="importance", ascending=False)

importance_df.head(10)


## Save the Trained Dementia Risk Model

In [None]:
# Persist the tuned pipeline (preprocessing + forest) as a single artifact.
# joblib.dump does not create missing folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails with
# FileNotFoundError.
from pathlib import Path

model_path = Path("../data/processed/dementia_risk_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)


## Predicting a new patient

In [None]:
# Read the pipeline back from the same path the save step wrote to.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code - only load artifacts from a trusted source.
model = joblib.load("../data/processed/dementia_risk_model.pkl")


In [None]:
# Example record to score; keys are column names, values are the raw
# (unencoded) inputs.
new_patient = dict(
    Age=72,
    Sex="Female",
    EducationYears=10,
    MaritalStatus="Married",
    EmploymentStatus="Retired",
    SmokingStatus="Never",
    AlcoholUse="Occasional",
    LivesAlone="No",
    PhysicalActivity="Low",
)


In [None]:
# Build a one-row frame whose columns match what the pipeline was fitted on.
# The pipeline was trained on every non-medical column of the CSV, and its
# ColumnTransformer raises if any of those columns is absent at predict time.
# Reindexing to the fitted column list adds the absent ones as NaN, which the
# imputers in the pipeline then fill.
# NOTE(review): keys in new_patient that were not training columns are dropped
# silently by the reindex - double-check the spelling of the keys.
expected_columns = list(model.feature_names_in_)
new_patient_df = pd.DataFrame([new_patient]).reindex(columns=expected_columns)


In [None]:
# Probability in column 1 for the single row, plus the same value as a
# percentage rounded to two decimals for display.
class_probabilities = model.predict_proba(new_patient_df)
risk_probability = class_probabilities[0][1]
risk_percentage = round(risk_probability * 100, 2)

risk_percentage


In [None]:
def interpret_risk(prob, *, low_threshold=0.3, high_threshold=0.6):
    """Map a predicted probability to a coarse risk label.

    Args:
        prob: Predicted probability of the positive class.
        low_threshold: Probabilities strictly below this are "Low risk".
        high_threshold: Probabilities at or above this are "High risk";
            anything in between is "Moderate risk".

    Returns:
        One of "Low risk", "Moderate risk" or "High risk".

    Raises:
        ValueError: If ``prob`` is NaN, or if ``low_threshold`` is greater
            than ``high_threshold``.
    """
    if low_threshold > high_threshold:
        raise ValueError(
            "low_threshold must not exceed high_threshold "
            f"(got {low_threshold} > {high_threshold})"
        )
    # NaN fails every "<" comparison, so without this guard it would fall
    # through to "High risk". NaN is the only value that is unequal to itself.
    if prob != prob:
        raise ValueError("prob must be a number, got NaN")

    if prob < low_threshold:
        return "Low risk"
    if prob < high_threshold:
        return "Moderate risk"
    return "High risk"


In [None]:
# Turn the probability into a label and print both as a two-line summary.
risk_level = interpret_risk(risk_probability)

report_lines = (
    f"Dementia Risk Probability: {risk_percentage}%",
    f"Risk Category: {risk_level}",
)
for report_line in report_lines:
    print(report_line)
