# Exploring the Alzheimer's prediction dataset from [Kaggle](https://www.kaggle.com/datasets/ankushpanday1/alzheimers-prediction-dataset-global)

### Loading the Dataset

In [None]:
import pandas as pd

# Read the raw dataset; the relative path is resolved against the current working directory.
alzheimers = pd.read_csv(filepath_or_buffer="data/alzheimers.csv")

### Summary Statistics

In [4]:
# Summary tables for the numeric and the categorical (object-dtype) columns.
display(alzheimers.describe().style.set_caption("Numerical Columns"))
display(alzheimers.select_dtypes(include="object").describe().style.set_caption("Categorical Columns"))
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() renders a stray "None" below the report.
alzheimers.info()

Unnamed: 0,Age,Education Level,BMI,Cognitive Test Score
count,74283.0,74283.0,74283.0,74283.0
mean,71.964703,9.487514,26.780639,64.654241
std,12.980748,5.75702,4.764679,20.153247
min,50.0,0.0,18.5,30.0
25%,61.0,4.0,22.7,47.0
50%,72.0,9.0,26.8,65.0
75%,83.0,14.0,30.9,82.0
max,94.0,19.0,35.0,99.0


Unnamed: 0,Country,Gender,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer’s Diagnosis
count,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283,74283
unique,20,2,3,3,3,2,2,2,2,3,3,3,3,3,3,2,3,3,3,2,2
top,Brazil,Female,High,Current,Never,No,No,Normal,No,Medium,Good,Average,High,Unemployed,Single,No,Medium,Low,Medium,Rural,No
freq,3839,37249,24853,24915,24865,59527,52134,51973,52004,24843,25145,24917,24906,24801,25169,59561,24859,24873,24886,37203,43570


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74283 entries, 0 to 74282
Data columns (total 25 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               74283 non-null  object 
 1   Age                                   74283 non-null  int64  
 2   Gender                                74283 non-null  object 
 3   Education Level                       74283 non-null  int64  
 4   BMI                                   74283 non-null  float64
 5   Physical Activity Level               74283 non-null  object 
 6   Smoking Status                        74283 non-null  object 
 7   Alcohol Consumption                   74283 non-null  object 
 8   Diabetes                              74283 non-null  object 
 9   Hypertension                          74283 non-null  object 
 10  Cholesterol Level                     74283 non-null  object 
 11  Family History 

None

This dataset appears to be very clean.
- No missing values: every column's non-null count equals the total number of rows (74,283).
- Data types appear as expected.
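- Frequency counts show that the multi-level categorical variables are spread roughly evenly across their levels, while several binary ones (e.g. Diabetes, Hypertension, the APOE-ε4 genetic risk factor) are skewed toward "No".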

### Univariate Analysis (TODO)

# Preprocessing and Feature Engineering

In [None]:
# Standardization of numerical features
from sklearn.preprocessing import StandardScaler

def standardize_numerical_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with every numeric column standardized.

    Each numeric column is rescaled with ``StandardScaler`` (zero mean, unit
    variance); non-numeric columns are carried over unchanged. The input frame
    is not modified, so the raw values stay available to other cells.

    Args:
        data: Frame holding any mix of numeric and non-numeric columns.

    Returns:
        A new DataFrame with the same columns and index as ``data``.
    """
    standardized = data.copy()
    numerical_columns = standardized.select_dtypes(include="number").columns
    # StandardScaler rejects input with zero features, so skip scaling when
    # the frame has no numeric columns at all.
    if len(numerical_columns) > 0:
        standardized[numerical_columns] = StandardScaler().fit_transform(
            standardized[numerical_columns]
        )
    return standardized

alz_standardized = standardize_numerical_columns(alzheimers)
# Show the standardized copy; `alzheimers` itself still holds the raw values.
display(alz_standardized.describe().style.set_caption("Numerical Columns"))

Unnamed: 0,Age,Education Level,BMI,Cognitive Test Score
count,74283.0,74283.0,74283.0,74283.0
mean,0.0,0.0,-0.0,-0.0
std,1.000007,1.000007,1.000007,1.000007
min,-1.69211,-1.648002,-1.737933,-1.719548
25%,-0.844695,-0.953193,-0.856441,-0.876006
50%,0.002719,-0.084682,0.004063,0.017157
75%,0.850134,0.783828,0.864568,0.860699
max,1.697548,1.652339,1.725072,1.704241


In [None]:
# TODO: Preprocess the categorical columns:
# Potentially: Ordinal encode the "ordered" attributes EX: Depression Level -> {'Low': 0, 'Medium': 1, 'High': 2}
# Label encode the nominal binary stuff EX: Gender -> {'Male': 0, 'Female': 1}
# One hot encode / frequency encode / target encode for other nominal attributes
# Balance our target variable, or compensate with stratified k-fold (not implemented yet; only a plain train/test split is used so far)

# Modeling

### CV strategy functions

Currently only using basic train test split validation. TODO: implement more comprehensive k-fold strategies, such as stratified k-fold.

In [9]:
from sklearn.model_selection import train_test_split
import numpy as np

def basic_tt_split_validation(X, y, model, train_size = 0.80, random_state = None):
    """Fit ``model`` on a single random train/test split and return its test accuracy.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and ``model.fit``.
        y: Target labels aligned row-for-row with ``X``.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_size: Forwarded to ``train_test_split``; share of rows used for fitting.
        random_state: Seed forwarded to ``train_test_split``. The default ``None``
            keeps the earlier behaviour (a different split on every call); pass
            an int to make the split, and therefore the score, reproducible.

    Returns:
        Fraction of held-out rows whose predicted label equals the true label.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Compare as plain arrays so matching is positional regardless of any
    # pandas index carried by y_test; the mean of the boolean mask is accuracy.
    return np.mean(np.asarray(predictions) == np.asarray(y_test))

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

# Wall-clock timer; it covers feature selection, fitting and scoring below.
start_time = time.perf_counter()

rf_classifier = RandomForestClassifier()

# Only the numeric features are used for now.
numerical_data = alz_standardized.loc[
    :, alz_standardized.select_dtypes(include='number').columns
]
target = alz_standardized['Alzheimer’s Diagnosis']
accuracy = basic_tt_split_validation(
    X=numerical_data,
    y=target,
    model=rf_classifier,
)

end_time = time.perf_counter()

print("Accuracy of RF model:", accuracy)
print("Time taken:", end_time - start_time, "(s)")

Accuracy of RF model: 0.6821700208655852
Time taken: 12.682744299992919 (s)
