<a href="https://colab.research.google.com/github/AnasUtman-tech/AnasUtman-tech/blob/main/alzheimers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytorch-tabular imbalanced-learn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from google.colab import files
uploaded = files.upload()

Saving alzheimers_disease_data.csv to alzheimers_disease_data.csv


In [6]:
data = pd.read_csv('alzheimers_disease_data.csv')
print(data.head())  # View first 5 rows
print(data.info())  # Check data types and missing values
print(data.shape)   # Check number of rows and columns

   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  MemoryComplaints  \
0           13.297218          6.327112     1.347214  ...                 0   
1            4.542524          7.619885     0.518767  ...                 0   
2           19.555085          7.844988     1.826335  ...                 0   
3           12.209266          8.428001     7.435604  ...                 0   
4           18.454356          6.310461     0.795498  ...                 0   

   BehavioralProblems       ADL  Confusion  Disorientation  \
0     

In [7]:
print("Missing Values:\n", data.isnull().sum())
print("Duplicates:", data.duplicated().sum())

Missing Values:
 PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTa

In [8]:
data = data.drop(['PatientID', 'DoctorInCharge'], axis=1)

In [9]:
X = data.drop('Diagnosis', axis=1)
y = data['Diagnosis']

In [10]:
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion', 'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks', 'Forgetfulness']
Categorical Columns: []


In [11]:
from sklearn.impute import KNNImputer

# KNN imputation for numerical features
imputer = KNNImputer(n_neighbors=5)
X[numerical_cols] = imputer.fit_transform(X[numerical_cols])

# Mode imputation for categorical features (if any)
for col in categorical_cols:
    X[col].fillna(X[col].mode()[0], inplace=True)

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Check if there are any categorical columns to encode
if len(categorical_cols) > 0:
    # Remove the 'sparse' argument
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore') # sparse=False is deprecated in newer versions, use handle_unknown='ignore' if you encounter problems
    encoded_cols = pd.DataFrame(encoder.fit_transform(X[categorical_cols]).toarray()) # Convert to dense array
    encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
    X = pd.concat([X.drop(categorical_cols, axis=1), encoded_cols], axis=1)
else:
    print("No categorical columns to encode.")

No categorical columns to encode.


In [15]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler for outlier handling
robust_scaler = RobustScaler()
X[numerical_cols] = robust_scaler.fit_transform(X[numerical_cols])

# StandardScaler for SVM and DNN
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [16]:
X['HealthRiskIndex'] = X['BMI'] + X['CholesterolTotal'] + X['SystolicBP']
X['Age_MMSE'] = X['Age'] * X['MMSE']

In [17]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

selector = SelectKBest(score_func=mutual_info_classif, k=15)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()
print("Selected Features:", selected_features)
X = pd.DataFrame(X_selected, columns=selected_features)

Selected Features: ['EducationLevel', 'AlcoholConsumption', 'SleepQuality', 'CardiovascularDisease', 'CholesterolHDL', 'MMSE', 'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion', 'Disorientation', 'DifficultyCompletingTasks', 'HealthRiskIndex', 'Age_MMSE']


In [18]:
from imblearn.over_sampling import SMOTE

print("Class Distribution:\n", y.value_counts())
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)
print("Balanced Class Distribution:\n", pd.Series(y_balanced).value_counts())

Class Distribution:
 Diagnosis
0    1389
1     760
Name: count, dtype: int64
Balanced Class Distribution:
 Diagnosis
0    1389
1    1389
Name: count, dtype: int64


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42)
print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)

Training Shape: (2222, 15)
Testing Shape: (556, 15)


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

y_pred_rf = grid_search_rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9550359712230215
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       278
           1       0.97      0.94      0.95       278

    accuracy                           0.96       556
   macro avg       0.96      0.96      0.96       556
weighted avg       0.96      0.96      0.96       556



In [23]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', random_state=42, probability=True)
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}
grid_search_svm = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

y_pred_svm = grid_search_svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.8884892086330936
              precision    recall  f1-score   support

           0       0.92      0.86      0.88       278
           1       0.86      0.92      0.89       278

    accuracy                           0.89       556
   macro avg       0.89      0.89      0.89       556
weighted avg       0.89      0.89      0.89       556

