In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

# Load the Alzheimer's disease dataset
alzheimers_data = pd.read_csv('alzheimers_disease_data.csv')

# Display the first few rows of the dataset
print(alzheimers_data.head())

   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  MemoryComplaints  \
0           13.297218          6.327112     1.347214  ...                 0   
1            4.542524          7.619885     0.518767  ...                 0   
2           19.555085          7.844988     1.826335  ...                 0   
3           12.209266          8.428001     7.435604  ...                 0   
4           18.454356          6.310461     0.795498  ...                 0   

   BehavioralProblems       ADL  Confusion  Disorientation  \
0     

In [3]:
# Check for missing values
print(alzheimers_data.isnull().sum())


PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

In [4]:
# Drop rows with missing values (or you can impute them)
alzheimers_data = alzheimers_data.dropna()

In [5]:
# Encode categorical variables if necessary
alzheimers_data = pd.get_dummies(alzheimers_data, drop_first=True)

In [6]:
print(alzheimers_data.columns)

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis'],
      dtype='object')


In [7]:
# Display the dataset after preprocessing
print(alzheimers_data.head())

   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  \
0           13.297218          6.327112     1.347214  ...   
1            4.542524          7.619885     0.518767  ...   
2           19.555085          7.844988     1.826335  ...   
3           12.209266          8.428001     7.435604  ...   
4           18.454356          6.310461     0.795498  ...   

   FunctionalAssessment  MemoryComplaints  BehavioralProblems       ADL  \
0              6.518877                 0                   0  1.725883   
1              7.118696   

In [15]:
# Separate features and target variable
X = alzheimers_data.drop('Diagnosis', axis=1)  # 'Diagnosis' as the target column name
y = alzheimers_data['Diagnosis']  # 'Diagnosis' as the target column name
# Save the feature names for use in the prediction script
with open('alzheimers_features.pkl', 'wb') as f:  # Save to a file
    pickle.dump(X.columns, f)
# Display the features and target variable
print("Features (X):")
print(X.head())
print("Target (y):")
print(y.head())


Features (X):
   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...       MMSE  \
0           13.297218          6.327112     1.347214  ...  21.463532   
1            4.542524          7.619885     0.518767  ...  20.613267   
2           19.555085          7.844988     1.826335  ...   7.356249   
3           12.209266          8.428001     7.435604  ...  13.991127   
4           18.454356          6.310461     0.795498  ...  13.517609   

   FunctionalAssessment  MemoryComplaints  BehavioralProblems       ADL  \
0              6.5188

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (1719, 33) (1719,)
Testing set shape: (430, 33) (430,)


In [17]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the scaled features
print("Scaled training features:")
print(X_train_scaled[:5])
print("Scaled testing features:")
print(X_test_scaled[:5])


Scaled training features:
[[ 0.58268042  1.32852308  0.99941844  1.30905803 -0.31336327  0.01065932
   1.57696399  1.13666856 -1.64897284  0.32795912  0.45404862 -0.57667854
  -0.39891965 -0.42881732 -0.51197123 -0.33236293 -0.42405371  1.18738838
   0.82003274 -1.69123942 -0.98513025 -0.97510938 -0.71046627  1.23439164
  -0.68665172 -0.509255   -0.43071712  0.56772702 -0.50199892 -0.4345074
  -0.41734917 -0.43922839 -0.64948921]
 [-0.71108324 -0.54736743 -1.0005819  -0.70190779  1.89883988  1.30582655
  -0.63412989 -1.49210597  1.5215992  -1.0872222  -1.03546622  1.73406834
  -0.39891965 -0.42881732 -0.51197123 -0.33236293 -0.42405371  0.37588643
   0.2006599  -1.21969766  1.3171018   0.03844378  0.53494262 -0.74632357
   0.18934908 -0.509255    2.32170941 -0.36860359 -0.50199892 -0.4345074
  -0.41734917 -0.43922839  1.53967147]
 [-1.60044506  0.77679057  0.99941844  2.31454094  0.7927383  -1.63435929
  -0.63412989  1.09959265 -1.07068297  0.56377158 -0.50742496 -0.57667854
   2.50677

In [18]:
# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_scaled, y_train)


In [19]:
# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.93
Precision: 0.93
Recall: 0.93
F1 Score: 0.93


In [20]:
# Save the model to a .sav file
with open('alzheimers_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)          


In [22]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler to a .pkl file
with open('alzheimers_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)