In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Read the Alzheimer's dataset from its CSV file.
file_path = 'C:/Users/SANEABHUTTO/Desktop/archive/archive (1)/alzheimers_disease_data.csv'
alzheimers_data = pd.read_csv(file_path)

# Separate predictors from the target: 'Diagnosis' is the label to predict,
# and 'PatientID' is dropped from the predictors along with it.
X = alzheimers_data.drop(['Diagnosis', 'PatientID'], axis=1)
y = alzheimers_data.loc[:, 'Diagnosis']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a throwaway RandomForest on the training rows only; it is used purely
# to rank the features by importance.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# One row per feature, ordered from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

# Keep the 20 highest-ranked feature names (change the slice to keep more or fewer).
top_features = feature_importances.index[:20].tolist()

# Restrict both splits to the selected columns.
X_train_top = X_train.loc[:, top_features]
X_test_top = X_test.loc[:, top_features]

# Candidate classifiers to compare, keyed by display name. Every model uses
# the same seed so repeated runs are comparable.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    ),
}

# Function to calculate and print classification report for each model
def evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model, print its test-set diagnostics, and collect the reports.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    dict
        Maps each model name to the dictionary produced by
        ``classification_report(..., output_dict=True)``.
    """
    results = {}
    for name in models:
        estimator = models[name]
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        matrix = confusion_matrix(y_test, predictions)
        print(f'Confusion Matrix for {name}:\n{matrix}')

        # The dict form is stored for later comparison; the text form is
        # generated separately just for display.
        results[name] = classification_report(y_test, predictions, output_dict=True)
        text_report = classification_report(y_test, predictions)
        print(f'Classification Report for {name}:\n{text_report}\n')
    return results

# Fit and score every candidate model on the selected top features.
results = evaluate_models(
    models=models,
    X_train=X_train_top,
    y_train=y_train,
    X_test=X_test_top,
    y_test=y_test,
)

# Function to compare and find the best model based on a specific metric
def find_best_model(results, metric='f1-score', average='weighted'):
    """Return the name and score of the best model for one averaged metric.

    Parameters
    ----------
    results : dict
        Maps a model name to a classification-report dictionary, i.e. a
        mapping that contains an ``'<average> avg'`` entry (for example
        ``'weighted avg'`` or ``'macro avg'``) holding per-metric scores.
    metric : str, default 'f1-score'
        Metric to compare, looked up inside the chosen average row.
    average : str, default 'weighted'
        Which average row to read; the key used is ``f'{average} avg'``.

    Returns
    -------
    tuple
        ``(best_model_name, best_score)``. When ``results`` is empty this is
        ``(None, 0)``. On ties the first model encountered wins.

    Raises
    ------
    KeyError
        If a report has no ``'<average> avg'`` row or that row lacks ``metric``.
    """
    # Honour the requested averaging scheme instead of always reading
    # the weighted row.
    avg_key = f'{average} avg'

    best_model = None
    best_score = 0
    for model, scores in results.items():
        try:
            score = scores[avg_key][metric]
        except KeyError as exc:
            raise KeyError(
                f"report for {model!r} has no {metric!r} under {avg_key!r}"
            ) from exc
        # Accept the first model unconditionally so that a score of exactly 0
        # still yields a model name rather than None.
        if best_model is None or score > best_score:
            best_score = score
            best_model = model
    return best_model, best_score

# Rank the evaluated models by the weighted-average F1-score and report the winner.
best_model, best_score = find_best_model(
    results,
    metric='f1-score',
    average='weighted',
)
print(f'The best model is {best_model} with a weighted F1-score of {best_score:.2f}')


Confusion Matrix for RandomForest:
[[272   5]
 [ 24 129]]
Classification Report for RandomForest:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       277
           1       0.96      0.84      0.90       153

    accuracy                           0.93       430
   macro avg       0.94      0.91      0.92       430
weighted avg       0.93      0.93      0.93       430


Confusion Matrix for GradientBoosting:
[[271   6]
 [ 12 141]]
Classification Report for GradientBoosting:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       277
           1       0.96      0.92      0.94       153

    accuracy                           0.96       430
   macro avg       0.96      0.95      0.95       430
weighted avg       0.96      0.96      0.96       430






Confusion Matrix for AdaBoost:
[[261  16]
 [ 22 131]]
Classification Report for AdaBoost:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       277
           1       0.89      0.86      0.87       153

    accuracy                           0.91       430
   macro avg       0.91      0.90      0.90       430
weighted avg       0.91      0.91      0.91       430


Confusion Matrix for XGBoost:
[[271   6]
 [ 17 136]]
Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       277
           1       0.96      0.89      0.92       153

    accuracy                           0.95       430
   macro avg       0.95      0.93      0.94       430
weighted avg       0.95      0.95      0.95       430


The best model is GradientBoosting with a weighted F1-score of 0.96
