In [1]:
## 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib

## 2. Load Data
file_path = 'dataset/alzheimers_disease_data.csv'
df = pd.read_csv(file_path)
print("Data Shape:", df.shape)
df.head()

## 3. Data Cleaning
if 'PatientID' in df.columns:
    df = df.drop('PatientID', axis=1)
df = df.dropna()
print("After cleaning:", df.shape)

## 4. Preprocessing (Encoding + Scaling)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## 6. Define Models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

## 7. Train and Evaluate
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append([name, acc, prec, rec, f1])
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))

results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
results_df.sort_values(by='Accuracy', ascending=False)

## 8. Save Best Model (Random Forest)
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, y_train)
joblib.dump(best_model, 'best_random_forest_model.pkl')
print("\n✅ Best model (Random Forest) saved as best_random_forest_model.pkl")

Data Shape: (2149, 35)
After cleaning: (2149, 34)

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       277
           1       0.79      0.71      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.80      0.81       430
weighted avg       0.83      0.83      0.83       430


=== Decision Tree ===
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       277
           1       0.87      0.82      0.85       153

    accuracy                           0.89       430
   macro avg       0.89      0.88      0.88       430
weighted avg       0.89      0.89      0.89       430


=== Random Forest ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       277
           1       0.96      0.82      0.88       153

    accuracy                           0.92       430
   

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib

## 2. Load Data
file_path = 'dataset/alzheimers_disease_data.csv'
df = pd.read_csv(file_path)
print("Data Shape:", df.shape)
df.head()

## 3. Data Cleaning
if 'PatientID' in df.columns:
    df = df.drop('PatientID', axis=1)
df = df.dropna()
print("After cleaning:", df.shape)

## 4. Preprocessing (Encoding + Scaling)
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## 5. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## 6. Define Models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

## 7. Train and Evaluate
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append([name, acc, prec, rec, f1])
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))

results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print("\nModel Comparison:")
print(results_df.sort_values(by='Accuracy', ascending=False))

## 8. Save Best Model (Random Forest)
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, y_train)
joblib.dump(best_model, 'best_random_forest_model.pkl')
print("\n✅ Best model (Random Forest) saved as best_random_forest_model.pkl")

## 9. Test Saved Model on New Data (Auto-handles missing encoders)
print("\n🔍 Testing the saved model...")

# Load saved model
loaded_model = joblib.load('best_random_forest_model.pkl')

# Get training columns (for alignment)
training_columns = X.columns

# Create sample data (adjust values according to your dataset’s actual columns)
sample_data = {col: 0 for col in training_columns}  # start with zeros for all
# Fill only existing features safely
for col in training_columns:
    if 'Age' in col:
        sample_data[col] = 73
    elif 'BMI' in col:
        sample_data[col] = 22.9277
    elif 'Blood' in col:
        sample_data[col] = 130
    elif 'Cholesterol' in col:
        sample_data[col] = 210
    elif 'Cognitive' in col:
        sample_data[col] = 45
    elif 'Memory' in col:
        sample_data[col] = 1
    elif 'Forget' in col:
        sample_data[col] = 1

# Convert to DataFrame
sample_df = pd.DataFrame([sample_data])

# Scale using same scaler
sample_scaled = scaler.transform(sample_df)

# Predict
prediction = loaded_model.predict(sample_scaled)[0]

# Handle decoding if label encoder exists for Diagnosis
if 'Diagnosis' in label_encoders:
    pred_label = label_encoders['Diagnosis'].inverse_transform([prediction])[0]
else:
    pred_label = prediction

print("\nPredicted Diagnosis:", pred_label)



Data Shape: (2149, 35)
After cleaning: (2149, 34)

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       277
           1       0.79      0.71      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.80      0.81       430
weighted avg       0.83      0.83      0.83       430


=== Decision Tree ===
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       277
           1       0.87      0.82      0.85       153

    accuracy                           0.89       430
   macro avg       0.89      0.88      0.88       430
weighted avg       0.89      0.89      0.89       430


=== Random Forest ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       277
           1       0.96      0.82      0.88       153

    accuracy                           0.92       430
   