In [1]:
import json
import os

import joblib # For saving models
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import yaml # For config file loading
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the project configuration from YAML.
# The file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to the project root or configurable.
with open(r"C:\Users\bhuva\Desktop\Alziemer\config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Required settings: a missing key raises KeyError right here.
PROCESSED_DATA_DIR = config["processed_data_dir"]
MODEL_DIR = config["model_dir"]
RANDOM_STATE = config["random_state"]
METRICS_FILE = config["metrics_file"]
# Optional setting: hyperparameter tuning is enabled when the key is absent.
HYPERPARAMETER_TUNING = config.get("hyperparameter_tuning", True)


In [3]:
# Candidate hyperparameter values for grid search, one grid per model family.
# Each key is an estimator parameter name; each list holds the values to try.

# Logistic regression: values tried for `C`.
logistic_regression_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Random forest: number of trees and depth limit (None is one of the candidates).
random_forest_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
}

# XGBoost: number of trees, depth limit and learning rate.
xgboost_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
}

In [4]:
def train_and_evaluate_model(X_train, y_train, X_val, y_val, X_test, y_test, model_type, param_grid=None, random_state=42,
                             reports_dir="C:/Users/bhuva/Desktop/Alziemer/reports"):
    """Train a classifier, print validation/test performance and save its ROC curve.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the model
        (and to run the cross-validated grid search when tuning is active).
    X_val, y_val : validation features and labels, used for reporting only.
    X_test, y_test : test features and labels, used for reporting and for the
        ROC curve plot.
    model_type : str
        Which estimator to build. Only ``'random_forest'`` is supported.
    param_grid : dict or None
        Grid passed to ``GridSearchCV``. The search only runs when this is
        non-empty AND the notebook-level ``HYPERPARAMETER_TUNING`` flag is
        truthy; otherwise the default estimator is fitted directly.
    random_state : int
        Seed forwarded to the estimator.
    reports_dir : str
        Directory the ROC curve image ``<model_type>_roc_curve.png`` is written
        to. It is created if it does not exist. Defaults to the location the
        notebook has always used.

    Returns
    -------
    (model, metrics) : the fitted estimator and a dict with the keys
        ``<model_type>_validation_accuracy``, ``<model_type>_validation_auc_roc``,
        ``<model_type>_test_accuracy`` and ``<model_type>_test_auc_roc``.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported.

    Notes
    -----
    Column 1 of ``predict_proba`` is used as the score for AUC/ROC, so the
    target is presumably binary — confirm against the data preparation step.
    """
    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=random_state)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # HYPERPARAMETER_TUNING is the notebook-level switch loaded from the config.
    if HYPERPARAMETER_TUNING and param_grid:
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        print(f"Best hyperparameters for {model_type}: {grid_search.best_params_}")
        model = grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)

    # --- Evaluation ---
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    y_val_prob = model.predict_proba(X_val)[:, 1]  # Probabilities for ROC AUC
    y_test_prob = model.predict_proba(X_test)[:, 1]

    # Compute every score exactly once and reuse it for printing, plotting and
    # the returned metrics. Cast to plain floats so the metrics dictionary holds
    # built-in numbers whatever numeric type the scoring functions return.
    val_accuracy = float(accuracy_score(y_val, y_val_pred))
    val_auc = float(roc_auc_score(y_val, y_val_prob))
    test_accuracy = float(accuracy_score(y_test, y_test_pred))
    test_auc = float(roc_auc_score(y_test, y_test_prob))

    print(f"\n--- {model_type.upper()} ---")
    print("\nValidation Set Performance:")
    print(classification_report(y_val, y_val_pred))
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation AUC-ROC: {val_auc:.4f}")

    print("\nTest Set Performance:")
    print(classification_report(y_test, y_test_pred))
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test AUC-ROC: {test_auc:.4f}")

    # --- ROC Curve Plot ---
    fpr, tpr, _ = roc_curve(y_test, y_test_prob)  # thresholds are not needed
    os.makedirs(reports_dir, exist_ok=True)  # savefig does not create missing folders
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f'ROC curve (AUC = {test_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_type.upper()}')
    ax.legend(loc="lower right")
    fig.savefig(f"{reports_dir}/{model_type}_roc_curve.png")
    plt.close(fig)  # Close plot to free memory

    # --- Collect metrics for the caller ---
    metrics = {
        f"{model_type}_validation_accuracy": val_accuracy,
        f"{model_type}_validation_auc_roc": val_auc,
        f"{model_type}_test_accuracy": test_accuracy,
        f"{model_type}_test_auc_roc": test_auc,
    }
    return model, metrics  # Return trained model and metrics dictionary

In [5]:
if __name__ == "__main__":
    # Load processed data
    X_train_processed = pd.read_csv(f"{PROCESSED_DATA_DIR}/X_train_processed.csv")
    X_val_processed = pd.read_csv(f"{PROCESSED_DATA_DIR}/X_val_processed.csv")
    X_test_processed = pd.read_csv(f"{PROCESSED_DATA_DIR}/X_test_processed.csv")

    # Convert each single-column label frame to a Series. Squeezing only the
    # column axis means a one-row file still yields a Series, not a scalar.
    y_train = pd.read_csv(f"{PROCESSED_DATA_DIR}/y_train.csv").squeeze("columns")
    y_val = pd.read_csv(f"{PROCESSED_DATA_DIR}/y_val.csv").squeeze("columns")
    y_test = pd.read_csv(f"{PROCESSED_DATA_DIR}/y_test.csv").squeeze("columns")

    # Train and Evaluate Models & Save
    trained_models = {}
    all_metrics = {}

    # Random Forest (the grid is only passed on when tuning is enabled)
    rf_model, rf_metrics = train_and_evaluate_model(
        X_train_processed, y_train,
        X_val_processed, y_val,
        X_test_processed, y_test,
        model_type='random_forest',
        param_grid=random_forest_param_grid if HYPERPARAMETER_TUNING else None,
        random_state=RANDOM_STATE,
    )
    trained_models['random_forest'] = rf_model
    all_metrics.update(rf_metrics)

    # Save Trained Models
    os.makedirs(MODEL_DIR, exist_ok=True)  # make sure the target folder exists
    model_path = f"{MODEL_DIR}/trained_models.joblib"
    joblib.dump(trained_models, model_path)
    # Report the path that was actually written rather than a fixed string.
    print(f"Trained models saved to '{model_path}'")

    # Save Metrics to JSON
    with open(METRICS_FILE, 'w') as outfile:
        json.dump(all_metrics, outfile, indent=4)
    print(f"Model metrics saved to '{METRICS_FILE}'")

Best hyperparameters for random_forest: {'max_depth': 15, 'n_estimators': 300}

--- RANDOM_FOREST ---

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.74      0.79      0.77      6535
           1       0.67      0.62      0.64      4607

    accuracy                           0.72     11142
   macro avg       0.71      0.70      0.70     11142
weighted avg       0.71      0.72      0.71     11142

Validation Accuracy: 0.7167
Validation AUC-ROC: 0.7946

Test Set Performance:
              precision    recall  f1-score   support

           0       0.75      0.79      0.77      6536
           1       0.68      0.63      0.66      4607

    accuracy                           0.73     11143
   macro avg       0.72      0.71      0.72     11143
weighted avg       0.73      0.73      0.73     11143

Test Accuracy: 0.7275
Test AUC-ROC: 0.8022
Trained models saved to 'models/trained_models.joblib'
Model metrics saved to 'C:/Users/bhuva/