<a href="https://colab.research.google.com/github/GVSU-CIS635/projects-team-1-1/blob/main/RandomForest2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    classification_report,
    make_scorer
)

from sklearn.preprocessing import label_binarize



**Data load**

In [3]:
print("\n[Step 1: Loading Data]")
df_train = pd.read_csv("https://raw.githubusercontent.com/GVSU-CIS635/projects-team-1-1/main/data/train.csv", sep=";")
df_test = pd.read_csv("https://raw.githubusercontent.com/GVSU-CIS635/projects-team-1-1/main/data/test.csv", sep=";")

print(f"Training data shape: {df_train.shape}")
print(f"Test data shape: {df_test.shape}")


[Step 1: Loading Data]
Training data shape: (45211, 17)
Test data shape: (4521, 17)


**Data splitting**

In [4]:
# Split into features & labels
X_train = df_train.drop("y", axis=1)
y_train = df_train["y"]

X_test = df_test.drop("y", axis=1)
y_test = df_test["y"]




** IDENTIFY NUMERIC AND CATEGORICAL COLUMNS**

In [5]:


numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Numeric features ({len(numeric_cols)}): {numeric_cols}")
print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")



Numeric features (7): ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
Categorical features (9): ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


**PREPROCESSING**

In [19]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_cols),

        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)




**RANDOM FOREST PIPELINE**

In [31]:
rf_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])


rf_param_grid = {
    'clf__n_estimators': [100, 150],          # Moderate number
    'clf__max_depth': [12, 15],               # Limited depth
    'clf__min_samples_split': [10, 15],       # Higher values = less overfitting
    'clf__min_samples_leaf': [5, 7],          # Larger leaves = smoother model
    'clf__max_features': ['sqrt'],
}

print("\n[Hyperparameter Grid]")
for param, values in rf_param_grid.items():
    print(f"  {param}: {values}")
print(f"Total combinations: {len(rf_param_grid['clf__n_estimators']) * len(rf_param_grid['clf__max_depth']) * len(rf_param_grid['clf__min_samples_split'])}")

cv_model = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\n[Running GridSearchCV...]")

import time
start = time.time()

gs_rf = GridSearchCV(
    rf_pipe,
    rf_param_grid,
    cv=cv_model,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

gs_rf.fit(X_train, y_train)
best_rf = gs_rf.best_estimator_

end = time.time()
print(f"\n GridSearchCV completed in {(end-start):.2f} seconds")

print(f"\nBest Hyperparameters:")
for param, value in gs_rf.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest CV F1 Score (weighted): {gs_rf.best_score_:.4f}")




RANDOM FOREST - HYPERPARAMETER TUNING

[Hyperparameter Grid]
  clf__n_estimators: [100, 150]
  clf__max_depth: [12, 15]
  clf__min_samples_split: [10, 15]
  clf__min_samples_leaf: [5, 7]
  clf__max_features: ['sqrt']
Total combinations: 8

[Running GridSearchCV...]
Fitting 5 folds for each of 16 candidates, totalling 80 fits

 GridSearchCV completed in 269.54 seconds

Best Hyperparameters:
  clf__max_depth: 15
  clf__max_features: sqrt
  clf__min_samples_leaf: 5
  clf__min_samples_split: 10
  clf__n_estimators: 150

Best CV F1 Score (weighted): 0.8819


**CROSS-VALIDATION ON TRAINING DATA**


In [32]:

cv_scores = cross_validate(
    best_rf,
    X_train,
    y_train,
    cv=cv_model,
    scoring={
        "accuracy": "accuracy",
        "f1_weighted": "f1_weighted",
        "f1_yes": make_scorer(f1_score, pos_label='yes'),
        "precision_yes": make_scorer(precision_score, pos_label='yes', zero_division=0),
        "recall_yes": make_scorer(recall_score, pos_label='yes'),
        "roc_auc": "roc_auc_ovr_weighted"
    },
    return_train_score=False,
    n_jobs=-1
)

cv_acc_mean = cv_scores["test_accuracy"].mean()
cv_acc_std = cv_scores["test_accuracy"].std()
cv_f1_mean = cv_scores["test_f1_weighted"].mean()
cv_f1_std = cv_scores["test_f1_weighted"].std()
cv_auc_mean = cv_scores["test_roc_auc"].mean()
cv_auc_std = cv_scores["test_roc_auc"].std()
cv_f1_yes_mean = cv_scores["test_f1_yes"].mean()
cv_prec_yes_mean = cv_scores["test_precision_yes"].mean()
cv_rec_yes_mean = cv_scores["test_recall_yes"].mean()

print(f"\nOverall Metrics:")
print(f"  Accuracy:     {cv_acc_mean:.4f} (±{cv_acc_std:.4f})")
print(f"  F1-weighted:  {cv_f1_mean:.4f} (±{cv_f1_std:.4f})")
print(f"  ROC-AUC:      {cv_auc_mean:.4f} (±{cv_auc_std:.4f})")

print(f"\n'Yes' Class Performance:")
print(f"  Precision:    {cv_prec_yes_mean:.4f}")
print(f"  Recall:       {cv_rec_yes_mean:.4f}")
print(f"  F1-score:     {cv_f1_yes_mean:.4f}")



CROSS-VALIDATION RESULTS (5-Fold)

Overall Metrics:
  Accuracy:     0.8665 (±0.0031)
  F1-weighted:  0.8819 (±0.0026)
  ROC-AUC:      0.9284 (±0.0027)

'Yes' Class Performance:
  Precision:    0.4610
  Recall:       0.8315
  F1-score:     0.5931


**FINAL TEST SET EVALUATION**


In [35]:


y_pred = best_rf.predict(X_test)
y_proba = best_rf.predict_proba(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
test_f1_yes = f1_score(y_test, y_pred, pos_label='yes')
test_precision = precision_score(y_test, y_pred, average='weighted')
test_recall = recall_score(y_test, y_pred, average='weighted')
test_precision_yes = precision_score(y_test, y_pred, pos_label='yes', zero_division=0)
test_recall_yes = recall_score(y_test, y_pred, pos_label='yes')
test_auc = roc_auc_score(y_test, y_proba[:, 1])

cm = confusion_matrix(y_test, y_pred, labels=['no', 'yes'])
report = classification_report(y_test, y_pred)

print("\n----- FINAL TEST RESULTS -----")
print(f"\nOverall Metrics:")
print(f"  Test Accuracy:      {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"  Test Precision:     {test_precision:.4f}")
print(f"  Test Recall:        {test_recall:.4f}")
print(f"  Test F1 (weighted): {test_f1_weighted:.4f}")
print(f"  Test ROC-AUC:       {test_auc:.4f}")

print("\nConfusion Matrix:")
print("                 Predicted")
print("                 no    yes")
print(f"Actual  no    {cm[0,0]:5d} {cm[0,1]:5d}")
print(f"        yes   {cm[1,0]:5d} {cm[1,1]:5d}")

total_yes_predictions = cm[0,1] + cm[1,1]
actual_yes = cm[1,0] + cm[1,1]
if total_yes_predictions > 0:
    print(f"\nPrecision Check: {cm[1,1]} correct out of {total_yes_predictions} 'yes' predictions = {cm[1,1]/total_yes_predictions*100:.1f}%")
print(f"Recall Check: {cm[1,1]} detected out of {actual_yes} actual 'yes' = {cm[1,1]/actual_yes*100:.1f}%")

print("\n" + "="*80)
print("Classification Report:")
print("="*80)
print(report)



----- FINAL TEST RESULTS -----

Overall Metrics:
  Test Accuracy:      0.8755 (87.55%)
  Test Precision:     0.9307
  Test Recall:        0.8755
  Test F1 (weighted): 0.8913
  Test ROC-AUC:       0.9619

Confusion Matrix:
                 Predicted
                 no    yes
Actual  no     3474   526
        yes      37   484

Precision Check: 484 correct out of 1010 'yes' predictions = 47.9%
Recall Check: 484 detected out of 521 actual 'yes' = 92.9%

Classification Report:
              precision    recall  f1-score   support

          no       0.99      0.87      0.93      4000
         yes       0.48      0.93      0.63       521

    accuracy                           0.88      4521
   macro avg       0.73      0.90      0.78      4521
weighted avg       0.93      0.88      0.89      4521

