In [2]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [3]:
from src.data.load_data import load_bank_data
from src.data.preprocess import split_data, BankFeatureEngineer, build_enhanced_pipeline

from src.models.random_forest import train_random_forest

# Load dataset
df = load_bank_data("../data/raw/bank-full.csv")

# Split
X_train, X_test, y_train, y_test = split_data(df)

# Detect engineered columns
fe = BankFeatureEngineer()
X_tmp = fe.fit_transform(X_train)

num_cols = X_tmp.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_tmp.select_dtypes(include=["object", "category"]).columns.tolist()

# Build preprocessing pipeline
preprocess_pipeline = build_enhanced_pipeline(num_cols, cat_cols)

# Train RF
best_rf, best_params, best_cv_f1 = train_random_forest(
    preprocess_pipeline,
    X_train,
    y_train
)

print("Best params:", best_params)
print("Best CV F1:", best_cv_f1)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params: {'classifier__max_depth': None, 'classifier__max_features': 0.5, 'classifier__min_samples_leaf': 10, 'classifier__n_estimators': 400}
Best CV F1: 0.4737716642994835


In [4]:
from src.models.random_forest import (
    find_best_threshold,
    evaluate_rf,
    save_random_forest,
)

# Find optimal threshold
best_threshold, best_f1 = find_best_threshold(
    best_rf,
    X_test,
    y_test
)

print("Best threshold:", best_threshold)
print("Best F1:", best_f1)

# Final evaluation
metrics = evaluate_rf(
    best_rf,
    X_test,
    y_test,
    best_threshold
)

print("\n--- FINAL RANDOM FOREST ---")
for k, v in metrics.items():
    print(f"{k}: {v}")

# Save model
save_random_forest(best_rf, best_threshold)


Best threshold: 0.5244897959183674
Best F1: 0.4891832229580574

--- FINAL RANDOM FOREST ---
Accuracy: 0.872055733716687
AUC: 0.7989012953162415
Precision: 0.45898922949461474
Recall: 0.5236294896030246
F1: 0.4891832229580574
MCC: 0.4175951443281807
