In [4]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [None]:
from src.data.load_data import load_bank_data
from src.data.preprocess import split_data, BankFeatureEngineer, build_enhanced_pipeline

from src.models.xgboost_model import train_xgboost

# Load dataset
df = load_bank_data("../data/raw/bank-full.csv")

# Split
X_train, X_test, y_train, y_test = split_data(df)

# Detect engineered columns
fe = BankFeatureEngineer()
X_tmp = fe.fit_transform(X_train)

num_cols = X_tmp.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_tmp.select_dtypes(include=["object", "category"]).columns.tolist()

# Build preprocessing pipeline
preprocess_pipeline = build_enhanced_pipeline(num_cols, cat_cols)

# Train XGB
best_xgb, best_params, best_cv_f1 = train_xgboost(
    preprocess_pipeline,
    X_train,
    y_train
)

print("Best params:", best_params)
print("Best CV F1:", best_cv_f1)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__colsample_bytree': 0.9, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 8, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 400, 'classifier__subsample': 0.9}
Best CV F1: 0.3730968659897774


In [None]:
from src.models.xgboost_model import (
    find_best_threshold,
    evaluate_xgb,
    save_xgboost,
)

# Find optimal threshold
best_threshold, best_f1 = find_best_threshold(
    best_xgb,
    X_test,
    y_test
)

print("Best threshold:", best_threshold)
print("Best F1:", best_f1)

# Final evaluation
metrics = evaluate_xgb(
    best_xgb,
    X_test,
    y_test,
    best_threshold
)

print("\n--- FINAL XGBOOST ---")
for k, v in metrics.items():
    print(f"{k}: {v}")

# Save model
save_xgboost(best_xgb, best_threshold)

Best threshold: 0.1979591836734694
Best F1: 0.4958094397882664

--- FINAL XGBOOST ---
Accuracy: 0.8736038925135464
AUC: 0.7971071704625757
Precision: 0.46484698097601324
Recall: 0.5311909262759924
F1: 0.4958094397882664
MCC: 0.4251538556403974
