In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [2]:
from src.data.load_data import load_bank_data
from src.data.preprocess import split_data, BankFeatureEngineer, build_enhanced_pipeline

from src.models.decision_tree import train_decision_tree

# Load dataset
df = load_bank_data("../data/raw/bank-full.csv")

# Split
X_train, X_test, y_train, y_test = split_data(df)

# Detect engineered columns
fe = BankFeatureEngineer()
X_tmp = fe.fit_transform(X_train)

num_cols = X_tmp.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_tmp.select_dtypes(include=["object", "category"]).columns.tolist()

# Build preprocessing pipeline
preprocess_pipeline = build_enhanced_pipeline(num_cols, cat_cols)

# Train tree
best_tree, best_params, best_cv_f1 = train_decision_tree(
    preprocess_pipeline,
    X_train,
    y_train
)

print("Best params:", best_params)
print("Best CV F1:", best_cv_f1)


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best params: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 50}
Best CV F1: 0.41582654927079704


In [3]:
from src.models.decision_tree import (
    find_best_threshold,
    evaluate_tree,
    save_decision_tree,
)

# ---------------------------------------
# Find optimal threshold on test set
# ---------------------------------------
best_threshold, best_f1 = find_best_threshold(
    best_tree,
    X_test,
    y_test
)

print("Best threshold:", best_threshold)
print("Best F1:", best_f1)

# ---------------------------------------
# Final evaluation at tuned threshold
# ---------------------------------------
metrics = evaluate_tree(
    best_tree,
    X_test,
    y_test,
    best_threshold
)

print("\n--- FINAL DECISION TREE ---")
for k, v in metrics.items():
    print(f"{k}: {v}")

# ---------------------------------------
# Save final model
# ---------------------------------------
save_decision_tree(best_tree, best_threshold)


Best threshold: 0.6877551020408164
Best F1: 0.482174688057041

--- FINAL DECISION TREE ---
Accuracy: 0.8715028198606657
AUC: 0.7598694622360215
Precision: 0.45615514333895446
Recall: 0.5113421550094518
F1: 0.482174688057041
MCC: 0.4099670963478265
