In [10]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [11]:
from src.data.load_data import load_bank_data
from src.data.preprocess import split_data, BankFeatureEngineer, build_enhanced_pipeline

from src.models.naive_bayes import train_naive_bayes

# Load dataset
df = load_bank_data("../data/raw/bank-full.csv")

# Split
X_train, X_test, y_train, y_test = split_data(df)

# Detect engineered columns
fe = BankFeatureEngineer()
X_tmp = fe.fit_transform(X_train)

num_cols = X_tmp.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_tmp.select_dtypes(include=["object", "category"]).columns.tolist()

# Build preprocessing pipeline
preprocess_pipeline = build_enhanced_pipeline(num_cols, cat_cols)

# Train NB
best_nb, best_params, best_cv_f1 = train_naive_bayes(
    preprocess_pipeline,
    X_train,
    y_train
)

print("Best params:", best_params)
print("Best CV F1:", best_cv_f1)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params: {'classifier__var_smoothing': np.float64(1e-12)}
Best CV F1: 0.3999681408568771


In [12]:
from src.models.naive_bayes import (
    find_best_threshold,
    evaluate_nb,
    save_naive_bayes,
)

# Find optimal threshold
best_threshold, best_f1 = find_best_threshold(
    best_nb,
    X_test,
    y_test
)

print("Best threshold:", best_threshold)
print("Best F1:", best_f1)

# Final evaluation
metrics = evaluate_nb(
    best_nb,
    X_test,
    y_test,
    best_threshold
)

print("\n--- FINAL NAIVE BAYES ---")
for k, v in metrics.items():
    print(f"{k}: {v}")

# Save model
save_naive_bayes(best_nb, best_threshold)

Best threshold: 0.8836734693877552
Best F1: 0.4235827664399093

--- FINAL NAIVE BAYES ---
Accuracy: 0.8594492977994028
AUC: 0.757271313296552
Precision: 0.4071490845684394
Recall: 0.44139886578449905
F1: 0.4235827664399093
MCC: 0.344062001685739
