In [6]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [7]:
from src.data.load_data import load_bank_data
from src.data.preprocess import split_data, BankFeatureEngineer, build_enhanced_pipeline

from src.models.knn import train_knn

# Load dataset
df = load_bank_data("../data/raw/bank-full.csv")

# Split
X_train, X_test, y_train, y_test = split_data(df)

# Detect engineered columns
fe = BankFeatureEngineer()
X_tmp = fe.fit_transform(X_train)

num_cols = X_tmp.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X_tmp.select_dtypes(include=["object", "category"]).columns.tolist()

# Build preprocessing pipeline
preprocess_pipeline = build_enhanced_pipeline(num_cols, cat_cols)

# Train KNN
best_knn, best_params, best_cv_f1 = train_knn(
    preprocess_pipeline,
    X_train,
    y_train
)

print("Best params:", best_params)
print("Best CV F1:", best_cv_f1)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 5, 'classifier__weights': 'distance'}
Best CV F1: 0.30129150255297354


In [9]:
from src.models.knn import (
    find_best_threshold,
    evaluate_knn,
    save_knn,
)

# Find optimal threshold
best_threshold, best_f1 = find_best_threshold(
    best_knn,
    X_test,
    y_test
)

print("Best threshold:", best_threshold)
print("Best F1:", best_f1)

# Final evaluation
metrics = evaluate_knn(
    best_knn,
    X_test,
    y_test,
    best_threshold
)

print("\n--- FINAL KNN ---")
for k, v in metrics.items():
    print(f"{k}: {v}")

# Save model
save_knn(best_knn, best_threshold)


Best threshold: 0.263265306122449
Best F1: 0.3770343580470163

--- FINAL KNN ---
Accuracy: 0.8476169412805485
AUC: 0.7030850022430999
Precision: 0.3613518197573657
Recall: 0.3941398865784499
F1: 0.3770343580470163
MCC: 0.29076748069046
