# Train All 6 Classification Models - Bank Marketing (UCI)
Predicts whether a client subscribes to a term deposit and saves each trained model as a `.pkl` file.

In [None]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Seed passed to the train/test split and to every estimator that accepts one.
RANDOM_STATE: int = 42
# Fraction of the rows held out as the test set.
TEST_SIZE: float = 0.2
# Column name the target is renamed to after the dataset is loaded.
TARGET_COLUMN: str = "y"

In [None]:
# Download the dataset from OpenML and join features and target into one frame.
print("Fetching Bank Marketing dataset...")
data = fetch_openml(
    name="bank-marketing",
    version=1,
    as_frame=True,
    parser="auto",
)
df = pd.concat([data.data, data.target], axis=1)
df.columns = df.columns.astype(str)

# The target was concatenated last, so the final column is the label;
# give it the canonical name used by the rest of the notebook.
target_col = df.columns[-1]
df = df.rename(columns={target_col: TARGET_COLUMN})

# Normalise the label text: cast to str, trim whitespace, lower-case.
df[TARGET_COLUMN] = (
    df[TARGET_COLUMN]
    .astype(str)
    .str.strip()
    .str.lower()
)

# Everything except the target is treated as an input feature.
feature_cols = [name for name in df.columns if name != TARGET_COLUMN]

print(f"Shape: {df.shape}, Features: {len(feature_cols)}")
print(f"Target classes: {df[TARGET_COLUMN].unique().tolist()}")

In [None]:
# Artifacts are written next to this file when it runs as a script; when
# __file__ is undefined (NameError) the current working directory is used.
try:
    base_dir = Path(__file__).resolve().parent
except NameError:
    base_dir = Path.cwd()
output_dir = base_dir
output_dir.mkdir(parents=True, exist_ok=True)

# Split the features by dtype: numeric columns are scaled, the rest one-hot encoded.
numeric_cols = (
    df[feature_cols]
    .select_dtypes(include=[np.number])
    .columns
    .tolist()
)
cat_cols = [name for name in feature_cols if name not in numeric_cols]

# Keep only the transformer steps that actually have columns to work on.
transformers = [
    (label, transformer, columns)
    for label, transformer, columns in (
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
    )
    if columns
]
# Fallback when neither group matched any column: scale whatever features exist.
if not transformers:
    transformers.append(("num", StandardScaler(), feature_cols))

preprocessor = ColumnTransformer(transformers, remainder="passthrough")

In [None]:
# Separate the inputs from the label and encode the label as integers.
x = df[feature_cols]
y = df[TARGET_COLUMN]
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Stratified hold-out split so both partitions keep the class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_enc,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_enc,
)

# Fit the preprocessing on the training partition only.
preprocessor.fit(x_train)
print(f"Train: {len(x_train)}, Test: {len(x_test)}")

In [None]:
def make_models():
    """Build the six classifier pipelines, keyed by display name.

    Each pipeline has two steps: ``"prep"`` (preprocessing) followed by
    ``"model"`` (the classifier). Every pipeline receives its own unfitted
    copy of the module-level ``preprocessor`` so that fitting one pipeline
    cannot alter the preprocessing step held by another.

    Returns:
        dict: display name -> unfitted ``Pipeline``, in insertion order.
    """
    # Local import: ``clone`` is not among the file-level imports.
    from sklearn.base import clone

    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
        "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "Naive Bayes": GaussianNB(),
        "Random Forest (Ensemble)": RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
        "XGBoost (Ensemble)": XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=4,
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=RANDOM_STATE,
        ),
    }
    return {
        name: Pipeline([("prep", clone(preprocessor)), ("model", classifier)])
        for name, classifier in classifiers.items()
    }

def evaluate_model(model, x_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and, ideally,
            ``predict_proba``.
        x_test: Feature rows to score.
        y_test: True labels for ``x_test``.

    Returns:
        dict: metric name -> value for ``"Accuracy"``, ``"AUC"``,
        ``"Precision"``, ``"Recall"``, ``"F1 Score"`` and ``"MCC"``.
        ``"AUC"`` is ``0.0`` when it cannot be computed; a ``RuntimeWarning``
        is issued in that case so the placeholder is not mistaken for a
        real score.
    """
    import warnings

    preds = model.predict(x_test)
    try:
        # Column 1 of predict_proba is used as the positive-class score.
        probs = model.predict_proba(x_test)[:, 1]
        auc = roc_auc_score(y_test, probs)
    except (AttributeError, IndexError, ValueError) as exc:
        # No predict_proba, a single probability column, or an undefined AUC:
        # keep the historical 0.0 fallback but surface the reason.
        warnings.warn(
            f"AUC could not be computed ({exc!r}); reporting 0.0",
            RuntimeWarning,
            stacklevel=2,
        )
        auc = 0.0

    binary_options = {"average": "binary", "zero_division": 0}
    return {
        "Accuracy": accuracy_score(y_test, preds),
        "AUC": auc,
        "Precision": precision_score(y_test, preds, **binary_options),
        "Recall": recall_score(y_test, preds, **binary_options),
        "F1 Score": f1_score(y_test, preds, **binary_options),
        "MCC": matthews_corrcoef(y_test, preds),
    }

# Fit every pipeline, record its test metrics, and pickle it to output_dir.
models = make_models()
metrics = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(x_train, y_train)
    metrics[name] = evaluate_model(model, x_test, y_test)

    # File-system friendly name: lower-case, spaces -> "_", parentheses removed.
    safe_name = name.lower().translate(str.maketrans({" ": "_", "(": None, ")": None}))
    with (output_dir / f"{safe_name}.pkl").open("wb") as f:
        pickle.dump(model, f)
print("Done!")

In [None]:
# Persist the per-model metrics as JSON.
(output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2))

# Save the held-out rows together with their original (decoded) labels.
test_df = x_test.assign(**{TARGET_COLUMN: le.inverse_transform(y_test)})
test_df.to_csv(output_dir / "test_data.csv", index=False)

# Record the feature list, target name and class labels.
feature_info = {
    "features": feature_cols,
    "target": TARGET_COLUMN,
    "classes": le.classes_.tolist(),
}
(output_dir / "feature_names.json").write_text(json.dumps(feature_info, indent=2))

# Keep the fitted label encoder so predictions can be mapped back to labels.
with (output_dir / "label_encoder.pkl").open("wb") as f:
    pickle.dump(le, f)

print("Saved to:", output_dir)
# Final expression: the metrics table, one row per model, rounded to 4 decimals.
pd.DataFrame(metrics).T.round(4)