In [1]:
# model_comparison_full.py
import pandas as pd
import numpy as np
import time
import optuna
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# =======================
# 1. Load and preprocess data
# =======================
df = pd.read_csv("Bank_Marketing.csv")

# Encode categorical variables
cat_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Split features and target
X = df.drop("TARGET", axis=1)
y = df["TARGET"]

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =======================
# 2. Model initialization
# =======================
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# =======================
# 3. Evaluate model
# =======================
def evaluate_model(name, model):
    start = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
    end = time.time()

    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_proba),
        "Training Time (s)": end - start
    }

results = []

# =======================
# 4. Base model evaluation
# =======================
for name, model in models.items():
    results.append(evaluate_model(name + " (Base)", model))

# =======================
# 5. RandomizedSearchCV tuning
# =======================
tuning_params = {
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l2"]
    },
    "Decision Tree": {
        "max_depth": [3, 5, 10, 20],
        "min_samples_split": [2, 5, 10]
    },
    "Random Forest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [5, 10, 15],
        "min_samples_split": [2, 5, 10]
    },
    "SVM": {
        "C": [0.1, 1, 10],
        "gamma": ['scale', 'auto'],
        "kernel": ["rbf", "linear"]
    },
    "XGBoost": {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 7]
    }
}

for name in models:
    rsearch = RandomizedSearchCV(models[name], tuning_params[name], n_iter=10, cv=3, scoring="f1", n_jobs=-1)
    rsearch.fit(X_train, y_train)
    results.append(evaluate_model(name + " (RandomSearchCV)", rsearch.best_estimator_))

# =======================
# 6. Optuna tuning
# =======================
def optuna_objective(trial, model_name):
    if model_name == "Logistic Regression":
        model = LogisticRegression(C=trial.suggest_float("C", 0.01, 10.0), penalty="l2")
    elif model_name == "Decision Tree":
        model = DecisionTreeClassifier(
            max_depth=trial.suggest_int("max_depth", 3, 20),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20))
    elif model_name == "Random Forest":
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            max_depth=trial.suggest_int("max_depth", 5, 20),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20))
    elif model_name == "SVM":
        model = SVC(
            C=trial.suggest_float("C", 0.1, 10.0),
            gamma=trial.suggest_categorical("gamma", ["scale", "auto"]),
            kernel=trial.suggest_categorical("kernel", ["rbf", "linear"]),
            probability=True)
    elif model_name == "XGBoost":
        model = XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            use_label_encoder=False, eval_metric='logloss')

    score = cross_val_score(model, X_train, y_train, cv=3, scoring="f1").mean()
    return score

for model_name in models:
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: optuna_objective(trial, model_name), n_trials=20)
    best_params = study.best_params
    best_model = None

    if model_name == "Logistic Regression":
        best_model = LogisticRegression(**best_params)
    elif model_name == "Decision Tree":
        best_model = DecisionTreeClassifier(**best_params)
    elif model_name == "Random Forest":
        best_model = RandomForestClassifier(**best_params)
    elif model_name == "SVM":
        best_model = SVC(**best_params, probability=True)
    elif model_name == "XGBoost":
        best_model = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')

    results.append(evaluate_model(model_name + " (Optuna)", best_model))

# =======================
# 7. Save & Show Results
# =======================
results_df = pd.DataFrame(results)
results_df.to_csv("results/model_comparison_all.csv", index=False)
print("\n===== Final Comparison Table =====")
print(results_df.sort_values(by="F1 Score", ascending=False).reset_index(drop=True))

[I 2025-04-11 13:36:48,756] A new study created in memory with name: no-name-b2ea6e8e-3e23-4d94-b6e2-23f912efc84d
[I 2025-04-11 13:36:49,258] Trial 0 finished with value: 0.33082894416924025 and parameters: {'C': 9.80412504519501}. Best is trial 0 with value: 0.33082894416924025.
[I 2025-04-11 13:36:49,375] Trial 1 finished with value: 0.33082894416924025 and parameters: {'C': 8.950961476107652}. Best is trial 0 with value: 0.33082894416924025.
[I 2025-04-11 13:36:49,476] Trial 2 finished with value: 0.33088896494566283 and parameters: {'C': 1.5384180062565578}. Best is trial 2 with value: 0.33088896494566283.
[I 2025-04-11 13:36:49,586] Trial 3 finished with value: 0.33082894416924025 and parameters: {'C': 5.836099872911405}. Best is trial 2 with value: 0.33088896494566283.
[I 2025-04-11 13:36:49,707] Trial 4 finished with value: 0.33082894416924025 and parameters: {'C': 5.6463264384735945}. Best is trial 2 with value: 0.33088896494566283.
[I 2025-04-11 13:36:49,812] Trial 5 finished 


===== Final Comparison Table =====
                                   Model  Accuracy  Precision    Recall  \
0                       XGBoost (Optuna)  0.908327   0.663342  0.487626   
1                         XGBoost (Base)  0.903461   0.630072  0.483960   
2                 Random Forest (Optuna)  0.902355   0.648571  0.416132   
3                   Random Forest (Base)  0.900254   0.637155  0.402383   
4         Random Forest (RandomSearchCV)  0.900586   0.642433  0.396884   
5                 Decision Tree (Optuna)  0.894725   0.593540  0.404216   
6         Decision Tree (RandomSearchCV)  0.892071   0.573248  0.412466   
7                   Decision Tree (Base)  0.873383   0.475543  0.481210   
8                           SVM (Optuna)  0.896163   0.627090  0.343721   
9                   SVM (RandomSearchCV)  0.896052   0.626043  0.343721   
10                            SVM (Base)  0.895389   0.653277  0.283226   
11              XGBoost (RandomSearchCV)  0.896384   0.696429  0