In [None]:
! pip install -r ../requirements.txt

In [None]:
# Enterprise Data Science ML Pipeline for Bank Telemarketing (bank-additional-full.csv)
# Author: ChatGPT (GPT-5 Thinking mini)
# Purpose: End-to-end reproducible ML pipeline (no neural nets, no SVMs)
# Instructions: point CSV_PATH (defined below) at your copy of 'bank-additional-full.csv' and run in a Python environment (conda/venv)

# %% Imports
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report, roc_curve, auc)
import joblib

# Optional: SHAP for interpretation
try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False

# %% User-configurable parameters
# Location of the semicolon-separated dataset; also the default `path` of load_data().
CSV_PATH = '../data/01-bronze/bank-additional-full.csv'  # adjust path if needed
# Seed shared by row sampling, the train/test split, CV shuffling and the model constructors.
RANDOM_STATE = 42
# Fraction of rows held out as the test set by train_test_split.
TEST_SIZE = 0.2
# Default number of stratified folds used by run_model_selection().
CV_FOLDS = 5
# Passed to load_data(); only values strictly between 0 and 1 trigger sub-sampling.
SAMPLE_FRACTION = None  # set to <1.0 if you want to run a quick experiment

# %% Utility functions

def load_data(path=CSV_PATH, sample_fraction=None):
    """Read the semicolon-delimited dataset at ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file (defaults to ``CSV_PATH``).
    sample_fraction : float or None
        When strictly between 0 and 1, a random subset holding that fraction
        of the rows (seeded with ``RANDOM_STATE``) is returned; any other
        value returns the full table.
    """
    frame = pd.read_csv(path, sep=';')

    wants_subset = sample_fraction is not None and 0 < sample_fraction < 1.0
    if not wants_subset:
        return frame
    return frame.sample(frac=sample_fraction, random_state=RANDOM_STATE)


def basic_cleaning(df):
    """Return a cleaned copy of ``df``: trimmed column names and a binary target.

    * Surrounding whitespace is stripped from every string column name
      (names are NOT lower-cased).
    * If a ``'y'`` column exists, its ``'yes'``/``'no'`` labels (compared
      case-insensitively, ignoring surrounding whitespace) become 1/0.
      A target that is already numeric 0/1 is kept, so running the function
      a second time on its own output is safe.

    The input frame is left untouched; callers must use the returned frame.

    Raises
    ------
    ValueError
        If ``'y'`` contains anything other than yes/no (or 0/1).
    """
    cleaned = df.copy()
    cleaned.columns = [c.strip() if isinstance(c, str) else c for c in cleaned.columns]

    if 'y' in cleaned.columns:
        target = cleaned['y']
        already_binary = pd.api.types.is_numeric_dtype(target) and target.isin([0, 1]).all()
        if already_binary:
            cleaned['y'] = target.astype(int)
        else:
            normalized = target.astype(str).str.strip().str.lower()
            mapped = normalized.map({'yes': 1, 'no': 0})
            unmapped = mapped.isna()
            if unmapped.any():
                # Fail loudly with the offending labels instead of an opaque NaN-to-int cast error.
                bad_labels = sorted(set(target[unmapped].astype(str)))
                raise ValueError(f"Unexpected target labels in 'y': {bad_labels}")
            cleaned['y'] = mapped.astype(int)
    return cleaned


def get_feature_lists(df, target='y'):
    """Split the columns of ``df`` into numerical and categorical feature lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and, optionally, the target column.
    target : str, default 'y'
        Name of the target column; it is excluded from both lists.

    Returns
    -------
    (list, list)
        ``(num_cols, cat_cols)`` in the frame's column order. Every numeric
        dtype counts as numerical (not only int64/float64), so narrower
        integer/float columns are not mistaken for categoricals. All
        remaining non-target columns are treated as categorical.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    num_cols = [c for c in numeric_columns if c != target]

    # A set keeps the membership test cheap and avoids rebuilding a list per column.
    excluded = set(num_cols) | {target}
    cat_cols = [c for c in df.columns if c not in excluded]
    return num_cols, cat_cols


def evaluate_model(clf, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Returns a ``(metrics, scores)`` tuple. ``metrics`` is a dict with the keys
    ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``,
    ``confusion_matrix`` and ``classification_report``. ``scores`` holds the
    second ``predict_proba`` column when the estimator has that method,
    otherwise the ``decision_function`` output, otherwise ``None`` (in which
    case ``roc_auc`` is ``None`` as well).
    """
    predictions = clf.predict(X_test)

    # Continuous scores for ROC AUC: probabilities first, raw decision values as fallback.
    if hasattr(clf, 'predict_proba'):
        scores = clf.predict_proba(X_test)[:, 1]
    elif hasattr(clf, 'decision_function'):
        scores = clf.decision_function(X_test)
    else:
        scores = None

    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'f1': f1_score(y_test, predictions),
        'roc_auc': roc_auc_score(y_test, scores) if scores is not None else None,
        'confusion_matrix': confusion_matrix(y_test, predictions),
        'classification_report': classification_report(y_test, predictions),
    }
    return metrics, scores


def lift_score(y_true, y_proba, top_pct=0.1):
    """Lift of the top-scored slice: positive rate in the top ``top_pct`` / overall positive rate.

    Parameters
    ----------
    y_true : array-like
        Binary labels (1 = positive).
    y_proba : array-like
        Predicted scores, matched to ``y_true`` by POSITION. Any pandas index
        on either input is ignored, so a Series with a non-default index
        cannot be misaligned against the scores.
    top_pct : float, default 0.1
        Fraction of highest-scored rows to inspect; must lie in (0, 1].
        At least one row is always inspected.

    Returns
    -------
    float
        The lift, or ``nan`` when the input is empty or has no positives.

    Raises
    ------
    ValueError
        If ``top_pct`` is outside (0, 1] or the inputs differ in length.
    """
    if not 0 < top_pct <= 1:
        raise ValueError(f"top_pct must be in (0, 1], got {top_pct!r}")

    # Plain arrays drop any index so the two inputs pair up row by row.
    scored = pd.DataFrame({'y': np.asarray(y_true).ravel(),
                           'p': np.asarray(y_proba).ravel()})
    if scored.empty:
        return np.nan

    base_rate = scored['y'].mean()
    if not base_rate > 0:
        return np.nan

    # Truncate as before, but never let a small sample collapse to an empty top slice.
    cutoff = max(1, int(len(scored) * top_pct))
    top = scored.sort_values('p', ascending=False).head(cutoff)
    return top['y'].mean() / base_rate

# %% Main pipeline builder

def build_preprocessor(num_cols, cat_cols):
    """Assemble the ColumnTransformer applied ahead of every classifier.

    Numerical columns are median-imputed and standardized; categorical columns
    have missing values filled with the literal ``'missing'`` and are one-hot
    encoded into a dense matrix, ignoring categories unseen during fit.
    Columns in neither list are dropped, and output feature names carry no
    transformer prefix.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), num_cols),
            ('cat', Pipeline(steps=categorical_steps), cat_cols),
        ],
        remainder='drop',
        verbose_feature_names_out=False,
    )

from tqdm import tqdm

# %% Model candidates + hyperparameter grids

def get_models_and_grids():
    """Return ``(models, grids)``: candidate classifiers and their search grids.

    Both dicts share the keys ``logistic``, ``decision_tree``,
    ``random_forest`` and ``xgboost``. Grid keys use the ``clf__`` prefix so
    they address a pipeline step named ``clf``.
    """
    seed = RANDOM_STATE

    logistic = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=seed)
    decision_tree = DecisionTreeClassifier(random_state=seed, class_weight='balanced')
    random_forest = RandomForestClassifier(n_jobs=-1, random_state=seed, class_weight='balanced')
    xgboost_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1, random_state=seed)

    # 6 * 2 * 2 = 24 combinations
    logistic_grid = {
        'clf__C': [0.01, 0.1, 1.0, 10.0, 50.0, 100.0],
        'clf__penalty': ['l1', 'l2'],  # L1 = sparsity, L2 = standard
        'clf__solver': ['liblinear', 'saga'],  # saga allows l1/l2 with larger datasets
    }

    # 5 * 5 * 4 * 3 = 300 combinations
    decision_tree_grid = {
        'clf__max_depth': [3, 5, 10, 15, None],
        'clf__min_samples_leaf': [1, 2, 5, 10, 20],
        'clf__min_samples_split': [2, 5, 10, 20],
        'clf__max_features': [None, 'sqrt', 'log2'],
    }

    # 3 * 4 * 3 * 4 * 3 * 2 = 864 combinations
    random_forest_grid = {
        'clf__n_estimators': [100, 300, 500],
        'clf__max_depth': [5, 10, 15, None],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 5, 10],
        'clf__max_features': ['sqrt', 'log2', None],
        'clf__bootstrap': [True, False],
    }

    # 3 * 4 * 4 * 3 * 3 * 3 * 3 * 3 = 11,664 combinations
    xgboost_grid = {
        'clf__n_estimators': [100, 300, 500],
        'clf__max_depth': [3, 5, 6, 8],
        'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'clf__subsample': [0.6, 0.8, 1.0],
        'clf__colsample_bytree': [0.6, 0.8, 1.0],
        'clf__gamma': [0, 0.1, 0.2],
        'clf__reg_alpha': [0, 0.01, 0.1],
        'clf__reg_lambda': [1, 1.5, 2],
    }

    models = {
        'logistic': logistic,
        'decision_tree': decision_tree,
        'random_forest': random_forest,
        'xgboost': xgboost_clf,
    }
    grids = {
        'logistic': logistic_grid,
        'decision_tree': decision_tree_grid,
        'random_forest': random_forest_grid,
        'xgboost': xgboost_grid,
    }
    return models, grids

# %% Orchestrator: trains CV + returns best estimator per model

def run_model_selection(X_train, y_train, preprocessor, models, grids, cv=CV_FOLDS,
                        scoring='roc_auc', n_jobs=1, verbose=2, n_iter=None):
    """Tune every candidate model with cross-validation and return the refit winners.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to the search's ``fit``.
    preprocessor
        Transformer placed in front of each classifier as pipeline step ``'pre'``.
    models : dict
        Model name -> unfitted estimator (becomes pipeline step ``'clf'``).
    grids : dict
        Model name -> parameter grid; a missing name means "no tuning".
    cv : int
        Number of shuffled, stratified folds (seeded with ``RANDOM_STATE``).
    scoring, n_jobs, verbose
        Forwarded to the search object; the defaults reproduce the previous
        hard-coded settings.
    n_iter : int or None
        Optional search budget. ``None`` (default) runs the exhaustive grid
        search. When set and smaller than a model's grid, that model is tuned
        with a seeded randomized search over ``n_iter`` sampled combinations,
        which keeps very large grids tractable.

    Returns
    -------
    dict
        Model name -> best pipeline refit by the search (``best_estimator_``).
    """
    # Local import: only needed for the optional budgeted search.
    from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    best_estimators = {}
    for name, model in tqdm(models.items(), desc='Model Selection', total=len(models)):
        print(f"\nTraining and tuning: {name}")
        pipe = Pipeline([
            ('pre', preprocessor),
            ('clf', model)
        ])
        param_grid = grids.get(name, {})

        over_budget = n_iter is not None and n_iter < len(ParameterGrid(param_grid))
        if over_budget:
            search = RandomizedSearchCV(pipe, param_distributions=param_grid, n_iter=n_iter,
                                        scoring=scoring, cv=splitter, n_jobs=n_jobs,
                                        verbose=verbose, random_state=RANDOM_STATE)
        else:
            search = GridSearchCV(pipe, param_grid=param_grid, scoring=scoring, cv=splitter,
                                  n_jobs=n_jobs, verbose=verbose)
        search.fit(X_train, y_train)

        print(f"Best {name} score: {search.best_score_:.4f}")
        print(f"Best params: {search.best_params_}")
        best_estimators[name] = search.best_estimator_
    return best_estimators

from tqdm import tqdm

# %% Run everything (main)
if __name__ == '__main__':
    # End-to-end run: load -> clean -> split -> tune -> evaluate -> persist -> explain.
    print('Loading data...')
    if not os.path.exists(CSV_PATH):
        raise FileNotFoundError(f"CSV not found at {CSV_PATH}. Please place the dataset in the working directory or update CSV_PATH.")

    df = load_data(CSV_PATH, sample_fraction=SAMPLE_FRACTION)
    df = basic_cleaning(df)

    # `duration` is only known after the call, so it is removed for a realistic pre-call model.
    # This happens BEFORE the feature lists are derived so the printed lists match what is trained on.
    if 'duration' in df.columns:
        print('\nRemoving `duration` from features for realistic pre-call predictions (recommended).')
        df = df.drop(columns=['duration'])

    print('Preparing feature lists...')
    num_cols, cat_cols = get_feature_lists(df)
    print(f'Numerical columns ({len(num_cols)}): {num_cols}')
    print(f'Categorical columns ({len(cat_cols)}): {cat_cols}')

    X = df.drop(columns=['y'])
    y = df['y']

    print('Splitting train/test')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                        stratify=y, random_state=RANDOM_STATE)

    preprocessor = build_preprocessor(num_cols, cat_cols)
    models, grids = get_models_and_grids()

    best_estimators = run_model_selection(X_train, y_train, preprocessor, models, grids)

    # Evaluate each tuned pipeline on the held-out test set
    results = {}
    for name, est in best_estimators.items():
        print(f"\nEvaluating best {name} on test set")
        metrics, y_proba = evaluate_model(est, X_test, y_test)
        # Lift needs continuous scores; skipped when the model exposes none.
        if y_proba is not None:
            metrics['lift_top_10pct'] = lift_score(y_test.reset_index(drop=True), y_proba, top_pct=0.10)
        results[name] = metrics
        print(metrics['classification_report'])
        print('ROC AUC:', metrics['roc_auc'])
        print('Lift (top 10%):', metrics.get('lift_top_10pct'))

    # Choose final model (e.g., best ROC AUC on test); a missing AUC counts as 0.
    # NOTE(review): picking the winner on test-set AUC makes the reported test metrics of the
    # chosen model optimistically biased; consider selecting on the cross-validation score instead.
    best_name = max(results.items(), key=lambda kv: (kv[1]['roc_auc'] or 0))[0]
    final_model = best_estimators[best_name]
    print(f"\nSelected final model: {best_name}")

    # Save final model
    joblib.dump(final_model, f'final_model_{best_name}.joblib')
    print(f"Final model saved to final_model_{best_name}.joblib")

    # Optional: SHAP explainability
    if SHAP_AVAILABLE:
        print('\nComputing SHAP values for final model (may take time)...')
        final_pre = final_model.named_steps['pre']
        final_clf = final_model.named_steps['clf']

        # SHAP works on the matrix the classifier actually sees, i.e. after preprocessing.
        X_train_transformed = final_pre.transform(X_train)
        feature_names = final_pre.get_feature_names_out()

        # TreeExplainer only supports tree ensembles; the logistic candidate needs the
        # linear explainer, otherwise the run would fail right at the end.
        if isinstance(final_clf, (DecisionTreeClassifier, RandomForestClassifier, XGBClassifier)):
            explainer = shap.TreeExplainer(final_clf)
        else:
            explainer = shap.LinearExplainer(final_clf, X_train_transformed)
        shap_values = explainer.shap_values(X_train_transformed)

        # Depending on the SHAP version and model, per-class output arrives as a list of
        # arrays or as a 3-D array; keep the positive class (index 1) for the summary plot.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        elif getattr(shap_values, 'ndim', 2) == 3:
            shap_values = shap_values[:, :, 1]

        shap.summary_plot(shap_values, X_train_transformed, feature_names=feature_names)
    else:
        print('\nSHAP not available. To enable, `pip install shap`.')

    print('\nPipeline complete. Review results dictionary `results` for full metrics.')


In [None]:
import pprint

In [None]:
# Pretty-print the per-model metrics dict `results` built by the main cell above.
pprint.pprint(results, width=120)