# Which client and campaign features best predict term-deposit subscription?

## Plan
- Load bank marketing data (Data/bank-additional-full.csv)
- Clean/prepare: treat unknown, drop leak-prone duration for main model; stratified split
- Build preprocessing: one-hot categoricals, scale numerics
- Train baseline logistic regression with class weights; evaluate ROC-AUC
- Add tree model (e.g., Gradient Boosting) to capture interactions
- Interpret: permutation importance, SHAP-like global effects (via tree feature importances), and top feature combinations with lifts
- Summarize strongest client+campaign combos predicting subscription


In [56]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingClassifier

pd.set_option('display.max_columns', 200)

data_path = Path('Data/bank-additional-full.csv')
df = pd.read_csv(data_path, sep=';')
print(df.shape)
df.head()

(41188, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [57]:
target = 'y'
y = (df[target] == 'yes').astype(int)
X = df.drop(columns=[target])

# Categorical vs numeric
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]
print('Categorical:', cat_cols)
print('Numeric:', num_cols)


Categorical: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
Numeric: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']


In [58]:
# Drop duration for main model to avoid leakage
if 'duration' in num_cols:
    num_cols.remove('duration')
    X = X.drop(columns=['duration'])

# Simple handling: keep "unknown" as category; fill numeric missing if any

preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', Pipeline([('scaler', StandardScaler())]), num_cols)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [59]:
log_clf = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=200, class_weight='balanced', n_jobs=None))
])

log_clf

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


In [60]:
log_clf.fit(X_train, y_train)
train_pred = log_clf.predict_proba(X_train)[:,1]
test_pred = log_clf.predict_proba(X_test)[:,1]
print('ROC-AUC train:', roc_auc_score(y_train, train_pred))
print('ROC-AUC test :', roc_auc_score(y_test, test_pred))
print('Test report:', classification_report(y_test, (test_pred>0.5).astype(int)))


ROC-AUC train: 0.7950978792728993
ROC-AUC test : 0.8009417454832776
Test report:               precision    recall  f1-score   support

           0       0.95      0.86      0.90      7310
           1       0.37      0.65      0.47       928

    accuracy                           0.84      8238
   macro avg       0.66      0.75      0.69      8238
weighted avg       0.88      0.84      0.85      8238



In [61]:
# Permutation importance on test set
feature_names = X_test.columns
r = permutation_importance(log_clf, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1)
importances = pd.DataFrame({
    'feature': feature_names,
    'importance_mean': r.importances_mean,
    'importance_std': r.importances_std
}).sort_values(by='importance_mean', ascending=False)
importances.head(20)


Unnamed: 0,feature,importance_mean,importance_std
14,emp.var.rate,0.217286,0.003068
15,cons.price.idx,0.17429,0.00232
8,month,0.074921,0.002758
17,euribor3m,0.061811,0.001017
18,nr.employed,0.054479,0.001404
16,cons.conf.idx,0.019665,0.001061
11,pdays,0.018014,0.00106
13,poutcome,0.005268,0.000988
1,job,0.003059,0.001445
9,day_of_week,0.002962,0.000785


In [62]:
# Tree-based model to capture interactions
# GradientBoosting is lightweight and works with preprocessed encoded data

gb_clf = Pipeline([
    ('preprocess', preprocess),
    ('model', GradientBoostingClassifier(random_state=42))
])

gb_clf.fit(X_train, y_train)
train_pred_gb = gb_clf.predict_proba(X_train)[:,1]
test_pred_gb = gb_clf.predict_proba(X_test)[:,1]
print('GB ROC-AUC train:', roc_auc_score(y_train, train_pred_gb))
print('GB ROC-AUC test :', roc_auc_score(y_test, test_pred_gb))


GB ROC-AUC train: 0.8051005922190891
GB ROC-AUC test : 0.8091446677909335


In [63]:
# Feature importances from GB (on encoded features)
imp_gb = pd.DataFrame({
    'feature': gb_clf.named_steps['preprocess'].get_feature_names_out(),
    'importance': gb_clf.named_steps['model'].feature_importances_
}).sort_values('importance', ascending=False)
imp_gb.head(20)


Unnamed: 0,feature,importance
61,num__nr.employed,0.59867
55,num__pdays,0.087661
60,num__euribor3m,0.070736
59,num__cons.conf.idx,0.054604
52,cat__poutcome_success,0.038529
43,cat__month_oct,0.022617
53,num__age,0.019678
50,cat__poutcome_failure,0.013182
46,cat__day_of_week_mon,0.011549
58,num__cons.price.idx,0.011317


In [64]:
# Identify top feature combinations (simple approach):
# Take top categorical features and compute lift for their top categories

def top_lift_pairs(df, y_series, top_features, min_count=50):
    results = []
    base_rate = y_series.mean()
    for feat in top_features:
        if feat not in df.columns:
            continue
        for val, sub in df.groupby(feat):
            if len(sub) < min_count:
                continue
            rate = y_series.loc[sub.index].mean()
            lift = rate / base_rate
            results.append((feat, val, len(sub), rate, lift))
    res_df = pd.DataFrame(results, columns=['feature','value','count','conversion','lift']).sort_values('lift', ascending=False)
    return res_df

cat_order = (importances[importances['importance_mean']>0]
             .assign(col=lambda d: d['feature'].str.split('__').str[0])
             .groupby('col')['importance_mean'].sum()
             .sort_values(ascending=False))

# Choose top categorical columns by summed permutation importance
top_cats = [c for c in cat_order.head(6).index if c in df.columns]
print('Top categorical fields for combo search:', top_cats)

lift_table = top_lift_pairs(df, y, top_cats, min_count=80)
lift_table.head(20)


Top categorical fields for combo search: ['emp.var.rate', 'cons.price.idx', 'month', 'euribor3m', 'nr.employed', 'cons.conf.idx']


Unnamed: 0,feature,value,count,conversion,lift
44,euribor3m,0.715,135,0.62963,5.589049
23,cons.price.idx,93.876,212,0.575472,5.108304
123,cons.conf.idx,-40.0,212,0.575472,5.108304
20,cons.price.idx,93.369,264,0.568182,5.043593
129,cons.conf.idx,-34.8,264,0.568182,5.043593
105,nr.employed,5008.7,650,0.567692,5.039248
122,cons.conf.idx,-40.3,311,0.565916,5.023484
29,cons.price.idx,94.215,311,0.565916,5.023484
22,cons.price.idx,93.749,174,0.557471,4.948519
130,cons.conf.idx,-34.6,174,0.557471,4.948519


In [65]:
# Column-level permutation importance (raw columns) to rank fields
from sklearn.metrics import roc_auc_score

def column_permutation_importance(model, X, y, cols, n_repeats=5, random_state=42):
    rng = np.random.default_rng(random_state)
    base_pred = model.predict_proba(X)[:, 1]
    base_auc = roc_auc_score(y, base_pred)
    rows = []
    for col in cols:
        drops = []
        for i in range(n_repeats):
            X_shuffled = X.copy()
            X_shuffled[col] = rng.permutation(X_shuffled[col].values)
            pred = model.predict_proba(X_shuffled)[:, 1]
            drops.append(base_auc - roc_auc_score(y, pred))
        rows.append((col, np.mean(drops)))
    imp_df = pd.DataFrame(rows, columns=['feature', 'importance']).sort_values('importance', ascending=False)
    return imp_df

raw_cols = X_train.columns.tolist()
col_importance = column_permutation_importance(log_clf, X_test, y_test, raw_cols, n_repeats=5, random_state=42)
col_importance.head(10)

Unnamed: 0,feature,importance
14,emp.var.rate,0.375138
15,cons.price.idx,0.057237
8,month,0.024865
7,contact,0.016276
11,pdays,0.007628
17,euribor3m,0.006732
18,nr.employed,0.006335
9,day_of_week,0.00313
13,poutcome,0.002203
1,job,0.000666


In [66]:
# Evaluate models using top-k raw features to find a compact set

def evaluate_top_k(top_features, k_values):
    results = []
    for k in k_values:
        use_cols = top_features[:k]
        use_cat = [c for c in use_cols if c in cat_cols]
        use_num = [c for c in use_cols if c in num_cols]
        pre = ColumnTransformer([
            ('cat', OneHotEncoder(handle_unknown='ignore'), use_cat),
            ('num', Pipeline([('scaler', StandardScaler())]), use_num)
        ])
        clf = Pipeline([
            ('preprocess', pre),
            ('model', LogisticRegression(max_iter=200, class_weight='balanced'))
        ])
        clf.fit(X_train[use_cols], y_train)
        auc = roc_auc_score(y_test, clf.predict_proba(X_test[use_cols])[:,1])
        results.append((k, auc))
    return pd.DataFrame(results, columns=['k_features','test_auc']).sort_values('test_auc', ascending=False)

k_grid = [5, 8, 12, 15, len(col_importance)]
k_grid = sorted(set([k for k in k_grid if k <= len(col_importance)]))
compact_results = evaluate_top_k(col_importance['feature'].tolist(), k_grid)
compact_results

Unnamed: 0,k_features,test_auc
2,12,0.804031
3,15,0.80229
1,8,0.801854
4,19,0.800942
0,5,0.797961


In [69]:
# Fit and evaluate using the best-k features from the top-k sweep
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

best_row = compact_results.sort_values('test_auc', ascending=False).iloc[0]
best_k = int(best_row['k_features'])
top_features = col_importance['feature'].tolist()[:best_k]

use_cat = [c for c in top_features if c in cat_cols]
use_num = [c for c in top_features if c in num_cols]

best_pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), use_cat),
    ('num', Pipeline([('scaler', StandardScaler())]), use_num)
])

best_clf = Pipeline([
    ('preprocess', best_pre),
    ('model', LogisticRegression(max_iter=200, class_weight='balanced'))
])

best_clf.fit(X_train[top_features], y_train)

test_proba = best_clf.predict_proba(X_test[top_features])[:, 1]
test_pred = (test_proba >= 0.5).astype(int)

auc_score = roc_auc_score(y_test, test_proba)
acc = accuracy_score(y_test, test_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, test_pred, average='binary', zero_division=0)
cm = confusion_matrix(y_test, test_pred)
cm_df = pd.DataFrame(cm, index=['true no','true yes'], columns=['pred no','pred yes'])

print(f'Best-k pipeline (k={best_k}) | sweep AUC={best_row.test_auc:.3f} | refit AUC={auc_score:.3f}')
print('Classification report (test):')
print(classification_report(y_test, test_pred, target_names=['no','yes'], zero_division=0))
print(f'Accuracy={acc:.3f} | Precision={prec:.3f} | Recall={rec:.3f} | F1={f1:.3f}')
cm_df


Best-k pipeline (k=12) | sweep AUC=0.804 | refit AUC=0.804
Classification report (test):
              precision    recall  f1-score   support

          no       0.95      0.86      0.90      7310
         yes       0.37      0.64      0.47       928

    accuracy                           0.84      8238
   macro avg       0.66      0.75      0.69      8238
weighted avg       0.88      0.84      0.85      8238

Accuracy=0.836 | Precision=0.368 | Recall=0.643 | F1=0.468


Unnamed: 0,pred no,pred yes
true no,6286,1024
true yes,331,597


In [74]:
# Boost recall without changing threshold: class-weight sweep at 0.5 cutoff
cw_grid = [1.0, 2.0, 3.0, 4.0, 5.0]
rows = []
for w in cw_grid:
    pre = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), [c for c in top_features if c in cat_cols]),
        ('num', Pipeline([('scaler', StandardScaler())]), [c for c in top_features if c in num_cols])
    ])
    clf = Pipeline([
        ('preprocess', pre),
        ('model', LogisticRegression(max_iter=300, class_weight={0:1, 1:w}))
    ])
    clf.fit(X_train[top_features], y_train)
    proba = clf.predict_proba(X_test[top_features])[:,1]
    pred = (proba >= 0.5).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, pred, average='binary', zero_division=0)
    acc = accuracy_score(y_test, pred)
    auc = roc_auc_score(y_test, proba)
    rows.append((w, auc, acc, prec, rec, f1))

cw_df = pd.DataFrame(rows, columns=['pos_class_weight','auc','accuracy','precision','recall','f1'])
print('Class-weight sweep (threshold=0.5):')
print(cw_df.sort_values('recall', ascending=False))

best_cw = cw_df.sort_values(['recall','precision'], ascending=[False, False]).iloc[0]['pos_class_weight']
print(f"Selected pos_class_weight={best_cw} for higher recall at fixed threshold 0.5")


Class-weight sweep (threshold=0.5):
   pos_class_weight       auc  accuracy  precision    recall        f1
4               5.0  0.803755  0.860403   0.419565  0.623922  0.501733
3               4.0  0.803832  0.873027   0.451560  0.592672  0.512582
2               3.0  0.803821  0.883588   0.483938  0.503233  0.493397
1               2.0  0.803790  0.892328   0.529078  0.401940  0.456828
0               1.0  0.803947  0.901797   0.705882  0.219828  0.335251
Selected pos_class_weight=5.0 for higher recall at fixed threshold 0.5


In [None]:
# Interactive prediction for most important features
import ipywidgets as widgets
from IPython.display import display, clear_output

# Use top_features from best-k cell
sel_features = top_features

# Build widgets for selected features
cat_opts = {c: sorted(df[c].dropna().unique().tolist()) for c in sel_features if c in cat_cols}
num_stats = {c: (df[c].quantile(0.01), df[c].median(), df[c].quantile(0.99)) for c in sel_features if c in num_cols}

widgets_dict = {}
for c, opts in cat_opts.items():
    widgets_dict[c] = widgets.Dropdown(options=opts, value=opts[0], description=c, layout=widgets.Layout(width='300px'))
for c, (lo, med, hi) in num_stats.items():
    widgets_dict[c] = widgets.FloatText(value=float(med), description=c, layout=widgets.Layout(width='300px'))

predict_btn = widgets.Button(description='Predict subscription', button_style='success')
out = widgets.Output()


use_thr = 0.2

@predict_btn.on_click
def _predict(_):
    with out:
        clear_output()
        row = {c: widgets_dict[c].value for c in sel_features}
        X_single = pd.DataFrame([row])
        proba = best_clf.predict_proba(X_single[sel_features])[:,1][0]
        pred = int(proba >= use_thr)
        print(f"Predicted probability of subscription: {proba:.3f}")
        print(f"Decision at threshold {use_thr:.2f}: {'yes' if pred==1 else 'no'}")
        print(f"Using features: {sel_features}")
        print(f"(Adjust class-weight/threshold cells above to change the decision policy)")

control_box = widgets.VBox([widgets_dict[c] for c in sel_features])
display(widgets.VBox([control_box, predict_btn, out]))


VBox(children=(VBox(children=(FloatText(value=1.1, description='emp.var.rate', layout=Layout(width='300px')), …

## Notes
- duration excluded from main models to avoid post-outcome leakage.
- Permutation importance on test set highlights strongest individual predictors; lift table surfaces high-impact client+campaign segments.
- For a stricter combo search, consider pairwise groupby on top 2-3 features or a shallow decision tree with max_depth=3 and reading its rules.
