# Train/Eval split (random stratified 80/20)
Loading the full dataset and creating train/eval pandas DataFrames.


In [66]:
import pandas as pd
from pathlib import Path

data_path = Path('Data/bank-additional-full.csv')
df = pd.read_csv(data_path, sep=';')
df.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [67]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(
    df, test_size=0.2, stratify=df["y"], random_state=42
)

print(f"Train rows: {len(train_df)}  Eval rows: {len(eval_df)}")
print('Train y counts: ', train_df['y'].value_counts())
print('Eval y counts: ', eval_df['y'].value_counts())


Train rows: 32950  Eval rows: 8238
Train y counts:  y
no     29238
yes     3712
Name: count, dtype: int64
Eval y counts:  y
no     7310
yes     928
Name: count, dtype: int64


In [68]:
# Split sanity checks
print('Train months:')
display(train_df['month'].value_counts())
print('Eval months:')
display(eval_df['month'].value_counts())


Train months:


month
may    11011
jul     5763
aug     4948
jun     4247
nov     3266
apr     2085
oct      587
sep      464
mar      436
dec      143
Name: count, dtype: int64

Eval months:


month
may    2758
jul    1411
aug    1230
jun    1071
nov     835
apr     547
oct     131
mar     110
sep     106
dec      39
Name: count, dtype: int64

## Model: predict high vs low response
Using logistic regression with one-hot encoding to classify clients as high-response (`yes`) or low-response (`no`).


In [69]:
# Prepare features/target
target_map = {'yes': 1, 'no': 0}
y_train = train_df['y'].map(target_map)
y_eval = eval_df['y'].map(target_map)

X_train = train_df.drop(columns=['y'])
X_eval = eval_df.drop(columns=['y'])

cat_cols = [c for c in X_train.columns if X_train[c].dtype == 'object']
num_cols = [c for c in X_train.columns if X_train[c].dtype != 'object']
print(f"Categorical cols: {len(cat_cols)}  Numeric cols: {len(num_cols)}")
for n in num_cols:
    print(n)
print("Categorical cols:")
for c in cat_cols:
    print(c)

Categorical cols: 10  Numeric cols: 10
age
duration
campaign
pdays
previous
emp.var.rate
cons.price.idx
cons.conf.idx
euribor3m
nr.employed
Categorical cols:
job
marital
education
default
housing
loan
contact
month
day_of_week
poutcome


In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('numeric', 'passthrough', num_cols),
    ],
    remainder='drop'
)

clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=500, class_weight='balanced', n_jobs=-1)),
])
clf


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,500


In [71]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

clf.fit(X_train, y_train)

eval_pred = clf.predict(X_eval)
eval_proba = clf.predict_proba(X_eval)[:, 1]

print(f'Eval ROC-AUC: {roc_auc_score(y_eval, eval_proba):.3f}')
print('Classification report (eval):')
print(classification_report(y_eval, eval_pred, target_names=['low-response', 'high-response']))

cm = pd.DataFrame(
    confusion_matrix(y_eval, eval_pred),
    index=['true low', 'true high'],
    columns=['pred low', 'pred high'],
)
cm


Eval ROC-AUC: 0.942
Classification report (eval):
               precision    recall  f1-score   support

 low-response       0.99      0.86      0.92      7310
high-response       0.44      0.91      0.60       928

     accuracy                           0.86      8238
    macro avg       0.72      0.88      0.76      8238
 weighted avg       0.93      0.86      0.88      8238



Unnamed: 0,pred low,pred high
true low,6252,1058
true high,82,846


In [72]:
# Score the eval set to prioritize calls
scored_eval = eval_df.copy()
scored_eval['high_response_score'] = eval_proba
scored_eval[['high_response_score', 'y']].describe()


Unnamed: 0,high_response_score
count,8238.0
mean,0.26956
std,0.31605
min,0.003136
25%,0.040548
50%,0.095912
75%,0.443857
max,1.0


In [73]:
# Top 5 likely responders
scored_eval.sort_values('high_response_score', ascending=False)[['high_response_score', 'y', 'job', 'education', 'contact']]


# create a GUI that lets people input features and get a prediction from the model


Unnamed: 0,high_response_score,y,job,education,contact
7727,1.000000,yes,unemployed,professional.course,telephone
20996,1.000000,no,admin.,high.school,cellular
39171,1.000000,yes,admin.,university.degree,cellular
10456,1.000000,yes,blue-collar,basic.4y,telephone
18284,1.000000,yes,admin.,university.degree,telephone
...,...,...,...,...,...
4036,0.004551,no,blue-collar,basic.9y,telephone
4114,0.003823,no,entrepreneur,university.degree,telephone
5564,0.003667,no,admin.,university.degree,telephone
4139,0.003609,no,management,basic.4y,telephone


## Validation: cross-validation, lift, threshold
Extra checks to verify high/low-response predictions are meaningful. Run after fitting the model.


In [None]:
# Stratified CV on train set to check stability
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import precision_recall_curve, auc

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_proba = cross_val_predict(clf, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
cv_pred = (cv_proba >= 0.5).astype(int)

print("Train CV ROC-AUC:", roc_auc_score(y_train, cv_proba))
print("Train CV classification report:")
print(classification_report(y_train, cv_pred, target_names=["low-response", "high-response"]))


In [None]:
# Lift / decile analysis on eval set
import numpy as np

scored_eval = eval_df.copy()
scored_eval['high_response_score'] = eval_proba

scored_eval = scored_eval.sort_values("high_response_score", ascending=False)
scored_eval["decile"] = pd.qcut(scored_eval["high_response_score"], 10, labels=False, duplicates="drop")

lift = scored_eval.groupby("decile")["y"].apply(lambda s: (s == "yes").mean()).sort_index(ascending=False)
baseline_rate = (eval_df["y"] == "yes").mean()
lift_df = pd.DataFrame({
    "response_rate": lift,
    "lift_vs_baseline": lift / baseline_rate
})
print("Baseline eval response rate:", baseline_rate)
lift_df


In [None]:
# Threshold tuning helper: choose cutoff to meet call capacity
call_capacity = 0.2  # fraction of eval set you can call; adjust as needed
cutoff = scored_eval["high_response_score"].quantile(1 - call_capacity)
print(f"Calling top {call_capacity*100:.0f}% => score cutoff {cutoff:.3f}")

subset = scored_eval[scored_eval["high_response_score"] >= cutoff]
precision = (subset["y"] == "yes").mean()
recall = (subset["y"] == "yes").sum() / (eval_df["y"] == "yes").sum()
print(f"Precision among called: {precision:.3f}")
print(f"Recall of all responders captured: {recall:.3f}")


## Model comparison: KNN
K-nearest neighbors with one-hot encoded categories and scaling for comparison to logistic regression.


In [76]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Pipeline: one-hot categorical + passthrough numeric (scaled) for KNN
knn_preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('numeric', StandardScaler(with_mean=False), num_cols),
    ],
    remainder='drop'
)

knn_clf = Pipeline(steps=[
    ('preprocess', knn_preprocess),
    ('model', KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1)),
])
knn_clf


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,n_neighbors,10
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,-1


In [77]:
# Fit/evaluate KNN on the same split
knn_clf.fit(X_train, y_train)
knn_pred = knn_clf.predict(X_eval)
knn_proba = knn_clf.predict_proba(X_eval)[:, 1]

print(f'KNN Eval ROC-AUC: {roc_auc_score(y_eval, knn_proba):.3f}')
print('Classification report (eval):')
print(classification_report(y_eval, knn_pred, target_names=['low-response', 'high-response']))

knn_cm = pd.DataFrame(
    confusion_matrix(y_eval, knn_pred),
    index=['true low', 'true high'],
    columns=['pred low', 'pred high'],
)
knn_cm


KNN Eval ROC-AUC: 0.910
Classification report (eval):
               precision    recall  f1-score   support

 low-response       0.93      0.97      0.95      7310
high-response       0.65      0.42      0.51       928

     accuracy                           0.91      8238
    macro avg       0.79      0.70      0.73      8238
 weighted avg       0.90      0.91      0.90      8238



Unnamed: 0,pred low,pred high
true low,7098,212
true high,539,389


## Model comparison: Decision Tree
Decision Tree with one-hot encoded categorical features for comparison.


In [78]:
from sklearn.tree import DecisionTreeClassifier

# One-hot encode categorical, passthrough numeric (no scaling needed for trees)
tree_preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('numeric', 'passthrough', num_cols),
    ],
    remainder='drop'
)

tree_clf = Pipeline(steps=[
    ('preprocess', tree_preprocess),
    ('model', DecisionTreeClassifier(
        max_depth=None,
        min_samples_leaf=20,
        class_weight='balanced',
        random_state=42,
    )),
])
tree_clf


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,20
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [79]:
# Fit/evaluate Decision Tree on the same split
tree_clf.fit(X_train, y_train)
tree_pred = tree_clf.predict(X_eval)
tree_proba = tree_clf.predict_proba(X_eval)[:, 1]

print(f'Decision Tree Eval ROC-AUC: {roc_auc_score(y_eval, tree_proba):.3f}')
print('Classification report (eval):')
print(classification_report(y_eval, tree_pred, target_names=['low-response', 'high-response']))

tree_cm = pd.DataFrame(
    confusion_matrix(y_eval, tree_pred),
    index=['true low', 'true high'],
    columns=['pred low', 'pred high'],
)
tree_cm


Decision Tree Eval ROC-AUC: 0.927
Classification report (eval):
               precision    recall  f1-score   support

 low-response       0.99      0.85      0.91      7310
high-response       0.44      0.90      0.59       928

     accuracy                           0.86      8238
    macro avg       0.71      0.88      0.75      8238
 weighted avg       0.92      0.86      0.88      8238



Unnamed: 0,pred low,pred high
true low,6236,1074
true high,89,839


## Model comparison: Naive Bayes
Gaussian Naive Bayes on one-hot encoded features for comparison (dense input).


In [81]:
from sklearn.naive_bayes import GaussianNB

# One-hot encode categorical as dense + passthrough numeric for Naive Bayes
nb_preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('numeric', 'passthrough', num_cols),
    ],
    remainder='drop'
)

nb_clf = Pipeline(steps=[
    ('preprocess', nb_preprocess),
    ('model', GaussianNB()),
])
nb_clf


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,priors,
,var_smoothing,1e-09


In [82]:
# Fit/evaluate Naive Bayes on the same split
nb_clf.fit(X_train, y_train)
nb_pred = nb_clf.predict(X_eval)
nb_proba = nb_clf.predict_proba(X_eval)[:, 1]

print(f'Naive Bayes Eval ROC-AUC: {roc_auc_score(y_eval, nb_proba):.3f}')
print('Classification report (eval):')
print(classification_report(y_eval, nb_pred, target_names=['low-response', 'high-response']))

nb_cm = pd.DataFrame(
    confusion_matrix(y_eval, nb_pred),
    index=['true low', 'true high'],
    columns=['pred low', 'pred high'],
)
nb_cm


Naive Bayes Eval ROC-AUC: 0.840
Classification report (eval):
               precision    recall  f1-score   support

 low-response       0.94      0.91      0.92      7310
high-response       0.43      0.53      0.48       928

     accuracy                           0.87      8238
    macro avg       0.68      0.72      0.70      8238
 weighted avg       0.88      0.87      0.87      8238



Unnamed: 0,pred low,pred high
true low,6658,652
true high,435,493
