# Term deposit marketing

## Plan

## Load dataset

In [None]:
import toml

# Load the configuration file
config = toml.load('../config.toml')

# Accessing values
path_encoded_data = config['paths']['encoded_data']
path_raw_data = config['paths']['raw_data']
output_dir = config['paths']['output_dir']

In [49]:
import pandas as pd
from IPython.display import display, Markdown


# Load the dataset
file_path = '../data/term-deposit-marketing-2020.csv'
df = pd.read_csv(file_path)

# Display basic information about the dataset
df.info(), df.head(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  object
dtypes: int64(5), object(9)
memory usage: 4.3+ MB


(None,
    age           job  marital  education default  balance housing loan  \
 0   58    management  married   tertiary      no     2143     yes   no   
 1   44    technician   single  secondary      no       29     yes   no   
 2   33  entrepreneur  married  secondary      no        2     yes  yes   
 3   47   blue-collar  married    unknown      no     1506     yes   no   
 4   33       unknown   single    unknown      no        1      no   no   
 
    contact  day month  duration  campaign   y  
 0  unknown    5   may       261         1  no  
 1  unknown    5   may       151         1  no  
 2  unknown    5   may        76         1  no  
 3  unknown    5   may        92         1  no  
 4  unknown    5   may       198         1  no  ,
                 age        balance           day      duration      campaign
 count  40000.000000   40000.000000  40000.000000  40000.000000  40000.000000
 mean      40.544600    1274.277550     16.017225    254.824300      2.882175
 std        

## Encoding and balancing

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Separate features and target variable
X = df.drop('y', axis=1)
y = df['y']

# Encode target variable (yes/no to 1/0)
y = y.map({'yes': 1, 'no': 0})

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

df_test_processed = pd.DataFrame(X_test_processed.todense(),
             columns=preprocessor.get_feature_names_out())


# Apply preprocessing
X_processed = preprocessor.fit_transform(X_train)

# Split data into training and test sets
from imblearn.combine import SMOTETomek

# Apply SMOTETomek on the training data
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(
    X_processed, y_train)

df_train_balanced = pd.DataFrame(X_train_balanced.todense(),
             columns=preprocessor.get_feature_names_out())

## PyCaret

In [52]:
from pycaret.classification import *

# Initialize PyCaret with the balanced data
clf = setup(data=X_train_balanced, target=y_train_balanced,
            gpu=True, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Original data shape,"(59314, 45)"
4,Transformed data shape,"(59314, 45)"
5,Transformed train set shape,"(41519, 45)"
6,Transformed test set shape,"(17795, 45)"
7,Numeric features,44
8,Preprocess,True
9,Imputation type,simple


In [53]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9757,0.9979,0.9892,0.9633,0.9761,0.9515,0.9518,6.127
rf,Random Forest Classifier,0.9683,0.9966,0.9847,0.9534,0.9688,0.9366,0.9371,3.551
xgboost,Extreme Gradient Boosting,0.9644,0.9958,0.967,0.962,0.9645,0.9288,0.9288,0.487
lightgbm,Light Gradient Boosting Machine,0.9606,0.9953,0.9689,0.9531,0.961,0.9212,0.9214,0.687
knn,K Neighbors Classifier,0.9383,0.975,0.9979,0.8917,0.9418,0.8766,0.8829,5.73
dt,Decision Tree Classifier,0.9375,0.9375,0.9459,0.9303,0.938,0.875,0.8751,0.309
gbc,Gradient Boosting Classifier,0.9326,0.9829,0.9559,0.9135,0.9342,0.8653,0.8662,1.676
ada,Ada Boost Classifier,0.9113,0.969,0.9183,0.9056,0.9119,0.8225,0.8227,0.639
lr,Logistic Regression,0.8802,0.9395,0.8894,0.8733,0.8813,0.7604,0.7605,2.251
svm,SVM - Linear Kernel,0.8794,0.9375,0.9028,0.8627,0.8821,0.7587,0.76,0.125


In [54]:
tuned_model = tune_model(best_model, optimize='AUC')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8882,0.9552,0.8988,0.8802,0.8894,0.7765,0.7767
1,0.8964,0.9613,0.908,0.8875,0.8976,0.7929,0.7931
2,0.8885,0.953,0.907,0.8746,0.8905,0.777,0.7775
3,0.8902,0.9535,0.9056,0.8785,0.8918,0.7803,0.7807
4,0.8815,0.9472,0.894,0.8722,0.883,0.763,0.7632
5,0.8919,0.9564,0.8945,0.8898,0.8921,0.7837,0.7837
6,0.8772,0.9495,0.8984,0.8618,0.8797,0.7543,0.755
7,0.8825,0.9452,0.8998,0.8696,0.8845,0.7649,0.7654
8,0.8846,0.9509,0.8984,0.8744,0.8862,0.7693,0.7696
9,0.8836,0.9518,0.8931,0.8766,0.8848,0.7673,0.7674


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [55]:
final_model = finalize_model(tuned_model)

In [73]:
# 1. Process the test data using the preprocessor
X_test_processed = preprocessor.transform(X_test)

# 2. Predict using the final model
predictions = predict_model(final_model,
                            data=pd.DataFrame(X_test_processed.todense(),
                                              columns = [f'feature_{i}' for i in range(1, 45)]))

In [129]:
# 3. Evaluate the model
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, predictions['prediction_label']))
print(roc_auc_score(y_test, predictions['prediction_label']))

confusion_matrix(y_test, predictions['prediction_label'])
# Save the final model
# save_model(final_model, '../models/term_deposit_model')

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      7424
           1       0.53      0.41      0.46       576

    accuracy                           0.93      8000
   macro avg       0.74      0.69      0.71      8000
weighted avg       0.92      0.93      0.93      8000

0.6929178639846744


array([[7208,  216],
       [ 337,  239]], dtype=int64)

## TPOT

Exploring different models with TPOT

In [112]:
# Tpot model
from tpot import TPOTClassifier


tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      random_state=42, scoring='roc_auc_ovo_weighted')

tpot.fit(df_train_balanced, y_train_balanced, use_dask=True)

print(tpot.score(df_test_processed, y_test))

tpot.export('../models/tpot_term_deposit_pipeline.py')

0.9438088369552203


In [123]:
tpot.score(df_test_processed, y_test)

0.9438088369552203

In [1]:
tpot.feature_importances_

NameError: name 'tpot' is not defined

In [126]:
# Predict the labels for the test set
y_pred = tpot.predict(df_test_processed)

# Generate and print the classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# confussion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      7424
           1       0.51      0.56      0.53       576

    accuracy                           0.93      8000
   macro avg       0.74      0.76      0.75      8000
weighted avg       0.93      0.93      0.93      8000



array([[7119,  305],
       [ 256,  320]], dtype=int64)

## H2O

In [227]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()
conf_H2O = config['settings'].get('H2O')

# Convert your balanced train data to H2OFrame
x = df_train_balanced.columns
y = 'target'
df_train_balanced[y] = y_train_balanced
df_test_processed[y] = y_test

# Convert your data
train = h2o.H2OFrame(df_train_balanced)
test = h2o.H2OFrame(df_test_processed)

# Ensure response is a factor for binary classification
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,9 hours 18 mins
H2O_cluster_timezone:,America/Mexico_City
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,2 days
H2O_cluster_name:,H2O_from_python_Guill_ue6mzn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.545 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [234]:
# Run AutoML for 20 base models
aml = H2OAutoML(**conf_H2O)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
# lb = aml.leaderboard
lb = lb.sort("auc", ascending=False)
display(Markdown('## AutoML Leaderboard'))
display(lb.head(rows=lb.nrows))

# Make predictions on the test set
predictions = aml.leader.predict(h2o.H2OFrame(df_test_processed))
display(Markdown('## Predictions'))
display(predictions.head())

## AutoML Leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_1_AutoML_1_20240901_02633,0.997767,0.0616004,0.99787,0.0252554,0.135385,0.018329
StackedEnsemble_BestOfFamily_1_AutoML_1_20240901_02633,0.997611,0.0640649,0.997667,0.024952,0.136746,0.0186996
GBM_grid_1_AutoML_1_20240901_02633_model_5,0.997564,0.0687213,0.997619,0.0254915,0.142319,0.0202547
GBM_grid_1_AutoML_1_20240901_02633_model_1,0.997237,0.0676257,0.997407,0.0274303,0.141003,0.019882
GBM_grid_1_AutoML_1_20240901_02633_model_4,0.99719,0.0675724,0.99737,0.0282058,0.142434,0.0202873
GBM_4_AutoML_1_20240901_02633,0.997093,0.0706389,0.997237,0.0284081,0.145338,0.021123
GBM_1_AutoML_1_20240901_02633,0.996886,0.0723143,0.997038,0.0301109,0.147442,0.0217392
GBM_3_AutoML_1_20240901_02633,0.996695,0.0742823,0.9969,0.0304818,0.148807,0.0221436
GBM_2_AutoML_1_20240901_02633,0.996465,0.0770773,0.996691,0.0314091,0.151349,0.0229066
GBM_5_AutoML_1_20240901_02633,0.99618,0.0801592,0.996411,0.032387,0.153763,0.0236431


AttributeError: 'NoneType' object has no attribute 'predict'

In [None]:
h2o.H2OFrame(df_test_processed.assign(target=y_test))

In [218]:
mc_plot = aml.model_correlation_heatmap(h2o.H2OFrame(df_test_processed.assign(target=y_test)))

# or if some subset of the models is needed a slice of leaderboard can be used, e.g., using MAE as the sorting metric
mc_plot = h2o.model_correlation_heatmap(aml.leaderboard.sort("mae").head(10), test)

# or even extended leaderboard can be used
mc_plot = h2o.model_correlation_heatmap(h2o.automl.get_leaderboard(aml, extra_columns="training_time_ms").sort("training_time_ms").head(10), test)

# also more complicated queries on leaderboard can be used, e.g., model correlation between 5 fastest models to train and Stacked Ensembles
leaderboard = h2o.automl.get_leaderboard(aml, extra_columns="training_time_ms").sort("training_time_ms")
mc_plot = h2o.model_correlation_heatmap(leaderboard.head(5).rbind(leaderboard[leaderboard["model_id"].grep("StackedEnsemble", output_logical=True)]), test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


OSError: Job with key $03017f00000132d4ffffffff$_b3212a25e4309573412ce2a71f6a46c4 failed with an exception: java.lang.IllegalArgumentException: Test/Validation dataset has a categorical response column 'target' with no levels in common with the model
stacktrace: 
java.lang.IllegalArgumentException: Test/Validation dataset has a categorical response column 'target' with no levels in common with the model
	at hex.Model.adaptTestForTrain(Model.java:1825)
	at hex.Model.adaptTestForTrain(Model.java:1643)
	at hex.Model.adaptTestForTrain(Model.java:1639)
	at hex.Model.adaptFrameForScore(Model.java:1981)
	at hex.Model.score(Model.java:1999)
	at water.api.ModelMetricsHandler$1.compute2(ModelMetricsHandler.java:555)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1704)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:976)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


In [213]:
aml.leader.fair_shap_plot(frame=h2o.H2OFrame(df_test_processed.assign(target=y_test)),
                          column='target',
                          protected_columns=["None"])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Column None not found
  Request: POST /99/Rapids
    data: {'ast': "(tmp= py_18_sid_ab3e (unique (cols_py Key_Frame__upload_984acee654effc263cd4fb4f81a44444.hex 'None') False))", 'session_id': '_sid_ab3e'}
