# Term deposit marketing

## Plan

## Load dataset

In [1]:
import pathlib

import toml
import pandas as pd

from imblearn.combine import SMOTETomek
from IPython.display import display, Markdown
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, roc_auc_score, confusion_matrix

# Load the configuration file
config = toml.load('../config.toml')

# Accessing values
path_encoded_data = config['paths']['encoded_data']
path_raw_data = config['paths']['raw_data']
output_dir = config['paths']['output_dir']

In [2]:
# Load the dataset
df = pd.read_parquet(config['paths']['raw_pdata'])

# Display basic information about the dataset
df.info(), df.head(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  object
dtypes: int64(5), object(9)
memory usage: 4.3+ MB


(None,
    age           job  marital  education default  balance housing loan  \
 0   58    management  married   tertiary      no     2143     yes   no   
 1   44    technician   single  secondary      no       29     yes   no   
 2   33  entrepreneur  married  secondary      no        2     yes  yes   
 3   47   blue-collar  married    unknown      no     1506     yes   no   
 4   33       unknown   single    unknown      no        1      no   no   
 
    contact  day month  duration  campaign   y  
 0  unknown    5   may       261         1  no  
 1  unknown    5   may       151         1  no  
 2  unknown    5   may        76         1  no  
 3  unknown    5   may        92         1  no  
 4  unknown    5   may       198         1  no  ,
                 age        balance           day      duration      campaign
 count  40000.000000   40000.000000  40000.000000  40000.000000  40000.000000
 mean      40.544600    1274.277550     16.017225    254.824300      2.882175
 std        

In [3]:
# Define a custom recall function for class 1 with logging
def recall_class_1_function(y_true, y_pred, **kwargs):
    # Print or log y_pred to check if class 1 is being predicted
    print("y_pred distribution:", pd.Series(y_pred).value_counts())  # Logging prediction distribution
    return recall_score(y_true, y_pred, pos_label=1, **kwargs)

## Encoding and balancing

In [4]:
target = 'y'
# Encode target variable (yes/no to 1/0)
df[target] = df[target].map({'yes': 1, 'no': 0})

# Define X and y
X = df.drop('y', axis=1)
y = df[target]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=df[target], random_state=0)

## PyCaret

In [5]:
import pycaret.classification as pycaret

# Initialize PyCaret with the balanced data
clf = pycaret.setup(data=X, target=y,
            use_gpu=True, session_id=123,
            log_experiment = True, experiment_name = 'predict-term_deposit-marketing-2020',
            fix_imbalance_method="smote_tomek")

  File "c:\Users\Guill\miniconda3\envs\term_deposit\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> i

Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Original data shape,"(40000, 14)"
4,Transformed data shape,"(40000, 42)"
5,Transformed train set shape,"(28000, 42)"
6,Transformed test set shape,"(12000, 42)"
7,Numeric features,5
8,Categorical features,8
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


2024/09/21 00:50:57 INFO mlflow.tracking.fluent: Experiment with name 'predict-term_deposit-marketing-2020' does not exist. Creating a new experiment.


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [6]:
# Add the custom metric to PyCaret
pycaret.add_metric(
    id='recall_class_1',              # Unique identifier for the metric
    name='Recall Class 1',            # Display name
    score_func=recall_class_1_function, # The scoring function defined above
    greater_is_better=True,            # Recall should be maximized
    target='pred',                     # The target is the prediction, not probabilities
    multiclass=False                   # Not using multiclass, focus is on binary recall
)
best_model = pycaret.compare_models(
    sort='Recall Class 1',  # Sort models by the custom recall metric
    verbose=False           # Do not print out all the logs
)



In [66]:
tuned_model = pycaret.tune_model(best_model, optimize='recall_class_1',
                                 custom_scorer='recall_class_1',
                                #  threshold=0.3
                                 )

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Recall Class 1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.82,0.8584,0.7277,0.2466,0.3684,0.2921,0.3507,0.7277
1,0.83,0.8789,0.7871,0.2686,0.4005,0.3282,0.3931,0.7871
2,0.8282,0.8517,0.7079,0.2531,0.3729,0.2983,0.3517,0.7079
3,0.8218,0.8959,0.7833,0.259,0.3892,0.3145,0.3811,0.7833
4,0.8257,0.8775,0.7635,0.2605,0.3885,0.3143,0.3766,0.7635
5,0.845,0.9047,0.803,0.2926,0.4289,0.361,0.423,0.803
6,0.8246,0.8797,0.7882,0.2632,0.3946,0.3207,0.3872,0.7882
7,0.8311,0.8829,0.7783,0.2696,0.4005,0.3282,0.3911,0.7783
8,0.8246,0.8743,0.7537,0.2576,0.3839,0.3093,0.3704,0.7537
9,0.8246,0.8736,0.7537,0.2576,0.3839,0.3093,0.3704,0.7537


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).




In [67]:
final_model = pycaret.finalize_model(tuned_model)



In [68]:
# 2. Predict using the final model
predictions = pycaret.predict_model(final_model, data=X_test)

print(classification_report(y_test, predictions['prediction_label']))
print(roc_auc_score(y_test, predictions['prediction_label']))
print(recall_class_1_function(y_test, predictions['prediction_label']))

confusion_matrix(y_test, predictions['prediction_label'])

# Save the final model
pathlib.Path("../models/pyCaret/term_deposit_model").mkdir(
    parents=True, exist_ok=True
)
pycaret.save_model(final_model, '../models/pyCaret/term_deposit_model')

              precision    recall  f1-score   support

           0       0.97      0.44      0.61      7421
           1       0.10      0.81      0.18       579

    accuracy                           0.47      8000
   macro avg       0.53      0.63      0.39      8000
weighted avg       0.90      0.47      0.57      8000

0.6253967234373629
y_pred distribution: 1    4619
0    3381
Name: prediction_label, dtype: int64
0.8100172711571675
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age', 'balance', 'day',
                                              'duration', 'campaign'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None,
                                     include...
                                     include=['job', 'marital', 'education',
                 

## H2O

In [7]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()
conf_H2O = config['settings'].get('H2O')

# Convert your balanced train data to H2OFrame
df_train = pd.concat([X_train, y_train], axis=1)  # Combine features and target
df_test = pd.concat([X_test, y_test], axis=1)    # Combine features and target

# Define target variable
y = 'target'

# Convert your data
train = h2o.H2OFrame(df_train)
test = h2o.H2OFrame(df_test)

# Ensure response is a factor for binary classification
train[target] = train[target].asfactor()
test[target] = test[target].asfactor()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 22.0.2+9-70, mixed mode, sharing)
  Starting server from C:\Users\Guill\miniconda3\envs\term_deposit\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Guill\AppData\Local\Temp\tmpakhicg3y
  JVM stdout: C:\Users\Guill\AppData\Local\Temp\tmpakhicg3y\h2o_Guill_started_from_python.out
  JVM stderr: C:\Users\Guill\AppData\Local\Temp\tmpakhicg3y\h2o_Guill_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Mexico_City
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,22 days
H2O_cluster_name:,H2O_from_python_Guill_5zcfko
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.839 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [95]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.metrics import recall_score

# Initialize H2O
h2o.init()

# Define a custom recall function for class 1
class CustomRecallFunc:
    def map(self, pred, act, w, o, model):
        # Calculate recall for class 1
        true_positives = sum((act == 1) & (pred == 1))
        false_negatives = sum((act == 1) & (pred == 0))
        return [true_positives, true_positives + false_negatives]

    def reduce(self, l, r):
        return [l[0] + r[0], l[1] + r[1]]

    def metric(self, l):
        return l[0] / l[1] if l[1] > 0 else 0  # Recall = TP / (TP + FN)

# Upload the custom metric
custom_recall_func = h2o.upload_custom_metric(CustomRecallFunc, func_name="recall_class_1")

# Run AutoML for 20 base models with custom recall metric
aml = H2OAutoML(custom_metric_func=custom_recall_func, **conf_H2O)
aml.train(x=list(X_train.columns), y=target, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb = lb.sort("recall_class_1", ascending=False)  # Sort by custom recall metric
display(Markdown('## H2O Leaderboard'))
display(lb.head(rows=lb.nrows))

# Make predictions on the test set
predictions = aml.leader.predict(test)
display(Markdown('## Predictions'))
display(predictions.head())

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,19 secs
H2O_cluster_timezone:,America/Mexico_City
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,22 days
H2O_cluster_name:,H2O_from_python_Guill_5zcfko
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.318 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


AutoML progress: |█
00:55:08.79: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████

In [97]:
mc_plot = aml.model_correlation_heatmap(
    frame=test
    )

KeyboardInterrupt: 

In [None]:
# or if some subset of the models is needed a slice of leaderboard can be used, e.g., using MAE as the sorting metric
mc_plot = h2o.model_correlation_heatmap(aml.leaderboard.sort("mae").head(10), test)

In [None]:
# or even extended leaderboard can be used
mc_plot = h2o.model_correlation_heatmap(h2o.automl.get_leaderboard(aml, extra_columns="training_time_ms").sort("training_time_ms").head(10), test)

In [None]:
# also more complicated queries on leaderboard can be used, e.g., model correlation between 5 fastest models to train and Stacked Ensembles
leaderboard = h2o.automl.get_leaderboard(aml, extra_columns="training_time_ms").sort("training_time_ms")
mc_plot = h2o.model_correlation_heatmap(leaderboard.head(5).rbind(leaderboard[leaderboard["model_id"].grep("StackedEnsemble", output_logical=True)]), test)

In [213]:
aml.leader.fair_shap_plot(frame=test,
                          column='target',
                          protected_columns=["None"])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Column None not found
  Request: POST /99/Rapids
    data: {'ast': "(tmp= py_18_sid_ab3e (unique (cols_py Key_Frame__upload_984acee654effc263cd4fb4f81a44444.hex 'None') False))", 'session_id': '_sid_ab3e'}
