# Term deposit marketing

## Plan

## Load dataset

In [None]:
import os
import sys
# Change the working directory to the root of the project
while not os.path.isdir('term_deposit'):
    os.chdir("..")
print(f"{os.getcwd()}")

sys.path.append("./term_deposit")
from term_deposit.metrics import CustomRecallFunc, recall_class_1_function

import toml
import pathlib
import pandas as pd

import pycaret.classification as pycaret
from IPython.display import display, Markdown
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


# Load the configuration file
config = toml.load('config.toml')
target = 'y'

cwd=/workspaces/2-term_deposit_marketing/notebooks


## Data processing

In [2]:
# Load the dataset
df = pd.read_parquet(config['paths']["data"]['post'])

# Display basic information about the dataset
df.info(), display(df.sample(3)), display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 36 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   number_calls         40000 non-null  int64
 1   y                    40000 non-null  int64
 2   job_blue-collar      40000 non-null  uint8
 3   job_entrepreneur     40000 non-null  uint8
 4   job_housemaid        40000 non-null  uint8
 5   job_management       40000 non-null  uint8
 6   job_retired          40000 non-null  uint8
 7   job_self-employed    40000 non-null  uint8
 8   job_services         40000 non-null  uint8
 9   job_student          40000 non-null  uint8
 10  job_technician       40000 non-null  uint8
 11  job_unemployed       40000 non-null  uint8
 12  marital_married      40000 non-null  uint8
 13  marital_single       40000 non-null  uint8
 14  education_secondary  40000 non-null  uint8
 15  education_tertiary   40000 non-null  uint8
 16  default_yes          4

Unnamed: 0,number_calls,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,month_mar,month_may,month_nov,month_oct,high_balance,duration_q1,duration_q2,duration_q3,age_group_18-30,age_group_60+
31455,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
23406,9,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23765,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


Unnamed: 0,number_calls,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,month_mar,month_may,month_nov,month_oct,high_balance,duration_q1,duration_q2,duration_q3,age_group_18-30,age_group_60+
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,...,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,2.882175,0.0724,0.234575,0.035125,0.027175,0.20415,0.035925,0.03535,0.09775,0.0131,...,0.00645,0.3383,0.08995,0.002,0.39795,0.335675,0.331925,0.3324,0.145975,0.00585
std,3.239051,0.259152,0.423738,0.184098,0.162595,0.403084,0.186106,0.184665,0.29698,0.113704,...,0.080053,0.473137,0.286114,0.044677,0.489481,0.472232,0.47091,0.471079,0.353086,0.076262
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
max,63.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


(None, None, None)

### Encoding and balancing

In [3]:
# Define X and y
X = df.drop('y', axis=1)
y = df[target]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=df[target], random_state=0)

# Model training

## PyCaret

### setup

In [4]:
# Initialize PyCaret with the balanced data
clf = pycaret.setup(data=X, target=y,**config["pycaret"]["setup"])

Unnamed: 0,Description,Value
0,Session id,123
1,Target,y
2,Target type,Binary
3,Original data shape,"(40000, 36)"
4,Transformed data shape,"(40000, 36)"
5,Transformed train set shape,"(28000, 36)"
6,Transformed test set shape,"(12000, 36)"
7,Numeric features,35
8,Preprocess,True
9,Imputation type,simple


In [5]:
def recall_class_1_function(y_true, y_pred, **kwargs):
    from sklearn.metrics import recall_score
    # Print or log y_pred to check if class 1 is being predicted
    print("y_pred distribution:", pd.Series(y_pred).value_counts())  # Logging prediction distribution
    return recall_score(y_true, y_pred, pos_label=1, **kwargs)

In [6]:
# Add the custom metric to PyCaret
pycaret.add_metric(
    id='recall_class_1',              # Unique identifier for the metric
    name='Recall Class 1',            # Display name
    score_func=recall_class_1_function, # The scoring function defined above
    greater_is_better=True,            # Recall should be maximized
    target='pred',                     # The target is the prediction, not probabilities
    multiclass=False                   # Not using multiclass, focus is on binary recall
)
best_model = pycaret.compare_models(
    sort='Recall Class 1',  # Sort models by the custom recall metric
    verbose=False           # Do not print out all the logs
)

### Tune models

In [7]:
tuned_model = pycaret.tune_model(best_model, optimize='recall_class_1',
                                 custom_scorer='recall_class_1')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Recall Class 1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.725,0.8506,0.8812,0.1926,0.3162,0.2243,0.3269,0.0
1,0.7339,0.8585,0.9158,0.2026,0.3318,0.2423,0.3508,0.0
2,0.7136,0.8319,0.8911,0.1875,0.3098,0.2164,0.3221,0.0
3,0.7164,0.8459,0.9113,0.1925,0.3179,0.2251,0.3345,0.0
4,0.7236,0.8482,0.8621,0.19,0.3114,0.2185,0.3173,0.0
5,0.72,0.8503,0.9015,0.1932,0.3183,0.2258,0.3329,0.0
6,0.7182,0.8468,0.867,0.1876,0.3085,0.2149,0.3151,0.0
7,0.7211,0.8536,0.8867,0.1919,0.3155,0.2229,0.3268,0.0
8,0.72,0.8662,0.9113,0.1945,0.3206,0.2284,0.3375,0.0
9,0.7161,0.8542,0.9015,0.191,0.3152,0.2222,0.3296,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [8]:
final_model = pycaret.finalize_model(tuned_model)

In [9]:
# 2. Predict using the final model
predictions = pycaret.predict_model(final_model, data=X_test)

print(classification_report(y_test, predictions['prediction_label']))
print(roc_auc_score(y_test, predictions['prediction_label']))
print(recall_class_1_function(y_test, predictions['prediction_label']))

confusion_matrix(y_test, predictions['prediction_label'])

# Save the final model
pathlib.Path("../models/pyCaret/term_deposit_model").mkdir(
    parents=True, exist_ok=True
)
pycaret.save_model(final_model, '../models/pyCaret/term_deposit_model')

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      7421
           1       0.65      0.06      0.11       579

    accuracy                           0.93      8000
   macro avg       0.79      0.53      0.54      8000
weighted avg       0.91      0.93      0.90      8000

0.5298079319784982
y_pred distribution: prediction_label
0    7945
1      55
Name: count, dtype: int64
0.06217616580310881
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['number_calls', 'job_blue-collar',
                                              'job_entrepreneur',
                                              'job_housemaid', 'job_management',
                                              'job_retired', 'job_self-employed',
                                              'job_services', 'job_student',
                                              'job_technician', 'job_unemployed',
                                              'marital_married',
                                              'marital_single',
                                              'education_secondary',
                                              'edu...
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=CleanColumnNames(match='[\\]\\[\\,\

## H2O

#### setup

In [10]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()
conf_H2O = config['settings'].get('H2O')

# Convert your balanced train data to H2OFrame
df_train = pd.concat([X_train, y_train], axis=1)  # Combine features and target
df_test = pd.concat([X_test, y_test], axis=1)    # Combine features and target

# Convert your data
train = h2o.H2OFrame(df_train)
test = h2o.H2OFrame(df_test)

# Ensure response is a factor for binary classification
train[target] = train[target].asfactor()
test[target] = test[target].asfactor()


# Define a custom recall function for class 1
class CustomRecallFunc:
    def map(self, pred, act, w, o, model):
        # Calculate recall for class 1
        true_positives = sum((act == 1) & (pred == 1))
        false_negatives = sum((act == 1) & (pred == 0))
        return [true_positives, true_positives + false_negatives]

    def reduce(self, l, r):
        return [l[0] + r[0], l[1] + r[1]]

    def metric(self, l):
        return l[0] / l[1] if l[1] > 0 else 0  # Recall = TP / (TP + FN)

# Upload the custom metric
custom_recall_func = h2o.upload_custom_metric(CustomRecallFunc, func_name="recall_class_1")

# Run AutoML for 20 base models with custom recall metric
aml = H2OAutoML(custom_metric_func=custom_recall_func, **conf_H2O)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "21.0.4" 2024-07-16 LTS; OpenJDK Runtime Environment Microsoft-9889606 (build 21.0.4+7-LTS); OpenJDK 64-Bit Server VM Microsoft-9889606 (build 21.0.4+7-LTS, mixed mode, sharing)
  Starting server from /opt/conda/envs/term_deposit-post/lib/python3.11/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmphsnaf9he
  JVM stdout: /tmp/tmphsnaf9he/h2o_codespace_started_from_python.out
  JVM stderr: /tmp/tmphsnaf9he/h2o_codespace_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 8 days
H2O_cluster_name:,H2O_from_python_codespace_igox3d
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,15.69 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Run AutoML with the custom recall function

In [None]:
aml.train(x=list(X_train.columns), y=target, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb = lb.sort("recall_class_1", ascending=False)  # Sort by custom recall metric
display(Markdown('## H2O Leaderboard'))
display(lb.head(rows=lb.nrows))

# Make predictions on the test set
aml.train(x=list(X_train.columns), y=target, training_frame=train, max_runtime_secs=60)
predictions = aml.leader.predict(test)
display(Markdown('## Predictions'))
display(predictions.head())

AutoML progress: |██████████████████████████████████████████████████████████████

### Explain

In [None]:
aml.explain(
    test
)

In [97]:
mc_plot = aml.model_correlation_heatmap(
    frame=test
    )

KeyboardInterrupt: 

In [None]:
# or if some subset of the models is needed a slice of leaderboard can be used, e.g., using MAE as the sorting metric
mc_plot = h2o.model_correlation_heatmap(aml.leaderboard.sort("mae").head(10), test)

In [None]:
# or even extended leaderboard can be used
mc_plot = h2o.model_correlation_heatmap(h2o.automl.get_leaderboard(aml, extra_columns="training_time_ms").sort("training_time_ms").head(10), test)

In [None]:
# also more complicated queries on leaderboard can be used, e.g., model correlation between 5 fastest models to train and Stacked Ensembles
leaderboard = h2o.automl.get_leaderboard(aml, extra_columns="training_time_ms").sort("training_time_ms")
mc_plot = h2o.model_correlation_heatmap(leaderboard.head(5).rbind(leaderboard[leaderboard["model_id"].grep("StackedEnsemble", output_logical=True)]), test)

In [14]:
aml.leader.fair_shap_plot(frame=test,
                          column='target',
                          protected_columns=["None"])

AttributeError: 'NoneType' object has no attribute 'fair_shap_plot'