In [1]:
import pathlib

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope
from IPython.display import display, Markdown
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer, f1_score
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split

# Read the encoded post-call dataset from disk
df = pd.read_parquet('../data/data_postcall_encoded.parquet')

# Sanity-check the load: report the shape and column names as Markdown,
# then show three randomly sampled rows and the summary statistics.
summary_lines = (
    f'The dataset has {df.shape[0]:,} rows and {df.shape[1]:,} columns.',
    f'The dataset has the following columns: {", ".join(df.columns)}.',
)
for summary_line in summary_lines:
    display(Markdown(summary_line))

for preview in (df.sample(3), df.describe()):
    display(preview)


The dataset has 26,295 rows and 45 columns.

The dataset has the following columns: y, job_blue-collar, job_entrepreneur, job_housemaid, job_management, job_retired, job_self-employed, job_services, job_student, job_technician, job_unemployed, marital_married, marital_single, education_secondary, education_tertiary, default_yes, housing_yes, loan_yes, contact_telephone, month_aug, month_dec, month_feb, month_jan, month_jul, month_jun, month_mar, month_may, month_nov, month_oct, high_balance, duration_short, duration_medium, duration_long, age_group_18-30, age_group_60+, campaign, age_balance_interaction, young_single_tertiary, long_duration_retired, married_high_balance, age_tertiary_interaction, student_high_balance, retired_high_balance, long_duration_bluecollar, propensity_score.

Unnamed: 0,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,campaign,age_balance_interaction,young_single_tertiary,long_duration_retired,married_high_balance,age_tertiary_interaction,student_high_balance,retired_high_balance,long_duration_bluecollar,propensity_score
19281,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.422234
2325,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.408858
1830,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0.411604


Unnamed: 0,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,campaign,age_balance_interaction,young_single_tertiary,long_duration_retired,married_high_balance,age_tertiary_interaction,student_high_balance,retired_high_balance,long_duration_bluecollar,propensity_score
count,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,...,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0,26295.0
mean,0.08804,0.199544,0.035178,0.027572,0.233162,0.034569,0.038296,0.092261,0.013387,0.185549,...,0.241681,0.004868,0.03514,0.011637,0.248564,0.00232,0.005552,0.015706,0.072067,0.399716
std,0.283358,0.399665,0.184232,0.163746,0.422853,0.18269,0.191914,0.289399,0.114926,0.388749,...,0.42811,0.069601,0.184136,0.107248,0.432189,0.04811,0.074309,0.124339,0.258604,0.065511
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220363
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.378211
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408858
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425006
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.702403


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Separate features and target variable
X = df.drop('y', axis=1)
y = df['y']

# Encode the target as 1/0. Series.map turns every value that is not a key of
# the mapping into NaN, so the yes/no mapping is applied only when the column
# actually holds those string labels; a target that is already numeric 0/1 is
# left untouched instead of being wiped out.
if y.isin(['yes', 'no']).any():
    y = y.map({'yes': 1, 'no': 0})

# Fail fast if any label is missing or was not covered by the mapping.
assert y.notna().all(), "Target column 'y' contains missing or unmapped labels."

# Split data into training and test sets. Stratifying on y keeps the share of
# the minority (positive) class the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify categorical and numerical columns.
# 'number' selects every numeric dtype (int64/float64 as before, plus e.g.
# uint8 or int32 indicator columns). ColumnTransformer drops any column that is
# not assigned to a transformer (default remainder='drop'), so a narrower dtype
# list would silently discard features.
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['number']).columns

# Preprocessing pipeline for numerical data: median imputation, then
# standardization
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical data: most-frequent imputation, then
# one-hot encoding that ignores categories unseen during fit
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# SMOTETomek comes from imbalanced-learn; it is used below but is not imported
# anywhere else in the notebook.
from imblearn.combine import SMOTETomek


def _to_dense(matrix):
    """Return ``matrix`` as a dense ndarray.

    Objects exposing ``toarray`` (SciPy sparse matrices) are converted with
    it; anything else is passed through ``np.asarray``. The preprocessor may
    return either kind, so both must be accepted.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)


# Apply preprocessing: fit on the training split only, then reuse the fitted
# transformer on the test split.
X_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Output column names are only available once the preprocessor is fitted.
feature_names = preprocessor.get_feature_names_out()

df_test_processed = pd.DataFrame(_to_dense(X_test_processed),
                                 columns=feature_names)

# Apply SMOTETomek on the training data
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(
    X_processed, y_train)

df_train_balanced = pd.DataFrame(_to_dense(X_train_balanced),
                                 columns=feature_names)

In [2]:
def objective(params):
    """Fit and score one random-forest candidate for Hyperopt.

    The whole evaluation happens inside an MLflow run opened with
    ``nested=True``. The model is fitted on the notebook-level
    ``X_train``/``y_train`` and scored on ``X_test``/``y_test``; the
    hyperparameters, both metrics and the fitted model are logged to MLflow.

    Parameters
    ----------
    params : dict
        Must provide 'n_estimators', 'max_depth' and 'min_samples_split';
        each value is cast to ``int`` before use.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``. The accuracy is negated
        so that minimizing the loss maximizes accuracy.
    """
    with mlflow.start_run(nested=True):
        # Cast the sampled values to the integers the estimator expects
        hyperparams = {
            name: int(params[name])
            for name in ('n_estimators', 'max_depth', 'min_samples_split')
        }

        # Build and train the candidate model
        forest = RandomForestClassifier(random_state=42, **hyperparams)
        forest.fit(X_train, y_train)

        # Score the candidate on the test split
        predictions = forest.predict(X_test)
        scores = {
            'accuracy': accuracy_score(y_test, predictions),
            'f1_score': f1_score(y_test, predictions),
        }

        # Record hyperparameters, metrics and the fitted model in MLflow
        for name, value in hyperparams.items():
            mlflow.log_param(name, value)
        for name, value in scores.items():
            mlflow.log_metric(name, value)
        mlflow.sklearn.log_model(forest, "model")

        return {'loss': -scores['accuracy'], 'status': STATUS_OK}


In [4]:
# Hyperopt search space: every hyperparameter is drawn with hp.quniform over
# (low, high) using a step of 1 and wrapped in scope.int.
space = {
    label: scope.int(hp.quniform(label, low, high, 1))
    for label, low, high in (
        ('n_estimators', 50, 200),
        ('max_depth', 5, 50),
        ('min_samples_split', 2, 10),
    )
}

In [5]:
# Create a Trials object to store information about the optimization process
trials = Trials()

# Run Hyperopt inside a parent MLflow run. `objective` opens its runs with
# nested=True, which only groups them when a parent run is active; without
# this wrapper every trial is logged as an unrelated top-level run.
# NOTE(review): the search is not seeded, so results differ between
# executions. fmin accepts an `rstate` argument for this, but the expected
# type depends on the installed hyperopt version - confirm before adding it.
with mlflow.start_run(run_name='rf_hyperopt_search'):
    best = fmin(
        fn=objective,  # Objective function
        space=space,  # Search space
        algo=tpe.suggest,  # Tree-structured Parzen Estimator
        max_evals=50,  # Number of evaluations (can be adjusted)
        trials=trials  # Store results of each evaluation
    )

print(f"Best Hyperparameters: {best}")


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

job exception: name 'X_train' is not defined



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


NameError: name 'X_train' is not defined

## TPOT

Explore alternative model pipelines with TPOT, scoring candidates by recall on the positive class (`y = 1`).

In [None]:
# Tpot model
from tpot import TPOTClassifier
from sklearn.metrics import make_scorer

# All TPOT artifacts (checkpoints and log file) live under ../models/tpot.
# The directory that is created and the paths handed to TPOT are derived from
# the same Path object so they cannot drift apart; note the "../" prefix
# (parent directory), not "..models".
TPOT_DIR = pathlib.Path("../models/tpot")
TPOT_CHECKPOINT_DIR = TPOT_DIR / "checkpoints"
TPOT_CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)


def recall_class_1_scorer(y_true, y_pred):
    """Return the recall for class 1 and print the prediction distribution.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The value of ``recall_score(y_true, y_pred, pos_label=1)``.
    """
    print("y_pred distribution:", pd.Series(y_pred).value_counts())  # Logging prediction distribution
    return recall_score(y_true, y_pred, pos_label=1)


# Create a scikit-learn scorer from the custom function
recall_scorer = make_scorer(recall_class_1_scorer)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, use_dask=False,
                      periodic_checkpoint_folder=str(TPOT_CHECKPOINT_DIR),
                      log_file=str(TPOT_DIR / "term_deposit.log"),
                      random_state=42, scoring=recall_scorer)

tpot.fit(X_train, y_train)

# Score the fitted TPOT model once on the test split and print the result.
test_score = tpot.score(X_test, y_test)
print(test_score)

# Export the best pipeline found as a standalone Python script
tpot.export('../models/tpot_term_deposit_pipeline.py')

# Feature importances, when available, belong to the final estimator of the
# fitted pipeline; not every estimator TPOT may select exposes them, so the
# attribute is read defensively and shown only if present.
final_estimator = tpot.fitted_pipeline_.steps[-1][1]
feature_importances = getattr(final_estimator, 'feature_importances_', None)
if feature_importances is not None:
    display(feature_importances)

# Predict the labels for the test set
y_pred = tpot.predict(X_test)

# Generate and print the classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Last expression of the cell: rendered as the cell output
confusion_matrix(y_test, y_pred)