In [1]:
# Capture the wall-clock start so the last cell can report total runtime
import time

start_time = time.time()

In [2]:
# Report the version of the interpreter that backs this kernel.
# `!python -V` asks whichever `python` is first on PATH, which is not
# guaranteed to be the interpreter the notebook is running on.
import platform

print(f"Python {platform.python_version()}")

Python 3.9.16


# Imports

In [3]:
# Importing libraries

import pickle
import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from hyperopt.pyll import scope
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (f1_score,
                             recall_score,
                             roc_auc_score,
                             accuracy_score, 
                             precision_score)

In [4]:
# Load the training and validation splits from disk
df_train, df_val = map(
    pd.read_csv,
    ('./data/training_data.csv', './data/validation_data.csv'),
)

In [5]:
# Pull the target column ('Response') out of each split
y_train, y_val = df_train['Response'], df_val['Response']

# Data Preprocessing

In [6]:
# Constants consumed by `scrub_data`

# Columns counting dependants in the household
dependants = ['Kidhome', 'Teenhome']

# Reference year for the age calculation (analysis assumed to date from 2014)
now = 2014

# Age-group bin edges and the label attached to each bin
bins = [18, 28, 38, 48, 58, 65, np.inf]
labels = ['18-27', '28-37', '38-47', '48-57', '58-65', '65+']

# End of the financial year; tenure is measured back from this date
end_fiscal = datetime(year=2014, month=6, day=30)

# Columns dropped before modelling
red_ftrs_1 = [
    "ID", "Year_Birth", "Dt_Customer",
    "Z_CostContact", "Z_Revenue", "Response",
    'Age',
]

# Features treated as categorical
categ_ftrs_1 = [
    'Education', 'Marital_Status',
    'Kidhome', 'Teenhome',
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'AcceptedCmp1', 'AcceptedCmp2',
    'Complain', 'Age_Group',
]

# Features treated as numeric
num_ftrs_1 = [
    'Income', 'Recency',
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases',
    'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth', 'Onboard_Days',
]

In [7]:
# Function to do data cleaning and feature preprocessing
def scrub_data(df):
    """Clean a raw customer frame and return an imputed, transformed feature frame.

    Steps, in order: binarise the dependant counts, derive ``Age`` and
    ``Age_Group`` from ``Year_Birth``, derive ``Onboard_Days`` from
    ``Dt_Customer``, drop the columns listed in ``red_ftrs_1``, then impute
    and transform the remaining columns with a freshly fitted
    ``ColumnTransformer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is copied on entry, so the caller's frame is left
        untouched and the calling cell can be re-run safely.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are ``num_ftrs_1 + categ_ftrs_1``; numeric
        columns are cast to float and categorical columns to str.

    Notes
    -----
    NOTE(review): the imputers and the power transform are fitted on whatever
    frame is passed in, so each split is transformed with its own statistics.
    Reusing the training-fitted transformer on validation data would need a
    change to this function's interface.
    """
    # Work on a copy: the steps below overwrite 'Year_Birth'/'Dt_Customer'
    # with datetimes, which would break a second call on the same frame.
    df = df.copy()

    # Convert 'Kidhome' and 'Teenhome' to 0/1 flags,
    # but first fill missing values with the most frequent value
    df[dependants] = df[dependants].fillna(df[dependants].mode().iloc[0])
    df[dependants] = (df[dependants] > 0).astype(int)

    # Conversions into 'datetime' data type,
    # but first fill missing values in both variables
    df['Year_Birth'] = df['Year_Birth'].fillna(int(df['Year_Birth'].median()))
    df['Year_Birth'] = pd.to_datetime(df['Year_Birth'], format='%Y')

    df['Dt_Customer'] = df['Dt_Customer'].fillna(df['Dt_Customer'].mode().iloc[0])
    df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"])

    # Calculate age relative to the reference year
    df['Age'] = now - df['Year_Birth'].dt.year

    # Create age group feature (left-closed bins: [18, 28), [28, 38), ...)
    df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

    # Calculate the number of days since the customer enrolled
    df['Onboard_Days'] = (end_fiscal - df['Dt_Customer']).dt.days

    # Drop redundant features
    df = df.drop(red_ftrs_1, axis=1)

    # Handle missing values and scale numeric data
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('normalize', PowerTransformer(method='yeo-johnson')),
    ])

    ct = ColumnTransformer([
        ('num_trans', num_transformer, num_ftrs_1),
        ('cat_trans', SimpleImputer(strategy='most_frequent'), categ_ftrs_1)
    ])

    # The transformer returns a plain array, so column names are re-attached here
    df = pd.DataFrame(ct.fit_transform(df),
                      columns=num_ftrs_1+categ_ftrs_1)

    # Ensure that the final features have the right data types
    df[categ_ftrs_1] = df[categ_ftrs_1].astype('str')
    df[num_ftrs_1] = df[num_ftrs_1].astype('float')

    return df

In [8]:
# Clean and preprocess each split once (training first, then validation)
train_data, val_data = scrub_data(df_train), scrub_data(df_val)

In [9]:
# Turn each feature frame into a list of per-row dictionaries
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (train_data, val_data)
)

In [10]:
# Vectorise the row dictionaries; the vectorizer is fitted on the training rows only
dv = DictVectorizer()
dv.fit(train_dicts)
X_train, X_val = dv.transform(train_dicts), dv.transform(val_dicts)

# Preparing for MLflow

In [11]:
# Point MLflow at a local SQLite database as its tracking store
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

In [12]:
def eval_metrics_logs(y_true, y_pred):
    """Score binary predictions, log the scores to MLflow and return precision.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. These are now actually used; previously the
        function ignored this argument and read the global ``y_val``.
    y_pred : numpy.ndarray
        Predicted labels or probabilities; rounded to hard labels before scoring.

    Returns
    -------
    float
        The precision score (``zero_division=0``), which the hyperopt
        objectives negate to use as their loss.
    """
    # Threshold once instead of re-rounding for every metric
    y_hat = y_pred.round()

    # Calculate the evaluation metrics
    metrics = {
        'f1': f1_score(y_true, y_hat),
        'precision': precision_score(y_true, y_hat, zero_division=0),
        'recall': recall_score(y_true, y_hat),
        # NOTE(review): despite the key, this is ROC AUC computed on hard
        # labels; the key is kept so existing MLflow runs stay comparable.
        'pr_auc': roc_auc_score(y_true, y_hat),
        'accuracy': accuracy_score(y_true, y_hat)
    }

    # Log the evaluation metrics to the active MLflow run
    mlflow.log_metrics(metrics)

    return metrics['precision']

# Checking which Classifiers to Focus on

In [13]:
# Setting experiment
mlflow.set_experiment(experiment_name='all-models-experiment')

# Delete the `Default` experiment.
# `get_experiment_by_name` returns None when no such experiment exists, so
# guard the lookup instead of dereferencing `.experiment_id` unconditionally.
default_expt = mlflow.get_experiment_by_name('Default')
expt_id = default_expt.experiment_id if default_expt is not None else None

if expt_id == "0":
    try:
        mlflow.delete_experiment(expt_id)
    except mlflow.exceptions.MlflowException:
        # Best effort: the deletion is cosmetic, so a failure must not
        # stop the notebook.
        pass
else:
    print('`Default` deleted...')

2023/06/29 23:40:06 INFO mlflow.tracking.fluent: Experiment with name 'all-models-experiment' does not exist. Creating a new experiment.


In [14]:
# Baseline comparison: fit each scikit-learn classifier with its default
# settings, one MLflow run per model, with sklearn autologging switched on
# only for the duration of the loop.
mlflow.sklearn.autolog()

for model_class in (
    SVC,
    LogisticRegression,
    RandomForestClassifier,
    GradientBoostingClassifier,
):
    with mlflow.start_run():
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        # Score on the validation split and log the metrics to the run
        y_pred = mlmodel.predict(X_val)
        eval_metrics_logs(y_val, y_pred.round())

mlflow.sklearn.autolog(disable=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Wrap both splits in XGBoost's DMatrix container for the native `xgb.train` API
train, valid = (
    xgb.DMatrix(X_train, label=y_train),
    xgb.DMatrix(X_val, label=y_val),
)

# Baseline booster parameters
_params = dict(
    objective='binary:logistic',
    eval_metric="logloss",
    seed=42,
)

In [16]:
# Baseline XGBoost run using the native API, with xgboost autologging
# enabled only around this run.
mlflow.xgboost.autolog()

with mlflow.start_run():
    mlflow.log_params(_params)

    # Up to 500 rounds, monitored on the validation split
    booster = xgb.train(
        params=_params,
        dtrain=train,
        evals=[(valid, "validation")],
        num_boost_round=500,
        early_stopping_rounds=250,
        verbose_eval=False,
    )

    # The booster outputs probabilities; round them to hard labels for scoring
    y_pred = booster.predict(valid)
    eval_metrics_logs(y_val, y_pred.round())

mlflow.xgboost.autolog(disable=True)
    



# Random Forest Classifier

In [17]:
# Runs from here on are recorded under the random-forest experiment
mlflow.set_experiment(experiment_name='rfc-experiment')

2023/06/29 23:41:45 INFO mlflow.tracking.fluent: Experiment with name 'rfc-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/2', creation_time=1688071305403, experiment_id='2', last_update_time=1688071305403, lifecycle_stage='active', name='rfc-experiment', tags={}>

In [18]:
# Seeded NumPy generator handed to hyperopt's `fmin` via `rstate`
random_state = np.random.default_rng(seed=42)

In [19]:
# Hyperopt objective for the random forest
def objective(params):
    """Train a RandomForestClassifier with `params` and return a hyperopt result.

    The run is tracked in MLflow: the parameters are logged up front and the
    validation metrics are logged by `eval_metrics_logs`. The loss is the
    negated validation precision, so minimising it maximises precision.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        forest = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
        forest.fit(X_train, y_train)

        val_pred = forest.predict(X_val)
        precision = eval_metrics_logs(y_val, val_pred.round())

    return dict(loss=-precision, status=STATUS_OK)

In [20]:
# Hyperparameter search space for the random forest
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 10, 150, 10)),
    criterion=hp.choice('criterion', ['gini', 'entropy']),
    max_depth=scope.int(hp.quniform('max_depth', 10, 30, 10)),
)

In [21]:
# Run the TPE search over the random-forest space (30 evaluations).
# A fresh seeded generator is created here instead of reusing the shared
# `random_state`: fmin draws from the generator it is given, so a shared one
# makes this search depend on which other search cells ran before it.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    rstate=np.random.default_rng(42),
    verbose=False
)

In [22]:
# Refit the random forest with fixed hyperparameters and let MLflow autolog
# the fit. NOTE(review): these values are hard-coded rather than read from
# `best_result` — presumably copied from an earlier search; confirm they are current.
params = dict(criterion='entropy', max_depth=20, n_estimators=60)

mlflow.sklearn.autolog()

rfc = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
rfc.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

2023/06/29 23:43:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1d18f82ccec141a48f1251dd5978fd4a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


# Scikit-learn Gradient Boosting Classifier

In [23]:
# Runs from here on are recorded under the scikit-learn gradient-boosting experiment
mlflow.set_experiment(experiment_name='sk-gbc-experiment')

2023/06/29 23:43:20 INFO mlflow.tracking.fluent: Experiment with name 'sk-gbc-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/3', creation_time=1688071400948, experiment_id='3', last_update_time=1688071400948, lifecycle_stage='active', name='sk-gbc-experiment', tags={}>

In [24]:
def objective(params):
    """Train a GradientBoostingClassifier with `params` and return a hyperopt result.

    Parameters and validation metrics are logged to a dedicated MLflow run.
    The loss is the negated validation precision.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        gb_model = GradientBoostingClassifier(random_state=42, **params)
        gb_model.fit(X_train, y_train)

        val_pred = gb_model.predict(X_val)
        precision = eval_metrics_logs(y_val, val_pred.round())

    return dict(loss=-precision, status=STATUS_OK)

In [25]:
# Hyperparameter search space for scikit-learn gradient boosting
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 100, 10)),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 16, 2)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
)

In [26]:
# Run the TPE search over the gradient-boosting space (30 evaluations).
# A fresh seeded generator is created here instead of reusing the shared
# `random_state`: fmin draws from the generator it is given, so a shared one
# makes this search depend on which other search cells ran before it.
best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 30,
    trials = Trials(),
    rstate=np.random.default_rng(42),
    verbose=False
)

In [27]:
# Refit gradient boosting with fixed hyperparameters and let MLflow autolog
# the fit. NOTE(review): these values are hard-coded rather than read from
# `best_result` — presumably copied from an earlier search; confirm they are current.
params = dict(min_samples_leaf=8, min_samples_split=14, n_estimators=90)

mlflow.sklearn.autolog()

gbc = GradientBoostingClassifier(random_state=42, **params)
gbc.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

2023/06/29 23:44:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f48605f827ab4ba79d0e102fadaf5897', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


# XGBoost Classifier (scikit-learn API)

In [28]:
# Runs from here on are recorded under the XGBClassifier experiment
mlflow.set_experiment(experiment_name='xgbc-experiment')

2023/06/29 23:45:11 INFO mlflow.tracking.fluent: Experiment with name 'xgbc-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/4', creation_time=1688071511314, experiment_id='4', last_update_time=1688071511314, lifecycle_stage='active', name='xgbc-experiment', tags={}>

In [29]:
# Hyperopt objective for the XGBClassifier (scikit-learn wrapper)
def objective(params):
    """Train an XGBClassifier with `params` and return a hyperopt result.

    The model is fitted with the validation split as its evaluation set and
    an early-stopping patience of 250 rounds. Parameters and validation
    metrics are logged to a dedicated MLflow run; the loss is the negated
    validation precision.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        clf = xgb.XGBClassifier(
            objective='binary:logistic',
            random_state=42,
            **params,
        )
        # NOTE(review): the patience (250) exceeds the n_estimators range in
        # the search space, so early stopping presumably never triggers here.
        clf.set_params(early_stopping_rounds=250)
        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False,
        )

        val_pred = clf.predict(X_val)
        precision = eval_metrics_logs(y_val, val_pred.round())

    return dict(loss=-precision, status=STATUS_OK)

In [30]:
# Hyperparameter search space for the XGBClassifier
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 80, 10)),
    max_leaves=scope.int(hp.quniform('max_leaves', 1, 100, 10)),
    # loguniform bounds are given in log space
    min_child_weight=hp.loguniform('min_child_weight', 1.5, 2.5),
)

In [31]:
# Run the TPE search over the XGBClassifier space (30 evaluations).
# A fresh seeded generator is created here instead of reusing the shared
# `random_state`: fmin draws from the generator it is given, so a shared one
# makes this search depend on which other search cells ran before it.
best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 30,
    trials = Trials(),
    rstate=np.random.default_rng(42),
    verbose=False
)

In [32]:
# Refit the XGBClassifier with fixed hyperparameters and let MLflow autolog
# the fit. NOTE(review): these values are hard-coded rather than read from
# `best_result` — presumably copied from an earlier search; confirm they are current.
params = dict(
    max_leaves=50,
    min_child_weight=7.727056599504389,
    n_estimators=50,
)

mlflow.xgboost.autolog()

xgbc = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    **params,
)
xgbc.set_params(early_stopping_rounds=250)
xgbc.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

mlflow.xgboost.autolog(disable=True)

2023/06/29 23:46:00 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a0c8e3e88eca435ba08085f727df96e6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


# Support Vector Classifier

In [33]:
# Runs from here on are recorded under the support-vector experiment
mlflow.set_experiment(experiment_name='svc-experiment')

2023/06/29 23:46:14 INFO mlflow.tracking.fluent: Experiment with name 'svc-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/5', creation_time=1688071574404, experiment_id='5', last_update_time=1688071574404, lifecycle_stage='active', name='svc-experiment', tags={}>

In [34]:
def objective(params):
    """Train an SVC with `params` and return a hyperopt result.

    Parameters and validation metrics are logged to a dedicated MLflow run.
    The loss is the negated validation precision.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        svm_model = SVC(random_state=42, **params)
        svm_model.fit(X_train, y_train)

        val_pred = svm_model.predict(X_val)
        precision = eval_metrics_logs(y_val, val_pred.round())

    return dict(loss=-precision, status=STATUS_OK)

In [35]:
# Hyperparameter search space for the SVC.
# NOTE(review): with 'kernel' commented out the SVC keeps its default kernel;
# `degree` is presumably only used by the polynomial kernel — confirm it has
# any effect in this search.
search_space = dict(
    C=hp.uniform('C', 0, 10),
    # 'kernel': hp.choice('kernel', ['poly', 'rbf', 'sigmoid']),
    degree=scope.int(hp.randint('degree', 1, 12)),
    # 'gamma': hp.choice('gamma', ['scale', 'auto']),
    # 'class_weight': hp.choice('class_weight', [None, 'balanced'])
)

In [36]:
# Run the TPE search over the SVC space (30 evaluations).
# A fresh seeded generator is created here instead of reusing the shared
# `random_state`: fmin draws from the generator it is given, so a shared one
# makes this search depend on which other search cells ran before it.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    rstate=np.random.default_rng(42),
    verbose=False
)

In [37]:
# Refit the SVC with fixed hyperparameters and let MLflow autolog the fit.
# NOTE(review): these values are hard-coded rather than read from
# `best_result` — presumably copied from an earlier search; confirm they are current.
params = dict(C=2.521616767609426, degree=2)

mlflow.sklearn.autolog()

svc = SVC(random_state=42, **params)
svc.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

2023/06/29 23:46:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a553083ff04f46a6adecf758263b6628', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


# Extreme Gradient Boosting

In [38]:
# Runs from here on are recorded under the native-XGBoost experiment
mlflow.set_experiment(experiment_name="xgboost-experiment")

2023/06/29 23:47:19 INFO mlflow.tracking.fluent: Experiment with name 'xgboost-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/6', creation_time=1688071639654, experiment_id='6', last_update_time=1688071639654, lifecycle_stage='active', name='xgboost-experiment', tags={}>

In [39]:
def objective(params):
    """Train a native XGBoost booster with `params` and return a hyperopt result.

    The run is tagged ``model=xgboost`` in MLflow. Training uses up to 500
    boosting rounds, monitored on the validation DMatrix with an
    early-stopping patience of 250. The loss is the negated validation
    precision computed on the rounded probabilities.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            evals=[(valid, "validation")],
            num_boost_round=500,
            early_stopping_rounds=250,
            verbose_eval=False,
        )

        val_prob = booster.predict(valid)
        precision = eval_metrics_logs(y_val, val_prob.round())

    return dict(loss=-precision, status=STATUS_OK)

In [40]:
# Hyperparameter search space for the native XGBoost booster.
# Tuned entries come first; the last three are fixed booster settings.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 20, 70, 10)),
    # loguniform bounds are given in log space.
    # NOTE(review): a lower bound of -30 allows vanishingly small learning
    # rates — confirm this range is intended.
    learning_rate=hp.loguniform('learning_rate', -30, -3),
    min_child_weight=hp.loguniform('min_child_weight', 1.5, 2.5),
    # 'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    subsample=hp.uniform('subsample', 0.5, 1),
    # 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    # 'gamma': hp.uniform('gamma', 0, 1),
    objective='binary:logistic',
    eval_metric="logloss",
    seed=42,
)

In [41]:
# Run the TPE search over the native-XGBoost space (30 evaluations).
# A fresh seeded generator is created here instead of reusing the shared
# `random_state`: fmin draws from the generator it is given, so a shared one
# makes this search depend on which other search cells ran before it.
best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 30,
    trials = Trials(),
    rstate=np.random.default_rng(42),
    verbose=False
)

In [42]:
# Retrain the native booster with fixed hyperparameters and let MLflow
# autolog the run.
# NOTE(review): these values are hard-coded; `max_depth=149` and `gamma` lie
# outside the search space defined for this experiment, and the
# early-stopping patience here (20) differs from the 250 used in the
# objective — confirm they are intended.
# params = space_eval(search_space, best_result)
params = dict(
    eval_metric='logloss',
    gamma=0.9511548717715149,
    learning_rate=0.014685011379954318,
    max_depth=149,
    min_child_weight=7.668601934406394,
    objective='binary:logistic',
    seed=42,
    subsample=0.51735171792841,
)

mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    evals=[(valid, "validation")],
    num_boost_round=500,
    early_stopping_rounds=20,
    verbose_eval=False,
)

mlflow.xgboost.autolog(disable=True)

2023/06/29 23:49:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ae42293843304f3d9551f9db317ff6c3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


# Logistic Regression

In [43]:
# Runs from here on are recorded under the logistic-regression experiment
mlflow.set_experiment(experiment_name="log-reg-experiment")

2023/06/29 23:49:39 INFO mlflow.tracking.fluent: Experiment with name 'log-reg-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/7', creation_time=1688071779259, experiment_id='7', last_update_time=1688071779259, lifecycle_stage='active', name='log-reg-experiment', tags={}>

In [44]:
# Hyperparameter search space for logistic regression
space = dict(
    # loguniform bounds are given in log space
    C=hp.loguniform('C', -20, 4),
    class_weight=hp.choice('class_weight', [None, 'balanced']),
    penalty=hp.choice('penalty', ['l1', 'l2']),
    solver=hp.choice('solver', ['liblinear', 'saga']),
)

In [45]:
def objective(params):
    """Train a LogisticRegression with `params` and return a hyperopt result.

    The model is fitted with ``max_iter=5000``. Parameters and validation
    metrics are logged to a dedicated MLflow run; the loss is the negated
    validation precision.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # Build the logistic regression from the sampled hyperparameters
        logit = LogisticRegression(max_iter=5000, **params)
        logit.fit(X_train, y_train)

        val_pred = logit.predict(X_val)
        precision = eval_metrics_logs(y_val, val_pred.round())

    return dict(loss=-precision, status=STATUS_OK)


In [46]:
# Run the TPE search over the logistic-regression space (30 evaluations).
# A fresh seeded generator is created here instead of reusing the shared
# `random_state`: fmin draws from the generator it is given, so a shared one
# makes this search depend on which other search cells ran before it.
best_result = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 30,
    trials = Trials(),
    rstate=np.random.default_rng(42),
    verbose=False
)

In [47]:
# Refit logistic regression with fixed hyperparameters and let MLflow autolog
# the fit. NOTE(review): these values are hard-coded rather than read from
# `best_result` — presumably copied from an earlier search; confirm they are current.
params = dict(
    C=0.08966267017951414,
    class_weight=None,
    penalty='l1',
    solver='liblinear',
)

mlflow.sklearn.autolog()

lr = LogisticRegression(max_iter=5000, **params)
lr.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

2023/06/29 23:50:37 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8704baa28a4e41a3942757f8ace0cd73', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [48]:
# Total notebook runtime in minutes, measured from `start_time` set in the first cell
elapsed_time = (time.time() - start_time) / 60
print(f"Execution time: {elapsed_time} minutes")

Execution time: 11.032654253641764 minutes
