# Data Preprocessing

In [1]:
import os
import pickle
from datetime import datetime

import mlflow
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (f1_score,
                             recall_score,
                             roc_auc_score,
                             accuracy_score,
                             precision_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer

In [2]:
# Load the training and validation splits from their CSV files
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/training_data.csv', './data/validation_data.csv')
)

In [3]:
# Target vectors: the 'Response' column of each split
y_train, y_val = df_train['Response'], df_val['Response']

In [4]:
# Configuration shared by the preprocessing function below.

# Reference year for the age calculation (the analysis is assumed to date from 2014)
now = 2014

# Last day of the financial year; tenure is measured back from this date
end_fiscal = datetime(2014, 6, 30)

# Household-composition columns that get reduced to 0/1 flags
dependants = ['Kidhome', 'Teenhome']

# Age-group bin edges and one label per bin.
# NOTE: pd.cut is called with right=False, so each bin is [lower, upper);
# the bin labelled '58-65' therefore covers ages 58 through 64.
bins = [18, 28, 38, 48, 58, 65, np.inf]
labels = ['18-27', '28-37', '38-47', '48-57', '58-65', '65+']

# Columns removed before modelling (identifiers, constants, raw dates, target, raw age)
red_ftrs_1 = [
    "ID",
    "Year_Birth",
    "Dt_Customer",
    "Z_CostContact",
    "Z_Revenue",
    "Response",
    'Age',
]

# Categorical features fed to the model
categ_ftrs_1 = [
    'Education',
    'Marital_Status',
    'Kidhome',
    'Teenhome',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'Complain',
    'Age_Group',
]

# Numeric features fed to the model
num_ftrs_1 = [
    'Income',
    'Recency',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'Onboard_Days',
]

In [5]:
# Function to do data cleaning and feature preprocessing
def scrub_data(df):
    """Clean a raw customer frame and return the model-ready feature frame.

    The caller's frame is not modified. The function relies on the
    notebook-level configuration names ``dependants``, ``now``, ``bins``,
    ``labels``, ``end_fiscal``, ``red_ftrs_1``, ``categ_ftrs_1`` and
    ``num_ftrs_1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least 'Year_Birth', 'Dt_Customer', the
        ``dependants`` columns and every raw column listed in
        ``num_ftrs_1`` / ``categ_ftrs_1``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh default index and the columns
        ``num_ftrs_1 + categ_ftrs_1`` (numeric ones as float, categorical
        ones as str).

    Notes
    -----
    The imputers and the power transform are fitted inside every call, so
    each frame passed in is transformed with statistics computed from that
    same frame.
    """
    # Work on a copy: the column assignments below would otherwise modify the
    # caller's frame and make a second call on the same frame fail.
    df = df.copy()

    # Reduce 'Kidhome' and 'Teenhome' to 0/1 flags,
    # filling gaps with the most frequent value first.
    # (Vectorised comparison replaces the deprecated DataFrame.applymap.)
    df[dependants] = df[dependants].fillna(df[dependants].mode().iloc[0])
    df[dependants] = (df[dependants] > 0).astype(int)

    # Age from the numeric birth year (missing years get the median year).
    # The raw 'Year_Birth' column is dropped below, so no datetime round-trip
    # is needed to obtain the year.
    year_birth = df['Year_Birth'].fillna(int(df['Year_Birth'].median())).astype(int)
    df['Age'] = now - year_birth

    # Age group; bins are left-closed because of right=False. Ages outside the
    # bin range become NaN here and are filled by the categorical imputer below.
    df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

    # Number of days between enrolment and the end of the financial year
    # (missing enrolment dates get the most frequent date).
    enrolled = pd.to_datetime(
        df['Dt_Customer'].fillna(df['Dt_Customer'].mode().iloc[0])
    )
    df['Onboard_Days'] = (end_fiscal - enrolled).dt.days

    # Drop redundant features. errors='ignore' lets frames without some of
    # these columns (e.g. no 'Response') pass through; the ColumnTransformer
    # below only selects the listed feature columns anyway.
    df = df.drop(columns=red_ftrs_1, errors='ignore')

    # Numeric features: median imputation, then Yeo-Johnson power transform
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('normalize', PowerTransformer(method='yeo-johnson')),
    ])

    # Categorical features: fill gaps with the most frequent value
    ct = ColumnTransformer([
        ('num_trans', num_transformer, num_ftrs_1),
        ('cat_trans', SimpleImputer(strategy='most_frequent'), categ_ftrs_1)
    ])

    # Output column order follows the transformer order: numeric, then categorical
    df = pd.DataFrame(ct.fit_transform(df),
                      columns=num_ftrs_1 + categ_ftrs_1)

    # Ensure that the final df features are in the right data types
    df[categ_ftrs_1] = df[categ_ftrs_1].astype('str')
    df[num_ftrs_1] = df[num_ftrs_1].astype('float')

    return df

In [6]:
# Clean and preprocess each split once.
# NOTE: scrub_data fits its imputers and power transform inside every call, so
# the validation frame is transformed with statistics fitted on itself.
train_data, val_data = (scrub_data(frame) for frame in (df_train, df_val))

In [7]:
# Convert each processed frame into a list with one dict per row
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (train_data, val_data)
)

In [8]:
# Use dictionary vectorizer.
# The vectorizer is fitted on the training records only and then re-applied,
# unchanged, to the validation records so both matrices share one feature layout.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [9]:
# Save the preprocessor (the fitted DictVectorizer).
# Create the output directory first so that open() does not fail when
# 'models/' is missing, e.g. on a fresh checkout.
os.makedirs('models', exist_ok=True)
with open('models/preprocessor.b', 'wb') as f_out:
    pickle.dump(dv, f_out)

# Model Training and Experiment Tracking

In [10]:
# Set tracking uri: runs are recorded in the SQLite database 'mlflow.db'
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Set experiment: all runs started below are grouped under 'best_model'.
# As the last expression of the cell, the returned Experiment is displayed.
mlflow.set_experiment('best_model')

2023/06/29 23:58:36 INFO mlflow.tracking.fluent: Experiment with name 'best_model' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/02-experiment-tracking/mlruns/8', creation_time=1688072316687, experiment_id='8', last_update_time=1688072316687, lifecycle_stage='active', name='best_model', tags={}>

In [11]:
# Train the best model and log it in mlflow
with mlflow.start_run():
    # Hyper-parameters of the selected model
    params = {
        'min_samples_leaf': 8, 
        'min_samples_split': 14, 
        'n_estimators': 90
    }
    
    # Log the parameters
    mlflow.log_params(params)

    gbc = GradientBoostingClassifier(**params, random_state=42)
    gbc.fit(X_train, y_train)

    # Hard class labels for the threshold-based metrics
    y_pred = gbc.predict(X_val)
    # Scores for the ranking metric: probability of the second class in
    # gbc.classes_ (assumes 'Response' is coded 0/1 so that this is the
    # positive class -- TODO confirm)
    y_score = gbc.predict_proba(X_val)[:, 1]
    
    # Calculate the evaluation metrics.
    # The AUC is computed with roc_auc_score, so it is logged as 'roc_auc'
    # (it was previously logged under the name 'pr_auc') and is fed
    # probabilities rather than hard labels.
    metrics = {
        'f1': f1_score(y_val, y_pred), 
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, y_score),
        'accuracy': accuracy_score(y_val, y_pred)
    }
    
    # Log the evaluation metrics
    mlflow.log_metrics(metrics)
    
    # Log the model
    mlflow.sklearn.log_model(gbc, artifact_path='model')
   
    # Log the preprocessor (the pickle file written by the earlier save cell)
    mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')



In [12]:
# Save the model (optional).
# The file holds a (DictVectorizer, classifier) tuple, so one pickle carries
# both the preprocessor and the trained model.
with open('models/gbc.bin', 'wb') as f_out:
    pickle.dump((dv, gbc), f_out)

# Checking Performance

In [13]:
# Model uri of the run logged by the training cell above.
# The run id is resolved from this session instead of being pasted in by hand,
# so the cell keeps working after a kernel restart and on other machines.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found in this session; run the training cell first."
    )
logged_model = f"runs:/{last_run.info.run_id}/model"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: f48605f827ab4ba79d0e102fadaf5897

In [14]:
# Load the logged model back as a native scikit-learn estimator
gbc_model = mlflow.sklearn.load_model(model_uri=logged_model)
gbc_model

GradientBoostingClassifier(min_samples_leaf=8, min_samples_split=14,
                           n_estimators=90, random_state=42)

In [15]:
# Sanity check: predict on the validation matrix with the reloaded model
# (this rebinds y_pred from the training cell) and show the last ten predictions.
y_pred = gbc_model.predict(X_val)
y_pred[-10:]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])