In [1]:
# Record the wall-clock start of the run; the last cell subtracts this value
# from time.time() to report the notebook's total execution time.
import time
start_time = time.time()

In [2]:
# Checking Python's version
# Ask the running kernel directly: `!python -V` reports whichever `python`
# executable is first on the shell PATH, which is not necessarily the
# interpreter this notebook is executing on.
import platform
print(f"Python {platform.python_version()}")

Python 3.9.16


# Imports

In [3]:
# Importing libraries

import mlflow
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_extraction import DictVectorizer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (f1_score,
                             recall_score,
                             roc_auc_score,
                             accuracy_score, 
                             precision_score)

In [4]:
# Point MLflow at the local tracking server and select the experiment that
# every run in this notebook is logged under.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT = "model-vectorizer-as-one"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)

<Experiment: artifact_location='/home/mgubuntu/projects/marketing-mlops/04-deployment/web-service-mlflow/artifacts_local/1', creation_time=1691053345395, experiment_id='1', last_update_time=1691053345395, lifecycle_stage='active', name='model-vectorizer-as-one', tags={}>

In [5]:
def read_clean_data(filename: str) -> pd.DataFrame:
    """Read the customer CSV and derive the features used for modelling.

    Steps performed on the loaded frame:
      * 'Kidhome' / 'Teenhome' are mode-imputed, then collapsed to 0/1 flags
        (1 when the customer has at least one such dependant).
      * 'Age_Group' is binned from the age in 2014; a missing 'Year_Birth'
        is replaced by the (truncated) median birth year.
      * 'Onboard_Days' is the number of days between 'Dt_Customer' and
        2014-06-30; a missing 'Dt_Customer' is replaced by the modal value.
      * 'ID', 'Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue' and
        the intermediate 'Age' column are dropped.

    Parameters
    ----------
    filename : str
        Path of the CSV file to load.

    Returns
    -------
    pd.DataFrame
        The cleaned frame; all columns not listed above are passed through
        unchanged.

    Raises
    ------
    KeyError
        If one of the columns referenced above is absent from the file.
    """
    df = pd.read_csv(filename)

    # Dependants: fill gaps with the most frequent value, then reduce the
    # counts to a binary "has any" indicator.
    dependants = ['Kidhome', 'Teenhome']
    df[dependants] = df[dependants].fillna(df[dependants].mode().iloc[0])
    df[dependants] = (df[dependants] > 0).astype(int)

    # Age, assuming the analysis was conducted in 2014.
    # The year is used arithmetically rather than round-tripped through
    # pd.to_datetime(..., format='%Y'): a column that contained NaN is float,
    # and a value such as 1970.0 does not parse with the '%Y' format.
    now = 2014
    birth_year = df['Year_Birth'].fillna(int(df['Year_Birth'].median()))
    df['Age'] = now - birth_year

    # Left-closed bins: [18, 28), [28, 38), ..., [58, 65), [65, inf).
    # NOTE(review): because right=False, the label '58-65' actually covers
    # ages 58-64 and age 65 falls in '65+'. The labels are kept as they are
    # because they end up as feature values downstream. Ages below 18 fall
    # outside every bin and become NaN.
    bins = [18, 28, 38, 48, 58, 65, np.inf]
    labels = ['18-27', '28-37', '38-47', '48-57', '58-65', '65+']
    df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

    # Days since the customer enrolled, measured at the end of the fiscal year.
    # NOTE(review): the date format is inferred by pandas; if the file stores
    # day-first dates (dd-mm-yyyy) an explicit format should be passed - confirm
    # against the data.
    enrolled = df['Dt_Customer'].fillna(df['Dt_Customer'].mode().iloc[0])
    df['Dt_Customer'] = pd.to_datetime(enrolled)
    end_fiscal = datetime(2014, 6, 30)
    df['Onboard_Days'] = (end_fiscal - df['Dt_Customer']).dt.days

    # Dropping redundant features
    red_ftrs_1 = ["ID", "Year_Birth", "Dt_Customer", "Z_CostContact", "Z_Revenue", 'Age']
    df = df.drop(red_ftrs_1, axis=1)

    return df

In [6]:
def miss_norm(df: pd.DataFrame) -> pd.DataFrame:
    """
    Handle missing values then normalize data and return a dictionary
    """
    
    # List of categorical and numeric features
    categ_ftrs_1 = ['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Age_Group']

    num_ftrs_1 = ['Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'Onboard_Days']
    
    # Drop the target variable
    df = df.drop('Response', axis=1)
        
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('normalize', PowerTransformer(method='yeo-johnson')),
    ])
    
    ct = ColumnTransformer([
        ('num_trans', num_transformer, num_ftrs_1),
        ('cat_trans', SimpleImputer(strategy='most_frequent'), categ_ftrs_1)
    ])
        
    df = pd.DataFrame(ct.fit_transform(df), 
                      columns=num_ftrs_1+categ_ftrs_1)
        
    # Ensure that the final df features are in the right data types
    df[categ_ftrs_1] = df[categ_ftrs_1].astype('str')
    df[num_ftrs_1] = df[num_ftrs_1].astype('float')
     
    # Return a dictionary    
    return df.to_dict(orient='records')

In [7]:
df_train = read_clean_data('data/training_data.csv')
df_val = read_clean_data('data/validation_data.csv')

target='Response'

y_train = df_train[target].values
y_val = df_val[target].values

dict_train = miss_norm(df_train)
dict_val = miss_norm(df_val)

In [8]:
with mlflow.start_run():
    # Define and log parameters
    params = {
        'min_samples_leaf': 8, 
        'min_samples_split': 14, 
        'n_estimators': 90
    }
    mlflow.log_params(params)
    
    # Vectorizer and classifier are chained so they are fitted, applied and
    # logged as a single object
    pipeline = make_pipeline(
        DictVectorizer(),
        GradientBoostingClassifier(**params, random_state=42)
    )
    
    pipeline.fit(dict_train, y_train)

    # Hard labels feed the threshold metrics; the positive-class probability
    # (second column of predict_proba for a binary target) feeds the ranking
    # metric.
    y_pred = pipeline.predict(dict_val)
    y_score = pipeline.predict_proba(dict_val)[:, 1]
    
    # Calculate and log the evaluation metrics.
    # The AUC was previously logged as 'pr_auc' although it is computed with
    # roc_auc_score, and it was computed on hard labels; it is now computed on
    # scores and logged under the name of the metric it actually is.
    metrics = {
        'f1': f1_score(y_val, y_pred), 
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, y_score),
        'accuracy': accuracy_score(y_val, y_pred)
    }
    mlflow.log_metrics(metrics)
    
    # Log the model and the vectorizer in the pipeline as one
    mlflow.sklearn.log_model(pipeline, artifact_path='model')



In [9]:
with mlflow.start_run():
    # Define and log parameters
    params = {
        'min_samples_leaf': 8, 
        'min_samples_split': 14, 
        'n_estimators': 90
    }
    mlflow.log_params(params)
    
    dv = DictVectorizer()
    model = GradientBoostingClassifier(**params, random_state=42)
    
    # The vectorizer is fitted on the training records only and then reused
    # unchanged on the validation records
    X_train = dv.fit_transform(dict_train)
    model.fit(X_train, y_train)
    
    X_val = dv.transform(dict_val)

    # Hard labels feed the threshold metrics; the positive-class probability
    # (second column of predict_proba for a binary target) feeds the ranking
    # metric.
    y_pred = model.predict(X_val)
    y_score = model.predict_proba(X_val)[:, 1]
    
    # Calculate and log the evaluation metrics.
    # The AUC was previously logged as 'pr_auc' although it is computed with
    # roc_auc_score, and it was computed on hard labels; it is now computed on
    # scores and logged under the name of the metric it actually is.
    metrics = {
        'f1': f1_score(y_val, y_pred), 
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, y_score),
        'accuracy': accuracy_score(y_val, y_pred)
    }
    mlflow.log_metrics(metrics)
    
    # Here the model is logged on its own ...
    mlflow.sklearn.log_model(model, artifact_path='model')
    
    # ... and the fitted vectorizer is pickled to the working directory and
    # attached to the same run as a separate artifact
    with open('dict_vectorizer.bin', 'wb') as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact('dict_vectorizer.bin')    

In [10]:
# Total wall-clock time since the first cell ran, reported in minutes
SECONDS_PER_MINUTE = 60
elapsed_time = (time.time() - start_time) / SECONDS_PER_MINUTE
print(f"Execution time: {elapsed_time} minutes")

Execution time: 0.5172260959943136 minutes
