In [1]:
# Import the Client Class
from mlflow.tracking import MlflowClient

In [2]:
# Tracking store: a SQLite database file in the working directory
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Client bound to that store; used for every tracking / registry call below
client = MlflowClient(MLFLOW_TRACKING_URI)

In [3]:
# Check best runs in experiment id 1 (all models)
from mlflow.entities import ViewType

# Keep only active runs above the precision threshold, best precision first.
runs = client.search_runs(
    experiment_ids=["1"],  # documented type is a list of experiment-ID strings
    filter_string='metrics.precision > 0.523',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=7,
    order_by=['metrics.precision DESC']
)

for run in runs:
    metrics = run.data.metrics
    # The filter only guarantees that `precision` exists; a run may not have
    # logged `recall`, so fall back to NaN instead of raising KeyError.
    recall = metrics.get('recall', float('nan'))
    # Adjacent f-strings (rather than backslash continuations inside one
    # literal) keep the source indentation out of the printed line.
    print(
        f"run id: {run.info.run_id}, "
        f"precision: {metrics['precision']:.4f}, "
        f"recall: {recall:.4f}"
    )

run id: f1dd542d7cf04b85ac01b35b016e25df,        precision: 0.5373,        recall: 0.7500
run id: 7d5b48a8fe764b5a8e397dd3c8c96c7d,        precision: 0.5333,        recall: 0.5000


In [4]:
# Promoting models to the model registry.
# The module-level tracking URI is set in addition to the explicit client,
# because later cells call the `mlflow` module API directly
# (e.g. `mlflow.pyfunc.load_model`) rather than going through `client`.
import mlflow
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# Register the default Random Forest Classifier
register_name = 'marketing-campaign-classifier'

# `run_id` must be the bare run ID; the "runs:/" URI is a model URI and is
# kept only as a handle for loading the logged model.
rfc_run_id = '5c2a48899a884f0498d6bac2468626b5'
rfc_log_model = f'runs:/{rfc_run_id}/model'

rfc_model_version = client.create_model_version(
    name=register_name,
    source=f"mlruns/1/{rfc_run_id}/artifacts/model",
    run_id=rfc_run_id,
    description="Default Random Forest Classifier",
)

# Transition the version that was just created to staging. Using the returned
# version number (instead of a hard-coded one) keeps this cell correct when it
# is re-run and the registry assigns a new version.
client.transition_model_version_stage(
    name=register_name,
    version=rfc_model_version.version,
    stage="staging",
    archive_existing_versions=False
)

In [None]:
# Register the GradientBoosting Classifier (best model)
# `run_id` must be the bare run ID; the "runs:/" URI is a model URI and is
# kept only as a handle for loading the logged model.
gbc_run_id = '7671c657fe944c469938a59945f06d53'
gbc_log_model = f'runs:/{gbc_run_id}/model'

gbc_model_version = client.create_model_version(
    name=register_name,
    source=f"mlruns/8/{gbc_run_id}/artifacts/model",
    run_id=gbc_run_id,
    description="Gradient Boosting Classifier turned out to be the 'best' model",
)

# Transition the version that was just created to production. Using the
# returned version number (instead of a hard-coded one) keeps this cell
# correct when it is re-run and the registry assigns a new version.
client.transition_model_version_stage(
    name=register_name,
    version=gbc_model_version.version,
    stage="production",
    archive_existing_versions=False
)

In [5]:
# Show which version the registry currently reports for each stage
register_name = 'marketing-campaign-classifier'
latest_versions = client.get_latest_versions(name=register_name)

for model_version in latest_versions:
    print(f"version: {model_version.version}, stage: {model_version.current_stage}")

version: 8, stage: Archived
version: 2, stage: Production
version: 9, stage: Staging


In [6]:
# Evaluate the registered models on the synthetic dataset to decide which one to promote

In [7]:
import pickle
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (f1_score,
                             recall_score,
                             roc_auc_score,
                             accuracy_score, 
                             precision_score)

In [8]:
# Shared configuration for the data-cleaning step below

# Columns holding the number of dependants at home
dependants = ['Kidhome', 'Teenhome']

# Reference year for computing customer age (analysis assumed to date from 2014)
now = 2014

# Age-group bin edges and their labels.
# NOTE: scrub_data cuts with right=False, so each bin is [lower, upper);
# the '58-65' label therefore covers ages 58 through 64.
bins = [18, 28, 38, 48, 58, 65, np.inf]
labels = ['18-27', '28-37', '38-47', '48-57', '58-65', '65+']

# End of the financial year, used as the reference date for tenure
end_fiscal = datetime(2014, 6, 30)

# Columns dropped once the derived features exist
red_ftrs_1 = [
    "ID",
    "Year_Birth",
    "Dt_Customer",
    "Z_CostContact",
    "Z_Revenue",
    "Response",
    'Age',
]

# Categorical features
categ_ftrs_1 = [
    'Education',
    'Marital_Status',
    'Kidhome',
    'Teenhome',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'Complain',
    'Age_Group',
]

# Numeric features
num_ftrs_1 = [
    'Income',
    'Recency',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
    'Onboard_Days',
]

In [9]:
# Function to do data cleaning and feature preprocessing
def scrub_data(filename):
    """Load the raw campaign CSV and return cleaned features plus the target.

    Relies on the module-level configuration (`dependants`, `now`, `bins`,
    `labels`, `end_fiscal`, `red_ftrs_1`, `categ_ftrs_1`, `num_ftrs_1`).

    Parameters
    ----------
    filename : str or path-like
        CSV file to read with `pd.read_csv`.

    Returns
    -------
    tuple
        `(df, y_data)` where `df` holds the numeric columns (as float)
        followed by the categorical columns (as str), and `y_data` is the
        untouched 'Response' column of the input file.
    """
    df = pd.read_csv(filename)
    y_data = df['Response']

    # Collapse the dependant counts into 0/1 flags, filling gaps with the
    # most frequent value first. The vectorised comparison replaces the
    # element-wise `applymap`, which is deprecated in recent pandas.
    df[dependants] = df[dependants].fillna(df[dependants].mode().iloc[0])
    df[dependants] = df[dependants].gt(0).astype(int)

    # Birth year: fill gaps with the median year and force an integer dtype.
    # A column that contained NaNs is float, and parsing values such as
    # "1970.0" with format='%Y' raises; since only the year is needed, the
    # age is computed from the integer year directly.
    birth_year = df['Year_Birth'].fillna(int(df['Year_Birth'].median())).astype(int)

    # Enrolment date: fill gaps with the most frequent date, then parse
    df['Dt_Customer'] = df['Dt_Customer'].fillna(df['Dt_Customer'].mode().iloc[0])
    df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"])

    # Calculate age relative to the reference year
    df['Age'] = now - birth_year

    # Create age group feature; right=False makes every bin [lower, upper)
    df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

    # Calculate the number of days since customer enrolled
    df['Onboard_Days'] = (end_fiscal - df['Dt_Customer']).dt.days

    # Drop columns that are redundant once the derived features exist
    df = df.drop(red_ftrs_1, axis=1)

    # Impute remaining missing values: median for numeric columns, most
    # frequent value for categorical ones (no scaling is applied here)
    ct = ColumnTransformer([
        ('num_trans', SimpleImputer(strategy='median'), num_ftrs_1),
        ('cat_trans', SimpleImputer(strategy='most_frequent'), categ_ftrs_1)
    ])

    # The transformer returns a plain array, numeric columns first, so the
    # column names are re-attached in that same order
    df = pd.DataFrame(ct.fit_transform(df),
                      columns=num_ftrs_1+categ_ftrs_1)

    # Ensure that the final df features are in the right data types
    df[categ_ftrs_1] = df[categ_ftrs_1].astype('str')
    df[num_ftrs_1] = df[num_ftrs_1].astype('float')

    return (df, y_data)

In [10]:
# Clean the synthetic dataset into a feature frame and the matching target
X_data, y_data = scrub_data(filename="./data/synthetic-data.csv")

In [11]:
# Function to vectorize the data
def preprocess(df, dv):
    """Turn every row of `df` into a dict and pass the records to `dv.transform`.

    `dv` is any object exposing `transform(list_of_dicts)`; its return value
    is handed back unchanged.
    """
    return dv.transform(df.to_dict(orient='records'))
    

In [12]:
# Restore the pickled preprocessor (presumably the vectorizer fitted at
# training time - confirm against the training notebook).
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load artifacts that come from a trusted source.
with open("models/preprocessor.b", "rb") as preprocessor_file:
    dv = pickle.load(preprocessor_file)

# NOTE(review): X_data is overwritten with the vectorized output, so re-running
# this cell without first re-running the scrub_data cell passes the already
# transformed data back into preprocess.
X_data = preprocess(X_data, dv)

In [13]:
def test_model(name, X_data, y_data, stage):
    """Score the registered model `name` at `stage` on the given data.

    Parameters
    ----------
    name : str
        Registered model name.
    X_data :
        Features passed straight to the loaded model's `predict`.
    y_data :
        True labels compared against the rounded predictions.
    stage : str
        Registry stage used to build the `models:/<name>/<stage>` URI.

    Returns
    -------
    set
        A one-element set holding the string
        "precision: <p>, recall: <r>".
        NOTE(review): a dict of metrics was probably intended; the set is
        kept so that existing callers see the same value.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")

    # Round once so both metrics are computed on the same integer labels
    y_label = model.predict(X_data).round()

    # zero_division=0 on both metrics: an undefined score is reported as 0
    # without emitting a warning for only one of them
    precision = precision_score(y_data, y_label, zero_division=0)
    recall = recall_score(y_data, y_label, zero_division=0)
    return {f"precision: {precision}, recall: {recall}"}

In [15]:
%%time 
test_model(name=register_name, stage="Staging", 
           X_data=X_data, y_data=y_data)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: user 807 ms, sys: 40.1 ms, total: 847 ms
Wall time: 852 ms


{'precision: 0.44587458745874586, recall: 0.9996300406955235'}

In [None]:
# Promote the chosen version to production and ask the registry to archive
# the versions already in that stage.
# NOTE(review): the version number is hard-coded, while the evaluation cell
# scored whatever is in "Staging" - confirm that version 2 is the one intended.
client.transition_model_version_stage(
    version=2,
    name=register_name,
    archive_existing_versions=True,
    stage="production",
)