# Basic ZenML quickstart guide stuff

- introduction to what the quickstart is about
- what will be covered here / what we'll do

## Intro

- what is ZenML
- diagram showing the quickstart workflow etc

# Installation

things that need installing

In [3]:
#TODO: add things relating to cloudflare pipelines etc and zenml installation

In [None]:
# Install the sklearn and mlflow ZenML integrations (-y skips the prompt)
!zenml integration install sklearn mlflow -y
# Initialize ZenML in the current working directory
!zenml init

# automatically restart kernel
# (presumably so the freshly installed packages become importable — confirm)
import IPython
IPython.Application.instance().kernel.do_shutdown(restart=True)

# Explain the example / what we're doing

- the dataset we're using
- how this is part of a common workflow
- trying some things out / train some baseline models

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

from zenml import step
from zenml.steps import Output


@step(enable_cache=True)
def training_data_loader() -> (
    Output(
        X_train=pd.DataFrame,
        X_test=pd.DataFrame,
        y_train=pd.Series,
        y_test=pd.Series,
    )
):
    """Download the Census Income dataset and return a train/test split.

    The raw CSV is read from the UCI archive, rows containing missing
    values (encoded as ``?``) are dropped, the categorical columns are
    one-hot encoded (dropping the first level of each), and the ``income``
    column is turned into a binary target (1 for ``>50K``, 0 otherwise).

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` produced by an 80/20
        split with a fixed random seed.
    """
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    target = "income"

    # The file has no header row, so the column names are supplied here.
    header = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        target,
    ]

    # Columns that hold categories rather than numbers.
    nominal_features = [
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
    ]

    # Read the data, treating "?" as missing, and discard incomplete rows.
    frame = pd.read_csv(
        source_url, names=header, na_values="?", skipinitialspace=True
    ).dropna()

    # One-hot encode the categorical columns (the originals are replaced).
    frame = pd.get_dummies(frame, columns=nominal_features, drop_first=True)

    # Binarize the target: 1 for ">50K", 0 for everything else.
    frame[target] = frame[target].apply(
        lambda label: 1 if label.strip() == ">50K" else 0
    )

    features = frame.drop(target, axis=1)
    labels = frame[target]

    # Hold out 20% of the rows for testing; the seed keeps the split stable.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

Explain that, to try the step out, we can call it directly as a plain Python function, independently of any ZenML pipeline

In [5]:
X_train, X_test, y_train, y_test = training_data_loader()

In [6]:
X_train

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
19863,53,168539,5,0,0,70,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
24342,49,56841,13,0,0,70,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
10027,28,154571,10,0,0,40,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
25710,60,188236,6,0,0,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
13824,53,87158,9,0,0,40,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32171,40,67852,9,0,0,35,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
5875,41,120539,10,3103,0,40,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
935,37,176900,9,0,0,99,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
17056,56,51662,7,0,0,40,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False


Training two models now

- SGD Classifier
- Random Forest Classifier

And using MLflow to track the hyperparams and metrics

In [7]:
import mlflow

from sklearn.base import ClassifierMixin
from sklearn.ensemble import RandomForestClassifier

from zenml.client import Client

# The trainer steps below are bound to the experiment tracker of the active
# stack via `experiment_tracker.name`, so fail early with a clear message if
# the active stack does not provide one.
experiment_tracker = Client().active_stack.experiment_tracker
if experiment_tracker is None:
    raise RuntimeError(
        "The active ZenML stack has no experiment tracker. Register and "
        "activate a stack that contains an MLflow experiment tracker before "
        "defining the trainer steps."
    )


@step(enable_cache=True, experiment_tracker=experiment_tracker.name)
def random_forest_trainer_mlflow(
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> ClassifierMixin:
    """Fit a default scikit-learn random forest and track the run in MLflow.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Turn on MLflow autologging for scikit-learn before fitting.
    mlflow.sklearn.autolog()

    features = X_train.to_numpy()
    targets = y_train.to_numpy()

    classifier = RandomForestClassifier()
    classifier.fit(features, targets)

    # Accuracy on the data the model was fitted on.
    print(f"Train accuracy: {classifier.score(features, targets)}")
    return classifier

from sklearn.linear_model import SGDClassifier


@step(enable_cache=True, experiment_tracker=experiment_tracker.name)
def sgd_trainer_mlflow(
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> ClassifierMixin:
    """Fit a default scikit-learn SGD classifier and track the run in MLflow.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``SGDClassifier``.
    """
    # Turn on MLflow autologging for scikit-learn before fitting.
    mlflow.sklearn.autolog()

    features = X_train.to_numpy()
    targets = y_train.to_numpy()

    classifier = SGDClassifier()
    classifier.fit(features, targets)

    # Accuracy on the data the model was fitted on.
    accuracy = classifier.score(features, targets)
    print(f"Train accuracy: {accuracy}")
    return classifier

Now adding an evaluator to return the best performing of the two models.

In [8]:
@step
def evaluator(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    model1: ClassifierMixin,
    model2: ClassifierMixin,
) -> ClassifierMixin:
    """Score two classifiers on the test set and return the more accurate one.

    Args:
        X_test: Test features.
        y_test: Test labels.
        model1: First fitted classifier.
        model2: Second fitted classifier.

    Returns:
        ``model1`` if its test accuracy is strictly higher, otherwise
        ``model2`` (so ties go to ``model2``).
    """
    features = X_test.to_numpy()
    targets = y_test.to_numpy()

    first_score = model1.score(features, targets)
    second_score = model2.score(features, targets)

    print(f"Test accuracy ({type(model1).__name__}): {first_score}")
    print(f"Test accuracy ({type(model2).__name__}): {second_score}")

    if first_score > second_score:
        return model1
    return model2

Define a step that registers the best model in our model registry

In [11]:
from zenml.integrations.mlflow.steps.mlflow_registry import (
    mlflow_register_model_step,
)

# Pre-configure the MLflow registration step with the name and description
# under which the model is registered.
register_model = mlflow_register_model_step.with_options(
    parameters={
        "name": "zenml-quickstart-model",
        "description": "The first run of the Quickstart pipeline.",
    }
)

Now we can define the pipeline itself

- explain a bit about pipelines

In [12]:
from zenml import pipeline

@pipeline(enable_cache=True)
def train_and_register_model_pipeline() -> ClassifierMixin:
    """Train two classifiers, keep the better one, and register it.

    The pipeline loads the data, trains a random forest and an SGD
    classifier on the training split, compares them on the test split,
    and passes the winner to the model registration step.

    Returns:
        The output of the ``evaluator`` step (the better of the two models).
    """
    X_train, X_test, y_train, y_test = training_data_loader()

    # Both trainers consume the same training split.
    forest_model = random_forest_trainer_mlflow(
        X_train=X_train, y_train=y_train
    )
    sgd_model = sgd_trainer_mlflow(X_train=X_train, y_train=y_train)

    winner = evaluator(
        X_test=X_test,
        y_test=y_test,
        model1=forest_model,
        model2=sgd_model,
    )
    register_model(winner)
    return winner

Register our local stack that's able to handle the code we've written above

- go through the different parts of it
- also diagrams

In [None]:
# Register the MLflow experiment tracker
!zenml experiment-tracker register mlflow_tracker --flavor=mlflow

# Register the MLflow model registry
!zenml model-registry register mlflow_registry --flavor=mlflow

# Register the MLflow model deployer
!zenml model-deployer register mlflow_deployer --flavor=mlflow

# Register a new stack with the new stack components
!zenml stack register quickstart_stack -a default\
                                       -o default\
                                       -d mlflow_deployer\
                                       -e mlflow_tracker\
                                       -r mlflow_registry\

!zenml stack set quickstart_stack

Run the pipeline

In [13]:
train_and_register_model_pipeline()

[33mThe [0m[33m@step[33m decorator that you used to define your bento_builder_step step is deprecated. Check out our docs https://docs.zenml.io for information on how to define steps in a more intuitive and flexible way![0m
[33mThe [0m[33m@step[33m decorator that you used to define your  step is deprecated. Check out our docs https://docs.zenml.io for information on how to define steps in a more intuitive and flexible way![0m




Reloading configuration file /Users/strickvl/coding/zenml/repos/zenml/examples/quickstart/new_quickstart/.zen/config.yaml
Registered pipeline train_and_register_model_pipeline (version 2).
Running pipeline train_and_register_model_pipeline on stack quickstart_stack (caching enabled)
Step training_data_loader has started.
Using cached version of training_data_loader.
Step random_forest_trainer_mlflow has started.


INFO:root:copying /var/folders/49/7x17fsrn4knfk852z31cs_vm0000gn/T/tmp2pm77fmg/training_roc_curve.png -> /Users/strickvl/Library/Application Support/zenml/local_stores/64d60faf-1bab-484a-9456-2521c2fe48a9/mlruns/726261655699474961/446416b37faa44658198fadae9c8d4bd/artifacts
INFO:root:copying /var/folders/49/7x17fsrn4knfk852z31cs_vm0000gn/T/tmp2pm77fmg/training_precision_recall_curve.png -> /Users/strickvl/Library/Application Support/zenml/local_stores/64d60faf-1bab-484a-9456-2521c2fe48a9/mlruns/726261655699474961/446416b37faa44658198fadae9c8d4bd/artifacts
INFO:root:copying /var/folders/49/7x17fsrn4knfk852z31cs_vm0000gn/T/tmp2pm77fmg/training_confusion_matrix.png -> /Users/strickvl/Library/Application Support/zenml/local_stores/64d60faf-1bab-484a-9456-2521c2fe48a9/mlruns/726261655699474961/446416b37faa44658198fadae9c8d4bd/artifacts


Train accuracy: 1.0
[1;35mStep [0m[33mrandom_forest_trainer_mlflow[1;35m has finished in 16.540s.[0m
[1;35mStep [0m[33msgd_trainer_mlflow[1;35m has started.[0m
[1;35mUsing cached version of [0m[33msgd_trainer_mlflow[1;35m.[0m
[1;35mStep [0m[33mevaluator[1;35m has started.[0m
Test accuracy (RandomForestClassifier): 0.8491629371788496
Test accuracy (SGDClassifier): 0.7851815017404277
[1;35mStep [0m[33mevaluator[1;35m has finished in 1.047s.[0m
[1;35mStep [0m[33mmlflow_register_model_step[1;35m has started.[0m
[1;35mMLflow model registry does not take a version as an argument. Registering a new version for the model [0m[33m'zenml-quickstart-model'[1;35m a version will be assigned automatically.[0m


2023/06/14 15:33:42 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: zenml-quickstart-model, version 2


Registered model zenml-quickstart-model with version 2 from source file:///Users/strickvl/Library/Application Support/zenml/local_stores/64d60faf-1bab-484a-9456-2521c2fe48a9/mlruns/726261655699474961/446416b37faa44658198fadae9c8d4bd/artifacts/model.
Step mlflow_register_model_step has finished in 0.696s.
Pipeline run train_and_register_model_pipeline-2023_06_14-13_33_21_772657 has finished in 21.016s.
Dashboard URL: http://127.0.0.1:8237/workspaces/default/pipelines/105228ad-159e-47e9-adb4-3966f9817316/runs


Talk about the pipeline output

Now that we've trained our models and found the best one, we want to deploy it and run some inference against the deployed model

In [None]:
from zenml.integrations.mlflow.steps.mlflow_deployer import mlflow_model_registry_deployer_step
from zenml.integrations.mlflow.steps.mlflow_registry import mlflow_register_model_step
from zenml.model_registries.base_model_registry import ModelRegistryModelMetadata

# Pre-configure the MLflow deployer step with the name of the registered
# model it should look up.
model_deployer = mlflow_model_registry_deployer_step.with_options(
    parameters={"registry_model_name": "zenml-quickstart-model"}
)

Something about services + why we're doing it that way

In [None]:
from zenml.services import BaseService
from zenml.client import Client


@step(enable_cache=False)
def prediction_service_loader() -> BaseService:
    """Load a running model server from the active stack's model deployer.

    The model deployer is asked for running servers filtered by the
    pipeline name ``train_and_register_model_pipeline``; the first match
    is returned.

    Returns:
        The first running model server found.

    Raises:
        RuntimeError: If the active stack has no model deployer, or if no
            running model server matches the filter.
    """
    pipeline_name = "train_and_register_model_pipeline"

    # Named `deployer` so it does not shadow the module-level
    # `model_deployer` step.
    deployer = Client().active_stack.model_deployer
    if deployer is None:
        raise RuntimeError(
            "The active ZenML stack has no model deployer. Register and "
            "activate a stack that contains one before loading a "
            "prediction service."
        )

    # NOTE(review): the deployer step is invoked from
    # `register_and_deploy_model`, so this filter may need to use that
    # pipeline's name instead — confirm against how servers are tagged.
    services = deployer.find_model_server(
        pipeline_name=pipeline_name,
        running=True,
    )
    if not services:
        raise RuntimeError(
            f"No running model server found for pipeline "
            f"'{pipeline_name}'. Make sure the model has been deployed and "
            f"the server is still running."
        )
    return services[0]

@step
def predictor(
    service: BaseService,
    data: pd.DataFrame,
) -> Output(predictions=list):
    """Run an inference request against a prediction service.

    Args:
        service: The prediction service to query; it is started first if it
            is not already running.
        data: Rows to run inference on; sent to the service as a NumPy
            array.

    Returns:
        A single-element list wrapping the list of predictions.
    """
    service.start(timeout=10)  # should be a NOP if already started
    prediction = service.predict(data.to_numpy())

    # Only reduce with argmax when the service returned per-class scores
    # (two or more dimensions). A 1-D result already holds one prediction
    # per row, and argmax would collapse it into a single index.
    if prediction.ndim > 1:
        prediction = prediction.argmax(axis=-1)

    print(f"Prediction is: {[prediction.tolist()]}")
    return [prediction.tolist()]

Explain our new pipeline

In [None]:
@pipeline
def register_and_deploy_model() -> None:
    """Train and register the best model, deploy it, and run inference.

    The pipeline runs the training/registration pipeline, hands its result
    to the model deployer step, loads the deployed prediction service, and
    sends the held-out test features to it.
    """
    # NOTE(review): `.after(...)` on a step and calling one pipeline from
    # inside another depend on the ZenML version in use — confirm both are
    # supported before relying on this ordering.
    prediction_service_loader.after(model_deployer)
    best_model = train_and_register_model_pipeline()
    model_deployer(best_model)

    # training_data_loader returns (X_train, X_test, y_train, y_test).
    # Inference needs a feature DataFrame, so take X_test (second output),
    # not the label Series in third position.
    _, inference_data, _, _ = training_data_loader()

    model_deployment_service = prediction_service_loader()
    predictor(service=model_deployment_service, data=inference_data)



In [None]:
!zenml model-registry models list

In [None]:
!zenml model-registry models list-versions zenml-quickstart-model

In [None]:
# Describe a deployed model server by its UUID.
# NOTE(review): the UUID below is hard-coded and presumably specific to one
# environment — replace it with the ID of your own model server.
!zenml model-deployer models describe "22ad3957-57d9-42d0-9bce-4d508191dafd"

In [None]:
register_and_deploy_model()