# Experiment Tracking

* Create a solution for a ML problem
* Setup the Mlflow server
* Log the model training using mlflow
* Model inference through mlflow parameters

## Import Libraries

In [14]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from urllib.parse import urlparse

# For Mlflow
import mlflow
import mlflow.sklearn


## Loading Data

In [9]:
iris = sns.load_dataset('iris')
print(iris.shape)
iris.sample(5)

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
93,5.0,2.3,3.3,1.0,versicolor
94,5.6,2.7,4.2,1.3,versicolor
119,6.0,2.2,5.0,1.5,virginica
8,4.4,2.9,1.4,0.2,setosa
123,6.3,2.7,4.9,1.8,virginica


## Data Splitting

In [3]:
# Load the Iris dataset from Seaborn
X = iris.drop('species', axis=1)
y = iris['species']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Setting up Mlflow server

Before using this make sure you've executed the following command before running mlflow:<br>
`mlflow ui --backend-store-uri sqlite:///mlflow.db`

In [4]:
#mlflow server --backend-store-uri sqlite:///mlflow.db
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Iris-Species")

<Experiment: artifact_location='file:///c:/Users/HAIER/Desktop/Python Chilla 2.0/MLOps using Mlflow/mlruns/1', creation_time=1712955036029, experiment_id='1', last_update_time=1712955036029, lifecycle_stage='active', name='Iris-Species', tags={}>

In [5]:
# Just a function to evaluate the performance of the model after training

def evaluate(y,pred):
    accuracy = accuracy_score(y, pred)
    precision = precision_score(y, pred, average='weighted')
    recall = recall_score(y, pred, average='weighted')
    return accuracy, precision, recall

## Experiment Tracking on Model Building/Training

In [7]:
# To autolog all parameters, metrics etc
mlflow.sklearn.autolog()


with mlflow.start_run():

    # To manually log some tags
    mlflow.set_tag("delevoper","arsalan")
    mlflow.set_tag("model","extratrees")

    # Initialize the ExtraTreesClassifier
    clf = ExtraTreesClassifier(n_estimators=100, random_state=42)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Calculate Metrics
    accuracy, precision, recall = evaluate(y_test, y_pred)

    # To manually log some metrics
    # mlflow.log_metric("accuracy",accuracy)
    # mlflow.log_metric("precision",precision)
    # mlflow.log_metric("recall",recall)



## Check MetaData of Model that's in Mlflow

In [13]:
logged_model = 'runs:/c82650a9e5f245c8b2bcebfc1e700e08/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

metadata = loaded_model.metadata
print(metadata)

artifact_path: model
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sklearn
    model_path: model.pkl
    predict_fn: predict
    python_version: 3.11.8
  sklearn:
    code: null
    pickled_model: model.pkl
    serialization_format: cloudpickle
    sklearn_version: 1.2.2
mlflow_version: 2.9.2
model_size_bytes: 374559
model_uuid: 866a49d5475240c3a25b44b0f4a8ec34
run_id: c82650a9e5f245c8b2bcebfc1e700e08
signature:
  inputs: '[{"type": "double", "name": "sepal_length"}, {"type": "double", "name":
    "sepal_width"}, {"type": "double", "name": "petal_length"}, {"type": "double",
    "name": "petal_width"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "object", "shape": [-1]}}]'
  params: null
utc_time_created: '2024-04-12 21:20:58.135100'



## Model Inference
* From the metadata it's clear that model requires 4 inputs of type double
* Also the column names should also be provided in order to map them correctly with input variables

In [17]:
# Predict on a Pandas DataFrame.
input_data = [5.6, 2.7, 4.2, 1.3]

# Create a dictionary with the expected column names as keys
input_dict = {
    'sepal_length': input_data[0],
    'sepal_width': input_data[1],
    'petal_length': input_data[2],
    'petal_width': input_data[3]
}

loaded_model.predict(input_dict)

array(['versicolor'], dtype=object)