MLflow quickstart and proof of concept (PoC)

In [1]:
# Import all necessary libraries
import socket
import mlflow
# import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
                            f1_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
import time

In [2]:
# Notebook-wide configuration: random seed, run tags and MLflow backend
RANDOM_STATE = 1

# Store tracking data in a local SQLite database file.
# To log against an MLflow server endpoint instead, use:
# mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_tracking_uri('sqlite:///mlflow_tracking.db')

# Tags describing where and how the model is trained
hostname = socket.gethostname()
tags = dict(
    host=hostname,
    algorithm='logistic_regression',
    compute_device='cpu',
    model_lifecycle='training',
    operation='model_training',
)

# Activate the experiment; kept as the last expression so the cell displays it
mlflow.set_experiment('learn_mlflow')

2024/03/05 22:45:59 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/03/05 22:45:59 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='file:///d:/Code/learn_mlflow/notebooks/mlruns/2', creation_time=1708793971171, experiment_id='2', last_update_time=1708793971171, lifecycle_stage='active', name='learn_mlflow', tags={}>

In [3]:
# Override a single tag in place, then show the resulting tag set
tags.update(compute_device='gpu')
print(tags)

{'host': 'BRAXIONBUILD', 'algorithm': 'logistic_regression', 'compute_device': 'gpu', 'model_lifecycle': 'training', 'operation': 'model_training'}


In [4]:
# Load the Iris dataset from sklearn as a (features, labels) pair
x, y = datasets.load_iris(return_X_y=True)

# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=RANDOM_STATE
)

# Define the hyperparameters
params = {
    'max_iter': 400,
    'random_state': RANDOM_STATE,
    'n_jobs': 8,
    'multi_class': 'auto'
}

# Train the model; ** unpacks the dictionary into keyword arguments
model = LogisticRegression(**params)
model.fit(x_train, y_train)

# Predict on the test set
y_hat = model.predict(x_test)

# Compute the metrics on the held-out test set.
# Do not put a trailing comma after accuracy_score(...): it would turn the
# result into a 1-tuple instead of a number.
accuracy = accuracy_score(y_test, y_hat)
# average='macro' gives every class the same weight regardless of its size
precision = precision_score(y_test, y_hat, average='macro')
recall = recall_score(y_test, y_hat, average='macro')
f1_s = f1_score(y_test, y_hat, average='macro')
# NOTE(review): MSE is computed on the class labels, so it depends on how the
# classes happen to be numbered; confirm that this metric is actually wanted.
mse = mean_squared_error(y_test, y_hat)

# Metric name -> value mapping
metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1_s,
    'mse': mse
}

In [5]:
# Seconds to wait inside the run before anything is logged.
# NOTE(review): presumably here so the run shows a visible duration; confirm
# whether the delay is still needed.
RUN_DELAY_SECONDS = 15

# If a run is still active (e.g. from a previous execution of this cell), end it
mlflow.end_run()

# Start the MLflow run; the context manager ends the run when the block exits
with mlflow.start_run():
    time.sleep(RUN_DELAY_SECONDS)

    # Add a dynamic tag (overrides the value set earlier in the notebook)
    tags['compute_device'] = 'cpu'

    # Set the tags
    mlflow.set_tags(tags)

    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the metrics
    mlflow.log_metrics(metrics)

    # Infer the model signature (inputs and outputs) from the training data
    # and the model's predictions on it. It is only consumed by the disabled
    # log_model call below.
    signature = mlflow.models.infer_signature(x_train, model.predict(x_train))

    # Model logging/registration is currently disabled. Uncomment to enable:
    #
    # # Log the model
    # model_metadata = mlflow.sklearn.log_model(
    #     sk_model=model,
    #     artifact_path='iris_model',
    #     signature=signature,
    #     input_example=x_train,
    #     registered_model_name='learn_mlflow_model'
    # )