In [11]:
import mlflow
import numpy as np
import pandas as pd
from mlflow.models import infer_signature
from mlflow.models import validate_serving_input
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the bundled Iris dataset and gather the feature columns together with
# the class label into one DataFrame; the label ends up as the last column.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)


In [3]:
# Preview the first five rows to sanity-check the column layout
data.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# Report how many nulls each column contains
missing_per_column = data.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Standardize every column except the last one (the label).
# Note: the scaler is fit on all rows of `data` here.
feature_frame = data.iloc[:, :-1]
scaler = StandardScaler()
scaler.fit(feature_frame)
X = scaler.transform(feature_frame)
y = data['target']



Missing values:
 sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [5]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Fitting the scaler on every row before splitting lets test-set
# statistics leak into the preprocessing. The row assignment depends only on
# the sample count and the seed, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data.iloc[:, :-1], data['target'], test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)  # NumPy arrays, same as before
X_test = scaler.transform(X_test_raw)

In [None]:
# Point the MLflow client at the tracking server; this assumes a server is
# already listening on this address. The URI must not carry stray whitespace:
# it is echoed verbatim into the run/experiment links MLflow prints.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

2024/12/28 14:13:19 INFO mlflow.tracking.fluent: Experiment with name 'Basic ML project' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/359615374808457462', creation_time=1735413199073, experiment_id='359615374808457462', last_update_time=1735413199073, lifecycle_stage='active', name='Basic ML project', tags={}>

In [None]:
# Hyper-parameter search without MLflow tracking: a dry run of the grid.
model = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search scored by 5-fold cross-validated accuracy, using all cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Keep the refitted winner around for later inspection.
best_model = grid_search.best_estimator_
print("\nBest parameters found by GridSearchCV:", grid_search.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters found by GridSearchCV: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}


In [16]:
# Select the experiment that the following runs are recorded under
# (the recorded output shows MLflow creating it when it does not exist yet).
mlflow.set_experiment(experiment_name="ML project")

2024/12/28 14:40:30 INFO mlflow.tracking.fluent: Experiment with name 'ML project' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/802956528055571557', creation_time=1735414830297, experiment_id='802956528055571557', last_update_time=1735414830297, lifecycle_stage='active', name='ML project', tags={}>

In [26]:
with mlflow.start_run():
    # Base estimator; the fixed seed keeps the fitted forest repeatable.
    model = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }

    # Log the grid search parameter grid as a string (since it's nested)
    mlflow.log_param("param_grid", str(param_grid))

    # Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("\nBest parameters found by GridSearchCV:", grid_search.best_params_)
    best_model = grid_search.best_estimator_

    # Log the winning hyper-parameters as individual params
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the training set
    y_train_pred = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print("\nTraining Accuracy:", train_accuracy)

    # Evaluate on the held-out test set
    y_test_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # Log both accuracies as metrics of this run
    mlflow.log_metrics({"train_accuracy": train_accuracy, "test_accuracy": test_accuracy})

    # Tag the run so it can be filtered in the MLflow UI
    mlflow.set_tags({"exp": "basic ML experiment tracking"})

    # Describe the model interface from its real input and its real *output*:
    # the predictions (not the label Series) are what the model returns when
    # it is served. `infer_signature` comes from the notebook's import cell.
    signature = infer_signature(X_train, y_train_pred)

    # Log and register the model. A few rows are enough as an input example;
    # passing the whole test set would store every row alongside the model.
    # NOTE(review): only the forest is logged here. The StandardScaler used to
    # build X_train is not part of the artifact, so callers must send features
    # that are already standardized.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="iris",
        signature=signature,
        input_example=X_test[:5],
        registered_model_name="RandomForest"
    )

Fitting 5 folds for each of 27 candidates, totalling 135 fits

Best parameters found by GridSearchCV: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}

Training Accuracy: 0.975
Test Accuracy: 1.0


Successfully registered model 'RandomForest'.
2024/12/28 14:48:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 1


🏃 View run vaunted-calf-865 at:  http://127.0.0.1:5000/#/experiments/802956528055571557/runs/838d609496554cfe9e363f6ec41e8471
🧪 View experiment at:  http://127.0.0.1:5000/#/experiments/802956528055571557


Created version '1' of model 'RandomForest'.


## Multiple runs with different parameters in the same experiment, registering version 2 of the model
#### (Version 2 because the model is registered under the same name as in the earlier run.)

In [27]:
with mlflow.start_run():
    # Second configuration: a different seed and a narrower grid.
    model = RandomForestClassifier(random_state=22)
    param_grid = {
        'n_estimators': [100, 150],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10]
    }

    # Log the grid as a string (it is nested). The key matches the one used by
    # the other run in this experiment so both grids appear under the same
    # param column when runs are compared.
    mlflow.log_param("param_grid", str(param_grid))

    # Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("\nBest parameters found by GridSearchCV:", grid_search.best_params_)
    best_model = grid_search.best_estimator_

    # Log the winning hyper-parameters as individual params
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the training set
    y_train_pred = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print("\nTraining Accuracy:", train_accuracy)

    # Evaluate on the held-out test set
    y_test_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # Log both accuracies as metrics of this run
    mlflow.log_metrics({"train_accuracy": train_accuracy, "test_accuracy": test_accuracy})

    # Tag the run so it can be filtered in the MLflow UI
    mlflow.set_tags({"exp": "basic ML experiment tracking"})

    # Describe the model interface from its real input and its real *output*:
    # the predictions (not the label Series) are what the model returns when
    # it is served. `infer_signature` comes from the notebook's import cell.
    signature = infer_signature(X_train, y_train_pred)

    # Log the model under the already-used registry name (the recorded output
    # shows this creating a new version). A few rows are enough as an input
    # example; the whole test set would be stored alongside the model.
    # NOTE(review): only the forest is logged here. The StandardScaler used to
    # build X_train is not part of the artifact, so callers must send features
    # that are already standardized.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="iris",
        signature=signature,
        input_example=X_test[:5],
        registered_model_name="RandomForest"
    )

Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best parameters found by GridSearchCV: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 150}

Training Accuracy: 1.0
Test Accuracy: 1.0


Registered model 'RandomForest' already exists. Creating a new version of this model...
2024/12/28 15:04:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 2


🏃 View run clumsy-ox-990 at:  http://127.0.0.1:5000/#/experiments/802956528055571557/runs/1e040d6a5cd548d2a6cd2986500cdcca
🧪 View experiment at:  http://127.0.0.1:5000/#/experiments/802956528055571557


Created version '2' of model 'RandomForest'.


## Inference from MLflow artifacts

In [28]:
# Build the model URI from the most recent run of this session rather than a
# pasted run ID, so the cell still resolves after the training cells are
# re-executed or the notebook is pointed at another tracking server.
# "iris" is the artifact path the training cells log the model under.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError("No MLflow run found in this session; execute a training cell first.")
model_uri = f"runs:/{last_run.info.run_id}/iris"

# The model is logged with an input example. MLflow converts
# it into the serving payload format for the deployed model endpoint,
# and saves it to 'serving_input_payload.json'.
# The literal below holds 30 rows of four feature values each under the
# "inputs" key. NOTE(review): presumably copied from that JSON file of an
# earlier run; the values look like the standardized test features.
serving_payload = """{
  "inputs": [
    [
      0.310997534138703,
      -0.5923730118389191,
      0.5354085615261401,
      0.0008775478952676988
    ],
    [
      -0.1736739476359015,
      1.7095946507475455,
      -1.169714245881954,
      -1.18381211071744
    ],
    [
      2.249683461237124,
      -1.0527665443562113,
      1.7858319536254093,
      1.448831575088577
    ],
    [
      0.18982966369505214,
      -0.36217624558027245,
      0.42173370769893376,
      0.3957741007661703
    ],
    [
      1.1591726272442622,
      -0.5923730118389191,
      0.5922459884397431,
      0.2641419164758693
    ],
    [
      -0.5371775589668552,
      0.7888075857129598,
      -1.2833890997091604,
      -1.052179926427139
    ],
    [
      -0.29484181807955345,
      -0.36217624558027245,
      -0.08980313452349442,
      0.13250973218556866
    ],
    [
      1.2803404976879142,
      0.09821728693702086,
      0.7627582691805523,
      1.448831575088577
    ],
    [
      0.4321654045823549,
      -1.973553609390797,
      0.42173370769893376,
      0.3957741007661703
    ],
    [
      -0.052506077192250644,
      -0.8225697780975647,
      0.08070914621731488,
      0.0008775478952676988
    ],
    [
      0.7956690159133086,
      0.3284140531956675,
      0.7627582691805523,
      1.0539350222176747
    ],
    [
      -1.2641847816287635,
      -0.1319794793216258,
      -1.3402265266227635,
      -1.4470764792980415
    ],
    [
      -0.4160096885232043,
      1.0190043519716065,
      -1.3970639535363667,
      -1.3154442950077407
    ],
    [
      -1.1430169111851116,
      0.09821728693702086,
      -1.2833890997091604,
      -1.4470764792980415
    ],
    [
      -0.9006811702978099,
      1.7095946507475455,
      -1.2833890997091604,
      -1.18381211071744
    ],
    [
      0.5533332750260058,
      0.5586108194543131,
      0.5354085615261401,
      0.5274062850564712
    ],
    [
      0.7956690159133086,
      -0.1319794793216258,
      1.1606202575757745,
      1.3171993907982766
    ],
    [
      -0.29484181807955345,
      -1.282963310614858,
      0.08070914621731488,
      -0.13075463639503299
    ],
    [
      -0.1736739476359015,
      -0.5923730118389191,
      0.42173370769893376,
      0.13250973218556866
    ],
    [
      0.6745011454696578,
      -0.5923730118389191,
      1.0469454037485681,
      1.3171993907982766
    ],
    [
      -1.3853526520724144,
      0.3284140531956675,
      -1.2265516727955572,
      -1.3154442950077407
    ],
    [
      0.310997534138703,
      -0.1319794793216258,
      0.6490834153533465,
      0.7906706536370729
    ],
    [
      -1.0218490407414607,
      0.7888075857129598,
      -1.2265516727955572,
      -1.052179926427139
    ],
    [
      0.6745011454696578,
      -0.5923730118389191,
      1.0469454037485681,
      1.1855672065079756
    ],
    [
      2.492019202124427,
      1.7095946507475455,
      1.5016448190573937,
      1.0539350222176747
    ],
    [
      1.0380047568006114,
      -0.1319794793216258,
      0.8195956960941558,
      1.448831575088577
    ],
    [
      1.0380047568006114,
      -1.282963310614858,
      1.1606202575757745,
      0.7906706536370729
    ],
    [
      1.1591726272442622,
      0.3284140531956675,
      1.2174576844893779,
      1.448831575088577
    ],
    [
      -1.2641847816287635,
      -0.1319794793216258,
      -1.3402265266227635,
      -1.18381211071744
    ],
    [
      -1.2641847816287635,
      0.09821728693702086,
      -1.2265516727955572,
      -1.3154442950077407
    ]
  ]
}"""

# Validate the serving payload works on the model. The call is the cell's last
# expression, so its return value (the recorded output shows an array with one
# predicted class per payload row) is displayed below the cell.
validate_serving_input(model_uri, serving_payload)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])