# Hyperparameter-Tuning Tracking

* Create a solution for a ML problem
* Setup the Mlflow server
* Log the hyperparameter tuning using mlflow
* Model inference through mlflow parameters

## Import Libraries

In [20]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from urllib.parse import urlparse

# For Mlflow
import mlflow
import mlflow.sklearn


## Loading Data

In [21]:
tips_df = sns.load_dataset('tips')
print(tips_df.shape)
tips_df.sample(5)

(244, 7)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
12,15.42,1.57,Male,No,Sun,Dinner,2
35,24.06,3.6,Male,No,Sat,Dinner,3
224,13.42,1.58,Male,Yes,Fri,Lunch,2
34,17.78,3.27,Male,No,Sat,Dinner,2
158,13.39,2.61,Female,No,Sun,Dinner,2


## Data Splitting

In [22]:
# Load the Tips dataset from Seaborn
X = tips_df[['tip', 'size']]
y = tips_df['total_bill']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Setting up Mlflow server

Before using this make sure you've executed the following command before running mlflow:<br>
`mlflow ui --backend-store-uri sqlite:///mlflow.db`

In [23]:
#mlflow server --backend-store-uri sqlite:///mlflow.db
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Tips")

2024/04/14 04:37:49 INFO mlflow.tracking.fluent: Experiment with name 'Tips' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/HAIER/Desktop/Python Chilla 2.0/MLOps using Mlflow/mlruns/2', creation_time=1713051469067, experiment_id='2', last_update_time=1713051469067, lifecycle_stage='active', name='Tips', tags={}>

In [24]:
# Just a function to evaluate the performance of the model after training

def evaluate(y,pred):
    rmse = np.sqrt(mean_squared_error(y,pred))
    mae = mean_absolute_error(y,pred)
    r2 = r2_score(y,pred)
    return rmse, mae, r2

## Tracking on Hyperparameter-Tuning

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

# To autolog all parameters, metrics etc
mlflow.sklearn.autolog()

with mlflow.start_run():
    # Define the parameter grid
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    }

    # Initialize the ExtraTreesRegressor
    clf = ExtraTreesRegressor(random_state=42)

    # Initialize mean squared error scorer
    mse_scorer = make_scorer(mean_squared_error)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(clf, param_grid, scoring=mse_scorer, cv=5)

    # Train the GridSearchCV
    grid_search.fit(X_train, y_train)

    # Get the best parameters and best estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Predict on the test set using the best estimator
    y_pred = best_estimator.predict(X_test)

    # Calculate Metrics
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Best Parameters:", best_params)
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 Score:", r2)

    # To manually log some metrics
    mlflow.log_metric("RMSE",rmse)
    mlflow.log_metric("MAE",mae)
    mlflow.log_metric("R2 Score",r2)

2024/04/14 04:39:48 INFO mlflow.sklearn.utils: Logging the 5 best runs, 22 runs will be omitted.


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
RMSE: 7.04291248067798
MAE: 5.256796010316211
R2 Score: 0.4149844553814255


## Check MetaData of Model that's in Mlflow

In [26]:
logged_model = 'runs:/6aad9a0ef06c41f19a30940548d4326d/best_estimator'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

metadata = loaded_model.metadata
print(metadata)

artifact_path: best_estimator
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sklearn
    model_path: model.pkl
    predict_fn: predict
    python_version: 3.11.8
  sklearn:
    code: null
    pickled_model: model.pkl
    serialization_format: cloudpickle
    sklearn_version: 1.2.2
mlflow_version: 2.9.2
model_size_bytes: 802668
model_uuid: 4d11956037364bfc952dd3cecd7be532
run_id: 6aad9a0ef06c41f19a30940548d4326d
signature:
  inputs: '[{"type": "double", "name": "tip"}, {"type": "long", "name": "size"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
  params: null
utc_time_created: '2024-04-13 23:39:34.814772'



## Model Inference
* From the metadata it's clear that model requires 4 inputs of type double
* Also the column names should also be provided in order to map them correctly with input variables

In [33]:
# Predict on a Pandas DataFrame.
input_data = [2.604, 2]

# Create a dictionary with the expected column names as keys
input_dict = {
    'tip': input_data[0],
    'size': input_data[1]
}

loaded_model.predict(input_dict)

array([11.406])