# Training experiments

In this notebook, we will train different models to explore options for solving the problem.

## Import libraries and data

In [21]:
import mlflow
from datetime import datetime
import awswrangler as wr
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from mlflow import MlflowClient

Export the environment variables needed to work with the local MinIO (S3-compatible) storage.

In [2]:
# Export environment variables
# Credentials and endpoints for the S3-compatible store at localhost:9000.
# NOTE(review): the access key and secret are hard-coded here; they look like
# local development defaults - confirm, and never commit real credentials this way.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
# Both endpoint variables point at the same local address.
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [3]:
# Address of the MLflow tracking server used by every MLflow call below.
mlflow_server = "http://localhost:5000"
mlflow.set_tracking_uri(mlflow_server)

Load the data set previously processed in the notebook [data_exploration.ipynb](data_exploration.ipynb).

In [4]:
# Read the processed training CSV from the S3-compatible store.
# NOTE(review): the object key is spelled "clenaned"; it is kept as-is because it
# must match the stored file name - confirm before renaming.
train_data_df = wr.s3.read_csv(
    "s3://mlflow/data/experiments/train/bike_sharing_clenaned_1.csv"
)

In [5]:
# Separate the regression target ("log_count") from the feature columns.
y = train_data_df["log_count"]
X = train_data_df.drop(columns=["log_count"])

## Model training

In [6]:
def get_or_create_experiment(experiment_name):
    """
    Look up an MLflow experiment by name, creating it when it is not found.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the experiment that was found, or of the one just created.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)

    # Nothing registered under this name yet: create it and hand back the new ID.
    if not existing:
        return mlflow.create_experiment(experiment_name)

    return existing.experiment_id

Initialize the MLflow experiment.

In [7]:
# Resolve the experiment that the run logged below is attached to.
experiment_id = get_or_create_experiment("experiment_1")
print(f"Experiment ID: {experiment_id}")

Experiment ID: 366059013759980176


In [8]:
# Timestamped name for the MLflow run, e.g. "best_hyperparam_2024/08/21-23:57:28".
# The format string must not contain a trailing double quote, otherwise the quote
# character becomes part of the run name shown in the MLflow UI.
run_name_parent = "best_hyperparam_" + datetime.today().strftime("%Y/%m/%d-%H:%M:%S")

Split data into training and test sets

In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Show the size of the training and test sets

In [10]:
# Display the shape of each split in the order: X_train, y_train, X_test, y_test.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((7620, 31), (7620,), (3266, 31), (3266,))

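Feature scaling is an important step before training the model: the Lasso penalty is sensitive to the scale of each feature, so the features are standardized to zero mean and unit variance.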

In [11]:
# Define the scaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Define the parameter grid for Lasso Regression
param_grid_lasso = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'fit_intercept': [True, False],
    'max_iter': [1000, 5000, 10000],
}

# Initialize the Lasso Regression model
lasso_model = Lasso()

# Set up Grid Search with 5-fold cross-validation; candidates are ranked by
# negative MSE, so the highest score corresponds to the lowest error.
grid_search_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=param_grid_lasso,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Start the MLflow run; everything logged inside the block belongs to this run.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Perform the grid search and fit the model
    grid_search_lasso.fit(X_train_scaled, y_train)

    # Get the best model from the grid search
    best_lasso_model = grid_search_lasso.best_estimator_

    # Evaluate the best model on the held-out test split
    lasso_predictions = best_lasso_model.predict(X_test_scaled)

    # Calculate metrics
    mse_lasso = mean_squared_error(y_test, lasso_predictions)
    rmse_lasso = root_mean_squared_error(y_test, lasso_predictions)
    mae_lasso = mean_absolute_error(y_test, lasso_predictions)
    r2_lasso = r2_score(y_test, lasso_predictions)

    # Log the best parameters and metrics to MLflow
    mlflow.log_param("best_lasso_alpha", best_lasso_model.alpha)
    mlflow.log_param("best_lasso_fit_intercept", best_lasso_model.fit_intercept)
    mlflow.log_param("best_lasso_max_iter", best_lasso_model.max_iter)

    mlflow.log_metric("lasso_mse", mse_lasso)
    mlflow.log_metric("lasso_rmse", rmse_lasso)
    mlflow.log_metric("lasso_mae", mae_lasso)
    mlflow.log_metric("lasso_r2", r2_lasso)

    # First row of the (scaled) test set, stored with the model as an input example.
    # Kept 2-D (one row) so it has the same layout the model is called with.
    input_example = X_test_scaled[0:1]

    # Define the artifact path
    artifact_path = "best_lasso_model"

    # Infer the model signature from the scaled training matrix and the
    # predictions the model produces for it.
    signature = mlflow.models.infer_signature(X_train_scaled, best_lasso_model.predict(X_train_scaled))

    # Log the best Lasso model to the MLflow server.
    # input_example is passed explicitly: it was previously computed but never
    # handed to log_model, so no example was stored with the model.
    # NOTE: the logged model expects inputs that were already transformed by `scaler`.
    mlflow.sklearn.log_model(
        sk_model=best_lasso_model,
        artifact_path=artifact_path,
        signature=signature,
        input_example=input_example,
        serialization_format='cloudpickle',
        registered_model_name='bike_sharing_model_dev',
        metadata={'model_data_version': 1}
    )

    # Get the URI of the logged model (used later to load and register it)
    model_uri = mlflow.get_artifact_uri(artifact_path)

    # Print the results
    print(f"Best Lasso model logged with MSE: {mse_lasso}, RMSE: {rmse_lasso}, MAE: {mae_lasso}, R²: {r2_lasso}")
    print(f"Best Lasso parameters: {grid_search_lasso.best_params_}")


Successfully registered model 'bike_sharing_model_dev'.
2024/08/22 00:03:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bike_sharing_model_dev, version 1
Created version '1' of model 'bike_sharing_model_dev'.
2024/08/22 00:03:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_hyperparam_2024/08/21-23:57:28" at: http://localhost:5000/#/experiments/366059013759980176/runs/e2333261b5294a78a4cd9ae706018261.
2024/08/22 00:03:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/366059013759980176.


Best Lasso model logged with MSE: 0.7462500317302668, RMSE: 0.8638576455239988, MAE: 0.6747924114375535, R²: 0.6249601213386964
Best Lasso parameters: {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 1000}


## Testing the model

In [16]:
# Reload the model from the artifact URI captured during the training run,
# to check that the stored artifact is usable on its own.
loaded_model = mlflow.sklearn.load_model(model_uri)

In [19]:
# Get a random element from test set
input_example = X_test_scaled[random.randint(0, X_test_scaled.shape[0])] 

print(f"Input example: {input_example}")

Input example: [-0.17362994  0.66050857 -1.24359269  0.75511666 -1.00447194  0.21713469
 -0.17362994 -1.00447194 -0.59331502 -0.28863409 -0.01145648 -0.30021452
 -0.30228766 -0.29551441  3.26637704 -0.30332069 -0.3007337  -0.30666209
 -0.29787071 -0.30306265 -0.3046087  -0.30254614 -0.40571344 -0.41009042
 -0.41096342 -0.40615205 -0.40921663 -0.40899806  0.67971372 -0.36203276
 -1.37319126]


In [20]:
# Present the single sample as a one-row 2-D batch before predicting.
single_row_batch = input_example.reshape(1, -1)
loaded_model.predict(single_row_batch)

array([5.55160635])

## Register the model

In [27]:
client = MlflowClient()

name = "bike_sharing_model_prod"
desc = "Production model for bike sharing demand prediction"

# Create the registry entry only when it is missing. Without this guard the cell
# fails on every re-run because the registered model name already exists.
try:
    client.create_registered_model(name=name, description=desc)
except mlflow.exceptions.MlflowException as exc:
    # Any failure other than "already exists" is a real error and must surface.
    if exc.error_code != "RESOURCE_ALREADY_EXISTS":
        raise

# Describe the version with the estimator's hyperparameters, its class name
# and the test-set MSE computed during training.
tags = best_lasso_model.get_params()
tags["model"] = type(best_lasso_model).__name__
tags["mse"] = mse_lasso

# NOTE(review): the run ID is taken from the third-to-last segment of model_uri,
# which assumes the layout ".../<run_id>/artifacts/<artifact_path>" - confirm
# this holds for the configured artifact store.
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Point the "best-model" alias at the version that was just created.
client.set_registered_model_alias(name, "best-model", result.version)

2024/08/22 00:14:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bike_sharing_model_prod, version 1
