# Training experiments

In this code, we will train different models to explore different options for solving the problem.

## Import libraries and data

In [40]:
import mlflow
from datetime import datetime
import awswrangler as wr
import random
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from mlflow import MlflowClient

Export the necessary environment variables for working with Minio.

In [41]:
# Export environment variables
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [42]:
mlflow_server = "http://localhost:5000"

mlflow.set_tracking_uri(mlflow_server)

Load the processed data from Minio.


In [43]:
X_train = wr.s3.read_csv("s3://data/train/bike_sharing_demand_X_train_scaled.csv")
y_train = wr.s3.read_csv("s3://data/train/bike_sharing_demand_y_train.csv")
X_test = wr.s3.read_csv("s3://data/test/bike_sharing_demand_X_test_scaled.csv")
y_test = wr.s3.read_csv("s3://data/test/bike_sharing_demand_y_test.csv")

# Model training

In [44]:
def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)

Initialize the mlflow experiment

In [45]:
experiment_id = get_or_create_experiment("Bike Sharing Demand")

print(f"Experiment ID: {experiment_id}")

Experiment ID: 1


In [46]:
run_name_parent = "best_hyperparam_"  + datetime.today().strftime('%Y/%m/%d-%H:%M:%S"')

Show the size of the training and test sets

In [47]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((12165, 20), (12165, 1), (5214, 20), (5214, 1))

Data normalization is an important step before training the model. 

In [48]:
# Define the parameter grid for Lasso Regression
param_grid_lasso = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0], 
    'fit_intercept': [True, False],
    'max_iter': [1000, 5000, 10000],
}

# Initialize the Lasso Regression model
lasso_model = Lasso()

# Set up Grid Search with 5-fold cross-validation
grid_search_lasso = GridSearchCV(estimator=lasso_model, param_grid=param_grid_lasso, cv=5, scoring='neg_mean_squared_error')

# Start the MLflow run
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Perform the grid search and fit the model
    grid_search_lasso.fit(X_train, y_train)
    
    # Get the best model from the grid search
    best_lasso_model = grid_search_lasso.best_estimator_
    
    # Make predictions using the best model
    lasso_predictions = best_lasso_model.predict(X_test)
    
    # Calculate metrics
    mse_lasso = mean_squared_error(y_test, lasso_predictions)
    rmse_lasso = root_mean_squared_error(y_test, lasso_predictions)
    mae_lasso = mean_absolute_error(y_test, lasso_predictions)
    r2_lasso = r2_score(y_test, lasso_predictions)
    
    # Log the best parameters and metrics to MLflow
    mlflow.log_param("best_lasso_alpha", best_lasso_model.alpha)
    mlflow.log_param("best_lasso_fit_intercept", best_lasso_model.fit_intercept)
    mlflow.log_param("best_lasso_max_iter", best_lasso_model.max_iter)
    
    mlflow.log_metric("lasso_mse", mse_lasso)
    mlflow.log_metric("lasso_rmse", rmse_lasso)
    mlflow.log_metric("lasso_mae", mae_lasso)
    mlflow.log_metric("lasso_r2", r2_lasso)
    
    # Get the first row of the test set and log it to MLflow
    input_example = X_test[0:1] 
    
    # Define the artifact path
    artifact_path = "best_lasso_model"
    
    # Infer the schema of the input example
    signature = mlflow.models.infer_signature(X_train, best_lasso_model.predict(X_train))
    
    # Log the best Lasso model to the MLflow server
    mlflow.sklearn.log_model(
        sk_model=best_lasso_model,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name='bike_sharing_model_dev',
        metadata = {'model_data_version': 1}
    )
    
    # Get the URI of the logged model
    model_uri = mlflow.get_artifact_uri(artifact_path)
    
    # Print the results
    print(f"Best Lasso model logged with MSE: {mse_lasso}, RMSE: {rmse_lasso}, MAE: {mae_lasso}, R²: {r2_lasso}")
    print(f"Best Lasso parameters: {grid_search_lasso.best_params_}")


Registered model 'bike_sharing_model_dev' already exists. Creating a new version of this model...
2024/08/24 21:19:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bike_sharing_model_dev, version 2
Created version '2' of model 'bike_sharing_model_dev'.
2024/08/24 21:19:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run best_hyperparam_2024/08/24-21:19:04" at: http://localhost:5000/#/experiments/1/runs/f500d701d6a14bdca8c508960041c063.
2024/08/24 21:19:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


Best Lasso model logged with MSE: 0.9834108384340852, RMSE: 0.991670730854796, MAE: 0.7676831654585141, R²: 0.5561891798259417
Best Lasso parameters: {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 1000}


## Testing the model

In [49]:
loaded_model = mlflow.sklearn.load_model(model_uri)

In [50]:
X_test = np.array(X_test)

In [51]:
# Get a random element from test set
input_example = X_test[random.randint(0, X_test.shape[0])] 

print(f"Input example: {input_example}")

Input example: [-1.003541   -0.16956604  0.68381845 -0.19711504 -0.19384702 -1.5526699
 -0.59618097 -0.29878575 -0.01282315 -0.58212799 -0.59402796  1.75445413
 -0.40854189  2.50427011 -0.40771959 -0.40895277 -0.40840489 -0.4120971
 -1.27528153  0.58022529]


In [52]:
int(np.exp(loaded_model.predict(input_example.reshape(1, -1))))

  int(np.exp(loaded_model.predict(input_example.reshape(1, -1))))


137

## Register the model

In [54]:
client = MlflowClient()

name = "bike_sharing_model_prod"
desc = "Production model for bike sharing demand prediction"

client.create_registered_model(name=name, description=desc)

tags = best_lasso_model.get_params()
tags["model"] = type(best_lasso_model).__name__
tags["mse"] = mse_lasso

result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

client.set_registered_model_alias(name, "best-model", result.version)

2024/08/24 21:20:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bike_sharing_model_prod, version 1
