In [1]:
import mlflow
import os
from scripts.training_regression_models import Training
from scripts.model_registry import ModelRegistry
from scripts.scoring import Scoring
from scripts.monitoring import Monitoring
from scripts.config_regression import *

## MLflow setup

In [3]:
exp_name = "test"

# Create the local output folder if it is missing. exist_ok=True replaces the
# separate existence check and keeps the cell safe to re-run.
os.makedirs(local_path_save, exist_ok=True)

# Save all metadata in a SQLite db. Artifacts will be saved in the local folder ./mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Set up the MLflow experiment. set_experiment returns an Experiment entity
# (not an id), so keep the object and expose its id under the proper name.
experiment = mlflow.set_experiment(exp_name)
experiment_id = experiment.experiment_id

2022/12/31 14:22:08 INFO mlflow.tracking.fluent: Experiment with name 'test' does not exist. Creating a new experiment.


## Model training
Train one or more ML models with fixed or grid-like parameters (define the desired parameters in the scripts/config_regression.py file; currently only random forest 'rf' and gradient boosting 'gb' are supported)

All results will be tracked in the MLflow UI (from a terminal opened in the project root, run the command 'mlflow ui --backend-store-uri sqlite:///mlflow.db', then browse to http://127.0.0.1:5000/)

In [4]:
# `models` (the models to train) is not defined in this notebook; presumably it
# comes from the star import of scripts.config_regression -- verify.
train = Training(input_data_path_train, local_path_save, year_month_train)

# Run preprocessing and training for every requested model.
train.preprocess_and_train(models)

df shape (3576, 20)
###################### training  lr model #########################
rmse_train =  3.340266027366716 
 rmse_test 16291.005898020625 
 mae_train 2.3235100430914657 
 mae_test 513.3030565867936
###################### training  gb model #########################
  0%|                                                                            | 0/5 [00:00<?, ?trial/s, best loss=?]



                                                                                                                       
 rmse_train = 
5.040272642618933                                                                                                      
rmse_test                                                                                                              
4.8022233511927155                                                                                                     
                                                                                                                       
 rmse_train = 
4.938273994042314                                                                                                      
rmse_test                                                                                                              
4.766068821068128                                                                                                      
          

                                                                                                                       
 rmse_train = 
2.948631454439178                                                                                                      
rmse_test                                                                                                              
4.669732792790688                                                                                                      
[0]	validation_0-rmse:10.70285	validation_1-rmse:10.46221                                                              
[1]	validation_0-rmse:10.08031	validation_1-rmse:9.89532                                                               
[2]	validation_0-rmse:9.50023	validation_1-rmse:9.41953                                                                
[3]	validation_0-rmse:8.96150	validation_1-rmse:8.97089                                                                
[4]	validation_0-rmse:8.4

{'eta': 0.44838218635600224, 'max_depth': 1}

## Model registry management
Identify the best(*) run and register the corresponding model as the 'Production' model.

Archive former model from 'Production' to 'Archived'

(*) the model with the lowest test error that doesn't overfit the data: (train - test)/train < of_treshold

In [None]:
# Threshold used by the overfitting check when selecting the best run.
overfit_threshold = 0.1

# Register the best run's model in the model registry as 'Production'.
model_reg = ModelRegistry(exp_name, year_month_train, model_name_pref)
model_reg.register_best_run(of_treshold=overfit_threshold)

# Move the previous model from 'Production' to 'Archived'.
model_reg.archive_models(year_month_monitor)

## Scoring 
Score latest available data using the Production model identified above, save scored data

In [None]:
# Build the scorer from the training month, the model-name prefix and the
# local save path.
scoring = Scoring(
    year_month_train,
    model_name_pref,
    local_path_save,
)

# Preprocess and predict on the data to score; the second argument is the
# path for the scored data.
scoring.preprocess_and_predict(
    input_data_path_to_score,
    scored_data_path,
)

## Monitoring
Evaluate the performance of the previous month's model (the archived one) by comparing its predictions with the now-available observed target

In [None]:
# On the first passage to production there is nothing to monitor yet.
is_first_release = year_month_train == '2022-01'

if not is_first_release:
    monitor = Monitoring(
        input_data_path_to_score,
        scored_data_path,
        model_name_pref,
        year_month_monitor,
        local_path_save,
    )
    monitor.monitor()