#### Model Training

### Librairies

In [72]:

# reload modules before executing user code.
#%reload_ext autoreload
#%autoreload 2

import sys
from pathlib import Path
import dill
import numpy as np
import optuna
import pandas as pd
import pendulum 
import mlflow
from mlflow.models import infer_signature
from loguru import logger
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (r2_score,
                             mean_squared_error,
                             mean_absolute_percentage_error,
                             max_error,
                             mean_absolute_error
                            )
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from typing import Dict

sys.path.append(str(Path.cwd().parent))
from settings.params import (DATA_DIR_INPUT,
                             DATA_DIR_OUTPUT,
                             MODEL_PARAMS,
                             REPORT_DIR,
                             TIMEZONE,
                            HOME_DIR, 
                            )

set_config(display="diagram", print_changed_only=False)
pd.set_option("display.max_columns", None)


In [71]:
EXECUTION_DATE = pendulum.now(tz=TIMEZONE)

logger.info(f"Execution date: {EXECUTION_DATE}")

[32m2023-08-22 23:56:06.893[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mExecution date: 2023-08-22T23:56:06.879319+00:00[0m


In [14]:
HOME_DIR

PosixPath('/Users/baldita/Desktop/Courses/DIC3/mlops/project/mlops-project-dic3')

### Entrainement des modeles 

- Recuperation des donnees et verification de conformite

A noter que les donnees que nous allons utiliser sont celles deja pretraitees dans le notebook d'exploration des donnees

In [15]:
train_data = pd.read_csv(str(DATA_DIR_OUTPUT) + '/train.csv')
train_data.describe()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P13,P14,P15,P16,P19,P20,P21,P22,P23,P24,P26,P27,P29,P30,P31,P32,P33,P35,P36,P37,Year,Month,Years Old,revenue
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,0.430657,1.430657,4.014599,4.408759,4.317518,4.372263,2.007299,3.357664,5.423358,5.153285,5.445255,5.489051,3.262774,5.080292,1.416058,1.386861,1.941606,4.905109,4.547445,2.270073,2.226277,3.423358,1.372263,1.470803,1.145985,3.135036,2.729927,1.941606,2.525547,1.138686,2.029197,2.211679,1.116788,2008.678832,7.058394,6.321168,4453533.0
std,0.496985,0.511567,2.910391,1.5149,1.032337,1.016462,1.20962,2.134235,2.296809,1.858567,1.834793,1.847561,1.910767,1.036527,2.729583,2.398677,3.505807,5.604467,3.708041,2.05263,1.23069,4.559609,2.304112,2.612024,2.067039,1.680887,5.536647,3.512093,5.230117,1.69854,3.436272,4.168211,1.790768,4.027359,3.590769,4.027359,2576072.0
min,0.0,0.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,1.0,4.0,4.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1996.0,1.0,1.0,1149870.0
25%,0.0,1.0,2.0,4.0,4.0,4.0,1.0,2.0,5.0,4.0,4.0,5.0,2.0,5.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2007.0,4.0,4.0,2999068.0
50%,0.0,1.0,3.0,5.0,4.0,4.0,2.0,3.0,5.0,5.0,5.0,5.0,3.0,5.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010.0,8.0,5.0,3939804.0
75%,1.0,2.0,4.0,5.0,5.0,5.0,2.0,4.0,5.0,5.0,5.0,5.0,4.0,5.0,2.0,2.0,3.0,5.0,5.0,3.0,3.0,5.0,2.0,2.5,2.0,3.0,4.0,3.0,3.0,2.0,4.0,3.0,2.0,2011.0,10.0,8.0,5166635.0
max,1.0,2.0,12.0,7.5,7.5,7.5,8.0,10.0,10.0,10.0,10.0,10.0,10.0,7.5,15.0,10.0,15.0,25.0,15.0,15.0,5.0,25.0,10.0,12.5,12.5,7.5,25.0,15.0,25.0,6.0,15.0,20.0,8.0,2014.0,12.0,19.0,19696940.0


In [16]:
test_data = pd.read_csv(str(DATA_DIR_OUTPUT) + '/test.csv')
test_data.head()

Unnamed: 0,City Group,Type,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P13,P14,P15,P16,P19,P20,P21,P22,P23,P24,P26,P27,P29,P30,P31,P32,P33,P35,P36,P37,Year,Month,Years Old
0,1,1,1,4.0,4.0,4.0,1,2,5,4,5,5,5,4.0,0,0,0,5,5,3,1,4,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2011,1,4
1,1,2,3,4.0,4.0,4.0,2,2,5,3,4,4,2,5.0,0,0,0,5,5,3,2,1,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2011,3,4
2,0,1,3,4.0,4.0,4.0,2,2,5,4,4,5,4,5.0,0,0,0,5,5,5,5,5,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2013,10,2
3,1,2,2,4.0,4.0,4.0,2,3,5,4,5,4,3,5.0,0,0,0,4,4,3,2,2,0,0.0,0.0,3.0,0,4,0,0,0,0,0,2013,5,2
4,1,1,2,4.0,4.0,4.0,1,2,5,4,5,4,3,4.0,0,0,0,1,5,3,1,1,0,0.0,0.0,3.0,0,0,0,0,0,0,0,2013,7,2


In [17]:
train_data['revenue'].describe()

count    1.370000e+02
mean     4.453533e+06
std      2.576072e+06
min      1.149870e+06
25%      2.999068e+06
50%      3.939804e+06
75%      5.166635e+06
max      1.969694e+07
Name: revenue, dtype: float64

### Split des donnees 
Les donnees d'entrainement sont splittees en en donnees de validation

In [18]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

( len(train_data), len(val_data))

(109, 28)

In [19]:
y_train = train_data['revenue']    
X_train = train_data.drop(['revenue'],axis=1)
y_val = val_data['revenue']
X_val = val_data.drop(['revenue'],axis=1)
X_test = test_data

### Definition des metriques

In [38]:
def eval_metrics(y_actual,y_pred) -> Dict[str, float]:
    # Root mean squared error
    rmse = mean_squared_error(y_actual, y_pred, squared=False)
    # mean absolute error
    mae = mean_absolute_error(y_actual, y_pred)
    # R-squared: coefficient of determination
    r2 = r2_score(y_actual, y_pred)
    # max error: maximum value of absolute error (y_actual - y_pred)
    maxerror = max_error(y_actual, y_pred)
    return {"rmse": rmse,
            "mae": mae,
            "r2": r2,
            "max_error": maxerror
           }

## Training 
Pour les modeles nous allons entrainer sur les donnees de train dans un premier temps. Ensuite nous alons appliquer une ACP, afin de reduire la dimension des donnees (qui est de 36) avant d'entrainer sur les donnees de train. Enfin nous allons comparer l'ensemble des modeles obtenus avec ou sans ACP et en choisir le meilleur modele

- ACP

Normalisation des variables

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# transform data
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)
X_val_scale = scaler.transform(X_val)


Application dde l'ACP apres normalisation 

In [21]:
from sklearn.decomposition import PCA
#https://www.geeksforgeeks.org/principal-component-analysis-with-python/

pca = PCA(n_components = 5)
X_train_PCA = pca.fit_transform(X_train) 
X_test_PCA = pca.transform(X_test) 

### Feature Engineering Pipeline

- Fonctions de prediction

In [39]:
"""def predict_model(model,X,y_act):
    y_pred = model.predict(X)
    rms = mean_squared_error(y_act, y_pred,squared=False)
    return rms"""

'def predict_model(model,X,y_act):\n    y_pred = model.predict(X)\n    rms = mean_squared_error(y_act, y_pred,squared=False)\n    return rms'

In [40]:
"""def testDataPred(model,X):
  y_test = model.predict(X)
  dataFrame = pd.DataFrame({'Id': y_test['Id'], 'Prediction': y_test}) 
  return dataFrame"""

"def testDataPred(model,X):\n  y_test = model.predict(X)\n  dataFrame = pd.DataFrame({'Id': y_test['Id'], 'Prediction': y_test}) \n  return dataFrame"

### Linear avec le Lasso Regression
Regresion lineaire avec la regularisation Lasso croisee

- Train set

In [56]:
from sklearn import linear_model
model_LR1 = linear_model.LassoCV(max_iter=10000,alphas=(0.0001,0.01,0.1,1),n_alphas=300,cv=5)
model_LR1.fit(X_train_scale,y_train)
y_train_LR_pred = model_LR1.predict(X_train_scale)
y_val_LR_pred = model_LR1.predict(X_val_scale)
train_metrics_lr1 = eval_metrics(y_train, y_train_LR_pred)
val_metrics_lr1 = eval_metrics(y_val, y_val_LR_pred)


logger.info(f"Train_LassoCV: {train_metrics_lr1}")
logger.info(f"Val_LassoCV: {val_metrics_lr1}")

[32m2023-08-22 23:28:00.940[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mTrain_LassoCV: {'rmse': 1813352.129708278, 'mae': 1368188.2988138413, 'r2': 0.35624066405779997, 'max_error': 7724057.245741125}[0m
[32m2023-08-22 23:28:00.941[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mVal_LassoCV: {'rmse': 3742495.06785661, 'mae': 2501914.3127362416, 'r2': -0.14487007109329308, 'max_error': 13078463.48696628}[0m


- Linear model avec l'analyse en composante princiapale

In [57]:
model_LR2 = linear_model.LassoCV(max_iter=100000,alphas=(0.0001,0.01,0.1,1),n_alphas=1000,cv=5)
model_LR2.fit(X_train_PCA,y_train)
y_train_LR2_pred = model_LR2.predict(X_train_PCA)
train_metrics_lr2 = eval_metrics(y_train, y_train_LR2_pred)

logger.info(f"Train_LassoCV: {train_metrics_lr2}")


[32m2023-08-22 23:28:04.260[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mTrain_LassoCV: {'rmse': 2166023.1916953875, 'mae': 1545647.4020882894, 'r2': 0.08148669505000306, 'max_error': 11346635.844886761}[0m


### Random Forest

- Avec les données normalisés

In [53]:
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

params = {
    "n_estimators": randint(10,1000),
    "max_depth": randint(1,10),
    "min_samples_split": uniform(0.1,0.8),
    'max_features':['sqrt', 'log2']
}

RF_model = RandomForestRegressor()
model_rf1 = RandomizedSearchCV(RF_model, params, cv=6, n_iter=100, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
model_rf1.fit(X_train_scale, y_train)
y_train_RF_pred = model_rf1.predict(X_train_scale)
y_val_RF_pred = model_rf1.predict(X_val_scale)
train_metrics_rf = eval_metrics(y_train, y_train_RF_pred)
val_metrics_rf = eval_metrics(y_val, y_val_RF_pred)


logger.info(f"Train_rf: {train_metrics_rf}")
logger.info(f"Val_rf: {val_metrics_rf}")

[32m2023-08-22 23:16:57.886[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mTrain_rf: {'rmse': 2088564.1955350847, 'mae': 1479142.8946937984, 'r2': 0.14600583352493202, 'max_error': 10633000.034060238}[0m
[32m2023-08-22 23:16:57.903[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mVal_rf: {'rmse': 3438601.7120857923, 'mae': 1948756.625689457, 'r2': 0.033509745739770946, 'max_error': 15013282.852325318}[0m


#### Random Forest Avec les données transformés avec le PCA

In [54]:
model_rf2 = RandomizedSearchCV(RF_model, params, cv=2, n_iter=100, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
model_rf2.fit(X_train_PCA, y_train)
y_train_rf2_pred = model_rf2.predict(X_train_PCA)
train_metrics_rf_acp = eval_metrics(y_train, y_train_rf2_pred)

logger.info(f"Train_rf2: {train_metrics_rf_acp}")

[32m2023-08-22 23:25:28.811[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mTrain_rf2: {'rmse': 2084789.24616046, 'mae': 1450898.6727801345, 'r2': 0.14909012593666315, 'max_error': 11447106.533460714}[0m


### XGBOOST

In [None]:
!pip3 install xgboost

#### XGBOOST avec les données normalisés : data train

In [55]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import xgboost as xgb
params = {
    "learning_rate": uniform(0.001,1),
    "n_estimators": randint(100,1000),
    "max_depth": randint(1,10),     
    "colsample_bytree": uniform(0.1,0.8),
    "reg_alpha": [0.0001,0.001,0.01,0.1,1,10],
    "reg_lambda": [0.0001,0.001,0.01,0.1,1,10]
}
xgb_model = xgb.XGBRegressor()
rand_xgb = RandomizedSearchCV(xgb_model, params, cv=2, n_iter=100, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
rand_xgb.fit(X_train_scale, y_train)
y_train_xgb_pred = rand_xgb.predict(X_train_scale)
y_val_xgb_pred = rand_xgb.predict(X_val_scale)
train_metrics_xgb = eval_metrics(y_train, y_train_xgb_pred)
val_metrics_xgb = eval_metrics(y_val, y_val_xgb_pred)


logger.info(f"Train_xgb: {train_metrics_xgb}")
logger.info(f"Val_xgb: {val_metrics_xgb}")

[32m2023-08-22 23:26:24.092[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mTrain_xgb: {'rmse': 341678.0694420003, 'mae': 184133.02981651376, 'r2': 0.9771443563571226, 'max_error': 1317315.0}[0m
[32m2023-08-22 23:26:24.102[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mVal_xgb: {'rmse': 3293562.8853327916, 'mae': 2011588.919642857, 'r2': 0.11332257985096394, 'max_error': 14553092.0}[0m


### Comparaison des modeles

Etant donne que notre objectif est de faire des predictions assez precises, nous allons choisir le RMSE et MAE comme metriques principales

In [58]:
from tabulate import tabulate 

In [63]:
dict = {
        'Models'     : ['Lasso Regression - Standard scaling','Lasso Regression - PCA','Random Forest - RandomSearchCV','Random Forest - PCA-RandomSearchCV','XGBOOST - Standard scaling - RandomSearchCV'],
        'Train RMSE' : [train_metrics_lr1["rmse"],train_metrics_lr2["rmse"],train_metrics_rf["rmse"],train_metrics_rf_acp["rmse"],train_metrics_xgb["rmse"]],
        'VAL RMSE'  : [val_metrics_lr1["rmse"],"--",val_metrics_rf["rmse"],"--",val_metrics_xgb["rmse"]]
}
df = pd.DataFrame(dict)
print(tabulate(df, headers = 'keys', tablefmt = 'psql')) 

+----+---------------------------------------------+------------------+--------------------+
|    | Models                                      |       Train RMSE | VAL RMSE           |
|----+---------------------------------------------+------------------+--------------------|
|  0 | Lasso Regression - Standard scaling         |      1.81335e+06 | 3742495.06785661   |
|  1 | Lasso Regression - PCA                      |      2.16602e+06 | --                 |
|  2 | Random Forest - RandomSearchCV              |      2.08856e+06 | 3438601.7120857923 |
|  3 | Random Forest - PCA-RandomSearchCV          |      2.08479e+06 | --                 |
|  4 | XGBOOST - Standard scaling - RandomSearchCV | 341678           | 3293562.8853327916 |
+----+---------------------------------------------+------------------+--------------------+


In [64]:
dict = {
        'Models'     : ['Lasso Regression - Standard scaling','Lasso Regression - PCA','Random Forest - RandomSearchCV','Random Forest - PCA-RandomSearchCV','XGBOOST - Standard scaling - RandomSearchCV'],
        'Train MAE' : [train_metrics_lr1["mae"],train_metrics_lr2["mae"],train_metrics_rf["mae"],train_metrics_rf_acp["mae"],train_metrics_xgb["mae"]],
        'VAL MAE'  : [val_metrics_lr1["mae"],"--",val_metrics_rf["mae"],"--",val_metrics_xgb["mae"]]
}
df = pd.DataFrame(dict)
print(tabulate(df, headers = 'keys', tablefmt = 'psql')) 

+----+---------------------------------------------+------------------+--------------------+
|    | Models                                      |        Train MAE | VAL MAE            |
|----+---------------------------------------------+------------------+--------------------|
|  0 | Lasso Regression - Standard scaling         |      1.36819e+06 | 2501914.3127362416 |
|  1 | Lasso Regression - PCA                      |      1.54565e+06 | --                 |
|  2 | Random Forest - RandomSearchCV              |      1.47914e+06 | 1948756.625689457  |
|  3 | Random Forest - PCA-RandomSearchCV          |      1.4509e+06  | --                 |
|  4 | XGBOOST - Standard scaling - RandomSearchCV | 184133           | 2011588.919642857  |
+----+---------------------------------------------+------------------+--------------------+


En se basant sur le score des **Root Mean Squared Error** et du **Mean Absolute Error**, nous pourrons dire que le meilleur modele est le XGBoost pour prédire nos données

### Engeristrement du model

In [65]:
import pickle
pickle_out = open(str(HOME_DIR) + "/models/bestmodel.pkl","wb")
pickle.dump(rand_xgb,pickle_out)
pickle_out.close()

### Tracking avec MlFlow

In [83]:
experiment_id = mlflow.create_experiment("restaurant_revenue")

In [85]:
# Useful for multiple runs (only doing one run in this sample notebook)
with mlflow.start_run(run_name=f"{EXECUTION_DATE.strftime('%Y%m%d_%H%m%S')}-restaurant_revenue",
                      experiment_id=experiment_id,
                      tags={"version": "v1", "priority": "P1"},
                      description="restaurant revenue modeling",) as mlf_run:
    print(f"run_id: {mlf_run.info.run_id}")
    print(f"version tag value: {mlf_run.data.tags.get('version')}")
    print("--")

    # Select number of estimator
    iterations = int(input("Iteration(s): "))
    mlflow.log_param("n_iters", iterations)
    # Model definition
    params = {
    "learning_rate": uniform(0.001,1),
    "n_estimators": randint(100,1000),
    "max_depth": randint(1,10),     
    "colsample_bytree": uniform(0.1,0.8),
    "reg_alpha": [0.0001,0.001,0.01,0.1,1,10],
    "reg_lambda": [0.0001,0.001,0.01,0.1,1,10]
    }
    xgb_model = xgb.XGBRegressor()
    rand_xgb = RandomizedSearchCV(xgb_model, params, cv=2, n_iter=iterations, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
    rand_xgb.fit(X_train_scale, y_train)

    # Evaluate Metrics
    y_train_xgb_pred = rand_xgb.predict(X_train_scale)
    y_val_xgb_pred = rand_xgb.predict(X_val_scale)
    train_metrics = eval_metrics(y_train, y_train_xgb_pred)
    val_metrics = eval_metrics(y_val, y_val_xgb_pred)

    # log out metrics
    logger.info(f"Train: {train_metrics}")
    logger.info(f"Val: {val_metrics}")
    
    # Infer model signature
    predictions = rand_xgb.predict(X_train_scale)
    signature = infer_signature(X_train_scale, predictions)

    # Log parameter, metrics, and model to MLflow
    for group_name, set_metrics in [("train", train_metrics),
                                    ("test", val_metrics),
                                   ]:
        for metric_name, metric_value in set_metrics.items():
            mlflow.log_metric(f"{group_name}_{metric_name}", metric_value)
    # mlflow.sklearn.log_model(reg, "model", signature=signature)

run_id: 1608d9f036c94ef989da3c1bb3e19ebd
version tag value: v1
--


[32m2023-08-23 00:32:23.797[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1mTrain: {'rmse': 2186358.2026584535, 'mae': 1352751.0871559633, 'r2': 0.06415940669211007, 'max_error': 12288435.5}[0m
[32m2023-08-23 00:32:23.833[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mVal: {'rmse': 3544197.505142372, 'mae': 1925417.3214285714, 'r2': -0.0267614627785957, 'max_error': 15762912.25}[0m


In [82]:
!mlflow ui --host "0.0.0.0"  --port 5001

[2023-08-23 00:23:11 +0000] [95425] [INFO] Starting gunicorn 20.1.0
[2023-08-23 00:23:11 +0000] [95425] [INFO] Listening at: http://0.0.0.0:5001 (95425)
[2023-08-23 00:23:11 +0000] [95425] [INFO] Using worker: sync
[2023-08-23 00:23:11 +0000] [95426] [INFO] Booting worker with pid: 95426
[2023-08-23 00:23:11 +0000] [95427] [INFO] Booting worker with pid: 95427
[2023-08-23 00:23:11 +0000] [95428] [INFO] Booting worker with pid: 95428
[2023-08-23 00:23:11 +0000] [95429] [INFO] Booting worker with pid: 95429
^C
[2023-08-23 00:30:21 +0000] [95425] [INFO] Handling signal: int
