In [55]:
%reload_ext autoreload
%autoreload 2

import sys
from pathlib import Path

import mlflow
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import pendulum
import plotly.express as px
import ppscore as pps
import seaborn as sns
from loguru import logger
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.metrics import (max_error,
                             mean_absolute_error,
                             mean_absolute_percentage_error,
                             r2_score,
                             root_mean_squared_error,
                            )
from sklearn.model_selection import train_test_split, learning_curve, LearningCurveDisplay
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from ydata_profiling import ProfileReport
from yellowbrick.regressor import PredictionError, ResidualsPlot

sys.path.append(str(Path.cwd().parent))
from settings.params import MODEL_PARAMS, SEED
from src.make_dataset import load_data

In [56]:
# Name of the target column, read from the project settings (settings.params).
TARGET_NAME = MODEL_PARAMS["TARGET_NAME"]
# Feature column names; used below to select the model inputs from the dataset.
FEATURES = MODEL_PARAMS["DEFAULT_FEATURE_NAMES"]
# Echo the target name as the cell output.
TARGET_NAME

'revenue'

In [57]:
def load_preprocessed_data(data_path:str):
    """Read the preprocessed CSV and split it into train and test sets.

    The feature matrix is restricted to the columns listed in ``FEATURES`` and
    the target is the ``TARGET_NAME`` column. The test fraction comes from
    ``MODEL_PARAMS["TEST_SIZE"]`` and the split is seeded with ``SEED``.

    Args:
        data_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)`` produced by
        ``train_test_split``.
    """
    frame = pd.read_csv(data_path)
    features = frame.loc[:, FEATURES]
    target = frame[TARGET_NAME]

    x_train, x_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=MODEL_PARAMS["TEST_SIZE"],
        random_state=SEED,
    )

    # Log the shape of every split so each call documents the data it used.
    logger.info(f"\nX train: {x_train.shape}\nY train: {y_train.shape}\n"
                f"X test: {x_test.shape}\nY test: {y_test.shape}")
    return x_train, x_test, y_train, y_test

In [58]:
# Load the preprocessed dataset from the configured path and keep the
# train/test split as notebook-level variables.
x_train, x_test, y_train, y_test = load_preprocessed_data(
    MODEL_PARAMS["DATA_PREPROCESSED_PATH"]
)

[32m2024-08-07 15:23:56.353[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_preprocessed_data[0m:[36m8[0m - [1m
X train: (109, 44)
Y train: (109,)
X test: (28, 44)
Y test: (28,)[0m


In [59]:
from typing import Union, Dict, Any


def eval_metrics(y_actual: Union[pd.DataFrame, pd.Series, np.ndarray],
                 y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]
                 ) -> Dict[str, float]:
    """Compute regression evaluation metrics.

    Args:
        y_actual: Ground truth (correct) target values.
        y_pred: Estimated target values.

    Returns:
        Dict[str, float]: dictionary of evaluation metrics with the keys
            "rmse", "mae", "mape", "r2" and "max_error".
            "mape" is the value returned by scikit-learn's
            ``mean_absolute_percentage_error``, i.e. a fraction
            (0.25 means 25 %), not a value multiplied by 100.

    """
    return {
        # Root mean squared error.
        "rmse": root_mean_squared_error(y_actual, y_pred),
        # Mean absolute error.
        "mae": mean_absolute_error(y_actual, y_pred),
        # Mean absolute percentage error; this key was documented but
        # previously missing from the returned dictionary.
        "mape": mean_absolute_percentage_error(y_actual, y_pred),
        # R-squared: coefficient of determination.
        "r2": r2_score(y_actual, y_pred),
        # Largest absolute difference between y_actual and y_pred.
        "max_error": max_error(y_actual, y_pred),
    }

In [60]:
def define_pipeline(numerical_transformer: list,
                    categorical_transformer: list,
                    estimator: Pipeline,
                    target_transformer: bool=False,
                    **kwargs: dict) -> Pipeline:
    """Assemble the preprocessing + estimator pipeline used for modeling.

    Args:
        numerical_transformer: Ordered transformer instances, chained with
            ``make_pipeline`` and applied to the columns selected by
            ``dtype_include=["number"]``.
        categorical_transformer: Ordered transformer instances, chained with
            ``make_pipeline`` and applied to the columns selected by
            ``dtype_include=["object", "bool"]``.
        estimator: Final estimator placed after the preprocessing step.
        target_transformer: When truthy, the pipeline is wrapped in a
            ``TransformedTargetRegressor`` using ``np.log`` as ``func`` and
            ``np.exp`` as ``inverse_func``.
        kwargs: Accepted but not used by this function.

    Returns:
        The ``Pipeline`` (steps "preprocessor" and "estimator"), or the
        ``TransformedTargetRegressor`` wrapping it when ``target_transformer``
        is truthy.
    """
    column_branches = [
        ("num",
         make_pipeline(*numerical_transformer),
         make_column_selector(dtype_include=["number"])),
        ("cat",
         make_pipeline(*categorical_transformer),
         make_column_selector(dtype_include=["object", "bool"])),
    ]
    preprocessor = ColumnTransformer(
        transformers=column_branches,
        # Columns matched by neither selector are discarded.
        remainder="drop",
        # Keep original feature names, without a "num"/"cat" prefix.
        verbose_feature_names_out=False,
    )

    # Preprocessing followed by the estimator: the full prediction pipeline.
    prediction_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                          ("estimator", estimator)])
    if not target_transformer:
        return prediction_pipeline

    return TransformedTargetRegressor(regressor=prediction_pipeline,
                                      func=np.log,
                                      inverse_func=np.exp)

In [64]:
# Point the MLflow client at the tracking server URI used by this notebook.
mlflow.set_tracking_uri(uri="http://localhost:5000")
# Set the tracking experiment (here the experiment name is "restaurant_revenue_prediction")
mlflow.set_experiment("restaurant_revenue_prediction")

2024/08/07 15:57:02 INFO mlflow.tracking.fluent: Experiment with name 'restaurant_revenue_prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/334374102736216289', creation_time=1723046222304, experiment_id='334374102736216289', last_update_time=1723046222304, lifecycle_stage='active', name='restaurant_revenue_prediction', tags={}>

In [65]:
def train(data_path: str, run_name: str, estimator: type, params: dict,
          input_example_rows: int = 5) -> tuple:
    """Train a model pipeline and record the run with MLflow.

    Inside a single MLflow run this loads and splits the data, builds the
    preprocessing + estimator pipeline, fits it on the training split,
    evaluates it on the test split, and logs the metrics, the
    hyperparameters, a descriptive tag and the fitted model.

    Parameters:
        data_path (str): Path of the preprocessed CSV, forwarded to
            ``load_preprocessed_data``.
        run_name (str): Name of the MLflow run; also used as the artifact
            path of the logged model.
        estimator: Estimator class (not an instance); it is instantiated as
            ``estimator(**params)``.
        params (dict): Hyperparameters used to instantiate the estimator and
            logged as the run parameters.
        input_example_rows (int): Number of leading training rows stored as
            the model's input example. Defaults to 5.
    Returns:
        tuple: Fitted model pipeline and the dictionary of evaluation metrics.

    """
    with mlflow.start_run(run_name=run_name):

        x_train, x_test, y_train, y_test = load_preprocessed_data(data_path)
        # Instantiate the model with specified hyperparameters
        model_instance = define_pipeline(
            numerical_transformer=[SimpleImputer(strategy="median"),
                                   RobustScaler()],
            categorical_transformer=[SimpleImputer(strategy="constant", fill_value="undefined"),
                                     OneHotEncoder(drop="if_binary", handle_unknown="ignore")],
            target_transformer=False,
            estimator=estimator(**params),
        )
        model_instance.fit(x_train, y_train)
        predictions = model_instance.predict(x_test)

        # Evaluate the model
        metric_eval = eval_metrics(y_test, predictions)

        # Log evaluation metrics
        mlflow.log_metrics(metric_eval)

        # Log hyperparameters
        mlflow.log_params(params)

        # Set a tag to describe the training
        mlflow.set_tag("Training Info", "Basic  model for revenue prediction")

        # Log the trained model. The signature is inferred from the training
        # inputs and the model's predictions on them.
        signature = infer_signature(x_train, model_instance.predict(x_train))
        # Store only a few rows as the input example: passing the full
        # training frame would save the whole training set with the model.
        input_example = x_train.head(input_example_rows)
        mlflow.sklearn.log_model(
            sk_model=model_instance,
            artifact_path=run_name,
            signature=signature,
            input_example=input_example,
        )

        return model_instance, metric_eval

In [75]:

# Single random-forest run with hand-picked hyperparameters.
train(
    data_path=MODEL_PARAMS["DATA_PREPROCESSED_PATH"],
    run_name="randomForestregressor",
    estimator=RandomForestRegressor,
    params={
        "n_estimators": 300,
        "max_depth": 40,
        "max_features": 10,
        "random_state": 449,
        "n_jobs": -1,
    },
)

[32m2024-08-07 16:22:04.703[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_preprocessed_data[0m:[36m8[0m - [1m
X train: (109, 44)
Y train: (109,)
X test: (28, 44)
Y test: (28,)[0m


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('simpleimputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('robustscaler',
                                                                    RobustScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7f7e2b99bbb0>),
                                                  ('cat',
                                                   Pipeline(steps=[('simpleimputer',
                                                                    SimpleImputer(fill_value='undefined',
                                                                                  strategy='constant')),
                                                          

In [67]:
# Models to benchmark, keyed by estimator class name. Each entry holds the
# estimator class and the hyperparameters it is instantiated with.
ESTIMATOR_PARAMS = {
    estimator_cls.__name__: {"estimator": estimator_cls, "params": hyperparams}
    for estimator_cls, hyperparams in (
        (ElasticNet, {"alpha": 1.0, "l1_ratio": 0.5}),
        (RandomForestRegressor, {"n_estimators": 30,
                                 "max_depth": 3,
                                 "random_state": SEED}),
        (GradientBoostingRegressor, {"n_estimators": 30,
                                     "learning_rate": 0.01,
                                     "max_depth": 3,
                                     "random_state": SEED}),
    )
}

ESTIMATOR_PARAMS

{'ElasticNet': {'estimator': sklearn.linear_model._coordinate_descent.ElasticNet,
  'params': {'alpha': 1.0, 'l1_ratio': 0.5}},
 'RandomForestRegressor': {'estimator': sklearn.ensemble._forest.RandomForestRegressor,
  'params': {'n_estimators': 30, 'max_depth': 3, 'random_state': 50}},
 'GradientBoostingRegressor': {'estimator': sklearn.ensemble._gb.GradientBoostingRegressor,
  'params': {'n_estimators': 30,
   'learning_rate': 0.01,
   'max_depth': 3,
   'random_state': 50}}}

In [68]:
# Train several models: one MLflow run per entry of ESTIMATOR_PARAMS,
# with the run named after the estimator class.
for model_name, model_configs in ESTIMATOR_PARAMS.items():
    estimator, params = model_configs["estimator"], model_configs["params"]
    train(
        data_path=MODEL_PARAMS["DATA_PREPROCESSED_PATH"],
        run_name=model_name,
        estimator=estimator,
        params=params,
    )

[32m2024-08-07 15:57:49.992[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_preprocessed_data[0m:[36m8[0m - [1m
X train: (109, 44)
Y train: (109,)
X test: (28, 44)
Y test: (28,)[0m
[32m2024-08-07 15:57:52.937[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_preprocessed_data[0m:[36m8[0m - [1m
X train: (109, 44)
Y train: (109,)
X test: (28, 44)
Y test: (28,)[0m
[32m2024-08-07 15:57:55.617[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_preprocessed_data[0m:[36m8[0m - [1m
X train: (109, 44)
Y train: (109,)
X test: (28, 44)
Y test: (28,)[0m
