In [5]:
%reload_ext autoreload
%autoreload 2

import sys
from pathlib import Path

import mlflow
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import pendulum
import plotly.express as px
import ppscore as pps
import seaborn as sns
from loguru import logger
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.datasets import fetch_openml
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import (r2_score,
                             root_mean_squared_error,
                             mean_absolute_percentage_error,
                             mean_absolute_error,
                             max_error,
                            )
from sklearn.model_selection import train_test_split, learning_curve, LearningCurveDisplay
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from ydata_profiling import ProfileReport
from yellowbrick.regressor import PredictionError, ResidualsPlot

sys.path.append(str(Path.cwd().parent))
from settings.params import MODEL_PARAMS, SEED
from src.make_dataset import load_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training set from the path configured under MODEL_PARAMS["TRAIN_PATH"].
data = load_data(MODEL_PARAMS["TRAIN_PATH"])

[32m2024-08-04 23:05:54.215[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m17[0m - [1mDataset lo load from : ../data/train.csv[0m
[32m2024-08-04 23:05:54.225[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m23[0m - [1mData shape: (137, 43)[0m
[32m2024-08-04 23:05:54.360[0m | [1mINFO    [0m | [36msrc.make_dataset[0m:[36mload_data[0m:[36m24[0m - [1mData description:                Id          P1          P2          P3          P4          P5  \
count  137.000000  137.000000  137.000000  137.000000  137.000000  137.000000   
mean    68.000000    4.014599    4.408759    4.317518    4.372263    2.007299   
std     39.692569    2.910391    1.514900    1.032337    1.016462    1.209620   
min      0.000000    1.000000    1.000000    0.000000    3.000000    1.000000   
25%     34.000000    2.000000    4.000000    4.000000    4.000000    1.000000   
50%     68.000000    3.000000    5.000000    4.000000    4.000000    2.000

In [3]:
from typing import Union, Dict, Any


def eval_metrics(y_actual: Union[pd.DataFrame, pd.Series, np.ndarray],
                 y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]
                 ) -> Dict[str, float]:
    """Score predictions against the ground truth with several regression metrics.

    Args:
        y_actual: Ground truth (correct) target values.
        y_pred: Estimated target values, aligned with ``y_actual``.

    Returns:
        Dict[str, float]: one entry per metric, keyed (in this order) by
            "rmse", "mae", "mape", "r2" and "max_error".

    """
    # Metric name -> scoring function. Insertion order fixes both the order in
    # which the metrics are computed and the key order of the returned dict.
    scorers = {
        "rmse": root_mean_squared_error,         # root mean squared error
        "mae": mean_absolute_error,              # mean absolute error
        "mape": mean_absolute_percentage_error,  # mean absolute percentage error
        "r2": r2_score,                          # coefficient of determination
        "max_error": max_error,                  # largest absolute residual
    }
    return {label: scorer(y_actual, y_pred) for label, scorer in scorers.items()}

In [6]:
def define_pipeline(numerical_transformer: list,
                    categorical_transformer: list,
                    estimator: Pipeline,
                    target_transformer: bool=False,
                    **kwargs: dict) -> Union[Pipeline, TransformedTargetRegressor]:
    """Define pipeline for modeling.

    Columns are routed by dtype: numeric columns go through the numerical
    steps, ``object``/``bool`` columns go through the categorical steps, and
    every other column is dropped. The estimator is appended as the final step.

    Args:
        numerical_transformer: Ordered transformers applied to numeric columns.
            An empty list (or None) leaves those columns untouched.
        categorical_transformer: Ordered transformers applied to ``object`` and
            ``bool`` columns. An empty list (or None) leaves those columns untouched.
        estimator: Final step of the pipeline. It is only used as the last
            ``Pipeline`` step, so it does not have to be a ``Pipeline`` itself.
        target_transformer: When True, wrap the pipeline in a
            ``TransformedTargetRegressor`` that fits on ``log(y)`` and maps
            predictions back with ``exp``; the target must then be strictly
            positive for the log to be finite.
        kwargs: Accepted for backward compatibility; currently unused.

    Returns:
        Pipeline: the preprocessing + estimator pipeline, or that pipeline
            wrapped in a ``TransformedTargetRegressor`` when
            ``target_transformer`` is True.
    """
    # make_pipeline() with no steps builds an empty Pipeline that only fails
    # later, at fit time. "passthrough" is ColumnTransformer's explicit way of
    # saying "keep these columns unchanged".
    numerical_steps = (make_pipeline(*numerical_transformer)
                       if numerical_transformer else "passthrough")
    categorical_steps = (make_pipeline(*categorical_transformer)
                         if categorical_transformer else "passthrough")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_steps, make_column_selector(dtype_include=["number"])),
            ("cat", categorical_steps, make_column_selector(dtype_include=["object", "bool"])),
        ],
        remainder="drop",  # non-specified columns are dropped
        verbose_feature_names_out=False,  # will not prefix any feature names with the name of the transformer
    )

    # Append regressor to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    model_pipe = Pipeline(steps=[("preprocessor", preprocessor),
                                 ("estimator", estimator)])

    if target_transformer:
        # Train on the log of the target; predictions are mapped back with exp.
        model_pipe = TransformedTargetRegressor(regressor=model_pipe,
                                                func=np.log,
                                                inverse_func=np.exp)

    return model_pipe

## Train / Test split

In [8]:
# Split the data into train and test sets. The original note says 25% for test;
# the actual fraction is whatever MODEL_PARAMS["TEST_SIZE"] holds.

FEATURES = MODEL_PARAMS["DEFAULT_FEATURE_NAMES"]

# Fail early with an actionable message when configured features are absent
# from `data`. A previous run of this cell raised a bare KeyError for
# 'Day', 'Month', 'Years' and 'Day_Name'.
missing_features = [col for col in FEATURES if col not in data.columns]
if missing_features:
    raise KeyError(
        f"{missing_features} not in `data` columns: every name in "
        "MODEL_PARAMS['DEFAULT_FEATURE_NAMES'] must exist before the split "
        "(derived features have to be created first)."
    )

# NOTE(review): TARGET_NAME is not defined or imported in the cells shown here —
# confirm where it comes from so this cell survives Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(data.loc[:, FEATURES],
                                                    data[TARGET_NAME],
                                                    test_size=MODEL_PARAMS["TEST_SIZE"],
                                                    random_state=SEED
                                                   )

logger.info(f"\nX train: {x_train.shape}\nY train: {y_train.shape}\n"
            f"X test: {x_test.shape}\nY test: {y_test.shape}")

KeyError: "['Day', 'Month', 'Years', 'Day_Name'] not in index"