# Corporacion Favorita - New Superb Forecasting Model - 

## Machine Learning Model Pipeline

Made by 4B Consultancy (Janne Heuvelmans, Georgi Duev, Alexander Engelage, Sebastiaan de Bruin) - 2024

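In this data pipeline, the final dataset from the Data Preparation Pipeline notebook is loaded, aggregated to weekly level, and used to train, compare, and tune several forecasting models.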

The following steps are performed in this notebook:  

>-0. Import Packages 

>-1. Load final dataset and aggregate dataset to weekly level
    - 1.1 Load final dataset made in Data Preparation Pipeline Notebook
    - 1.2 Aggregate dataset to weekly level

>-2. Column transformers and Train, Test, Validation Split

>-3. Models

>-4. Pick the best model and optimize it with grid search

# 0. Import Packages

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import statsmodels.api as sm

In [None]:
from sklearn.model_selection import train_test_split

# 1. Load final dataset and aggregate dataset to weekly level

## 1.1. Load final dataset

In [None]:
def f_get_data(i=0):
    """Read one of the project's parquet files and return it as a DataFrame.

    ``i`` selects the file by position: 0 = "history-per-year",
    1 = "holidays_events", 2 = "items", 3 = "stores".

    After reading, the columns are renamed through
    ``standardize_column_names`` and the frame is piped through
    ``optimize_memory`` and ``transform_date_to_datetime`` (the latter also
    receives ``i``). Those helpers are not defined in this cell.
    """
    # NOTE(review): machine-specific absolute path — adjust per environment.
    data_dir = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/processed/"

    # File stems, addressed by position.
    file_stems = ("history-per-year", "holidays_events", "items", "stores")

    print(f"\nReading file {i}\n")

    # Read first, then apply the clean-up steps one at a time.
    raw = pd.read_parquet(data_dir + file_stems[i] + ".parquet")
    renamed = raw.rename(columns=standardize_column_names)
    slimmed = renamed.pipe(optimize_memory)
    return slimmed.pipe(transform_date_to_datetime, i)

### 1.3. Importing raw data
Importing parquet files with importing function (giving basic information)

In [None]:
# Sales history per year: file index 0 maps to "history-per-year" in f_get_data.
df_sales = f_get_data(0)

## 1.2. Aggregate dataset to weekly level

- Group the DataFrame by store number, item number, year, and week number, then sum the following columns per group:
  "unit_sales", "onpromotion", "holiday_local_count", "holiday_regional_count", "holiday_national_count".


In [None]:
def df_sales_agg_week(df):
    """Collapse sales records into one row per store, item, year and week.

    The rows are ordered by the four grouping columns, grouped on them, and
    the sales, promotion and holiday-count columns are summed per group.
    The grouping columns come back as ordinary columns, not as an index.
    """
    group_keys = ["store_nbr", "item_nbr", "year", "week_number"]
    summed_columns = (
        "unit_sales",
        "onpromotion",
        "holiday_local_count",
        "holiday_regional_count",
        "holiday_national_count",
    )

    # Order the rows up front so the groupby can skip its own sort.
    # TODO: measure whether pre-sorting is actually faster.
    ordered = df.sort_values(group_keys)
    grouped = ordered.groupby(group_keys, sort=False)

    weekly_totals = grouped.agg(dict.fromkeys(summed_columns, "sum"))
    return weekly_totals.reset_index()

In [None]:
# Inspect the available columns.
# NOTE(review): `df` is not assigned in any cell shown above (only `df_sales`
# is) — confirm where it is created before running this cell.
df.columns

# 2. Column transformers and Train, Test, Validation Split

## 2.1. Column transformers

In [None]:
# Columns used as model inputs.
# NOTE(review): the weekly aggregation function groups on "week_number",
# while this list uses "week_nbr" — confirm the column name in `df`.
features = [
    "store_nbr",
    "item_nbr",
    "date",  # open question: drop the date column?
    "onpromotion",
    "holiday_local_count",
    "holiday_national_count",
    "holiday_regional_count",
    "store_type",
    "store_cluster",
    "item_family",
    "item_class",
    "perishable",
    "store_status",
    "item_status",
    "year",
    "weekday",
    "week_nbr",
    "week_number_cum",
]

# Column to predict; kept as a list, so df[target_variable] is a DataFrame.
target_variable = ["unit_sales"]

# Feature matrix and target selected from `df`.
X = df[features]
y = df[target_variable]

In [None]:
# Build a ColumnTransformer with two transformers: one-hot encoding for the
# object-dtype columns and standard scaling for every other column.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

# NOTE(review): the preprocessor is fitted on every row of X, i.e. before any
# train/test/validation split — confirm this is intended.
# NOTE(review): X is overwritten with the transformed output here, while
# train_test_val_split selects df[features] itself — confirm which one the
# models should be trained on.
X = preprocessor.fit_transform(X)

## 2.2 Train, Test, Validation Split

In [None]:
def train_test_val_split(df, train_end, test_end):
    """Split ``df`` chronologically into train, test and validation subsets.

    The frame is sorted by its index and cut at two boundaries using
    half-open intervals, so every row lands in exactly one subset:

    * train:      index <  ``train_end``
    * test:       ``train_end`` <= index < ``test_end``
    * validation: index >= ``test_end``

    Label slicing (``df[:train_end]`` followed by ``df[train_end:test_end]``)
    is avoided on purpose: label slices include both end points, so rows
    sitting exactly on a boundary would appear in two subsets.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Its index must be comparable with the boundaries.
        NOTE(review): presumably a date index, since the printed summary
        labels the index values as dates — confirm.
    train_end, test_end
        Boundary values, e.g. ``"2016-06-01"`` and ``"2017-01-01"``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``. The columns are
        taken from the module-level ``features`` and ``target_variable``.
    """
    # Ensure the df is sorted by its index.
    df = df.sort_index()
    index = df.index

    # Half-open masks guarantee that the three subsets are disjoint.
    in_train = index < train_end
    in_test = (index >= train_end) & (index < test_end)
    in_val = index >= test_end

    train, test, val = df[in_train], df[in_test], df[in_val]

    # Create X and y for each subset.
    X_train, y_train = train[features], train[target_variable]
    X_test, y_test = test[features], test[target_variable]
    X_val, y_val = val[features], val[target_variable]

    # Print shape and index range of every subset.
    summary = (
        ("Train set:", "train", "Training", X_train, y_train),
        ("\nTest set:", "test", "Test", X_test, y_test),
        ("\nValidation set:", "val", "Validation", X_val, y_val),
    )
    for title, tag, date_label, X_part, y_part in summary:
        print(title)
        print(f"X_{tag} shape: {X_part.shape}")
        print(f"y_{tag} shape: {y_part.shape}")
        print(f"{date_label} Min Date: {X_part.index.min()}")
        print(f"{date_label} Max Date: {X_part.index.max()}")

    return X_train, y_train, X_test, y_test, X_val, y_val

To-do: do we split based on dates or based on weeks since start?

In [None]:
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(
    df, train_end="2016-06-01", test_end="2017-01-01"
)

# 3. Models

## 3.1. Models list to compare in model

In [None]:
from sktime.forecasting.naive import NaiveForecaster
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Candidate models keyed by display name; every model uses its default
# settings except CatBoost, which is silenced with verbose=False.
# NOTE(review): NaiveForecaster comes from sktime, not scikit-learn — confirm
# it supports the fit(X, y) / predict(X) calls made by evaluate_models.
# NOTE(review): some keys contain tab characters instead of spaces; they will
# show up in the "Model" column of the results — confirm this is unintended.
models = {
    "Naive Forecast": NaiveForecaster(),  # Naive as baseline performance?
    "Linear	Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbor Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Random	Forest	Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost	Regressor": AdaBoostRegressor(),
}

## 3.2. Evaluation Metrics and Evaluate Model functions

In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def evaluate_model(y_true, y_pred):
    """Return ``(MAE, RMSE, R2)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is the square root of scikit-learn's mean squared error.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )

In [None]:
def evaluate_models(models, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit every model on the training split and score it on all three splits.

    ``models`` maps a display name to an estimator with ``fit`` and
    ``predict``. Each estimator is fitted once on the training data and then
    scored with ``evaluate_model`` on the train, validation and test data.

    Returns a DataFrame with one row per model, holding RMSE, MAE and R2 for
    each split, ordered by ascending "Test RMSE".
    """
    splits = (
        ("Train", X_train, y_train),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    )

    # Fixed column order: model name, then RMSE / MAE / R2 per split.
    column_order = ["Model"]
    for split_name, _, _ in splits:
        column_order += [f"{split_name} {metric}" for metric in ("RMSE", "MAE", "R2")]

    rows = []
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)

        row = {"Model": label}
        for split_name, X_part, y_part in splits:
            mae, rmse, r2 = evaluate_model(y_part, estimator.predict(X_part))
            row[f"{split_name} RMSE"] = rmse
            row[f"{split_name} MAE"] = mae
            row[f"{split_name} R2"] = r2
        rows.append(row)

    # Best test RMSE first.
    return pd.DataFrame(rows, columns=column_order).sort_values("Test RMSE")

To-do: how do we make the test forecast per week, i.e. 26 iterations of one week at a time?

We now have a good function. However, the test (26 weeks) and validation (~26 weeks) predictions should not be made for all weeks at once, but iteratively, week by week: this notebook forecasts store sales, and in reality next week's sales are not known in advance. So we think we want to loop over the test and validation sets week by week, but we do not want to add the validation or test set to the training data, to prevent data drift.

In [None]:
import numpy as np
import pandas as pd


def _walk_forward_predictions(model, X_seen, y_seen, X_new, y_new, n_steps):
    """Predict ``X_new`` row by row, refitting on all earlier rows after each step."""
    step_predictions = []
    for step in range(n_steps):
        # Predict the next row with the model as fitted so far.
        step_predictions.append(
            np.ravel(model.predict(X_new.iloc[step : step + 1]))
        )
        # The true value of that row is now "known": refit on the history
        # plus rows 0..step. X and y are always extended together so their
        # lengths stay equal.
        model.fit(
            pd.concat([X_seen, X_new.iloc[: step + 1]]),
            pd.concat([y_seen, y_new.iloc[: step + 1]]),
        )
    return np.concatenate(step_predictions)


def evaluate_models(models, X_train, y_train, X_val, y_val, X_test, y_test, n_weeks):
    """Compare models with a step-by-step (walk-forward) evaluation.

    For every model:

    1. Fit on the training split and record the train metrics, before any
       refit so they describe the model trained on training data only.
    2. Walk through the first ``n_weeks`` rows of the validation split:
       predict one row, then refit on train plus the validation rows seen.
    3. Walk through the first ``n_weeks`` rows of the test split the same
       way, with train plus the full validation split as history.

    One row is treated as one step.
    NOTE(review): this assumes one row per week in X_val / X_test — confirm.

    Parameters
    ----------
    models : dict
        Display name -> estimator with ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train, X_val, y_val, X_test, y_test
        pandas objects (``.iloc`` and ``pd.concat`` are used on them).
    n_weeks : int
        Number of steps to evaluate in both the validation and the test
        split.

    Returns
    -------
    pandas.DataFrame
        One row per model with RMSE, MAE and R2 per split, ordered by
        ascending "Test RMSE".

    Raises
    ------
    ValueError
        If ``n_weeks`` is smaller than 1 or larger than the validation or
        test split.
    """
    max_steps = min(len(X_val), len(X_test))
    if not 1 <= n_weeks <= max_steps:
        raise ValueError(
            f"n_weeks must be between 1 and {max_steps}, got {n_weeks}"
        )

    # History available before the first test step.
    X_train_val = pd.concat([X_train, X_val])
    y_train_val = pd.concat([y_train, y_val])

    results = []

    for model_name, model in models.items():
        model.fit(X_train, y_train)

        # Train metrics must be taken before the walk-forward refits.
        model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(
            y_train, model.predict(X_train)
        )

        y_val_pred = _walk_forward_predictions(
            model, X_train, y_train, X_val, y_val, n_weeks
        )
        model_val_mae, model_val_rmse, model_val_r2 = evaluate_model(
            y_val.iloc[:n_weeks], y_val_pred
        )

        y_test_pred = _walk_forward_predictions(
            model, X_train_val, y_train_val, X_test, y_test, n_weeks
        )
        model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(
            y_test.iloc[:n_weeks], y_test_pred
        )

        results.append(
            {
                "Model": model_name,
                "Train RMSE": model_train_rmse,
                "Train MAE": model_train_mae,
                "Train R2": model_train_r2,
                "Validation RMSE": model_val_rmse,
                "Validation MAE": model_val_mae,
                "Validation R2": model_val_r2,
                "Test RMSE": model_test_rmse,
                "Test MAE": model_test_mae,
                "Test R2": model_test_r2,
            }
        )

    # Create DataFrame from results list with a fixed column order.
    results_df = pd.DataFrame(
        results,
        columns=[
            "Model",
            "Train RMSE",
            "Train MAE",
            "Train R2",
            "Validation RMSE",
            "Validation MAE",
            "Validation R2",
            "Test RMSE",
            "Test MAE",
            "Test R2",
        ],
    )

    # Sort results by Test RMSE (best first).
    return results_df.sort_values("Test RMSE")

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def evaluate_metrics(y_true, y_pred):
    """Return ``(MAE, RMSE, R2)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is the square root of scikit-learn's mean squared error.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


def evaluate_train(model, X_train, y_train):
    """Score an already fitted ``model`` on the training data.

    Returns a dict with the keys "Train MAE", "Train RMSE" and "Train R2".
    """
    scores = evaluate_metrics(y_train, model.predict(X_train))
    return dict(zip(("Train MAE", "Train RMSE", "Train R2"), scores))


def evaluate_validation(model, X_val, y_val, week_column):
    """Score a fitted ``model`` on the validation split, one week at a time.

    The weeks found in ``X_val[week_column]`` are visited in ascending order.
    For each week only that week's rows are predicted, and each prediction is
    written to the position of its source row. The predictions therefore stay
    aligned with ``y_val`` even when ``X_val`` is not ordered by week, and no
    row is predicted more than once.

    Parameters
    ----------
    model
        Fitted estimator with a ``predict`` method.
    X_val : pandas.DataFrame
        Validation features; must contain ``week_column``.
    y_val
        Validation targets, in the same row order as ``X_val``.
    week_column : str
        Name of the column identifying the week of each row.

    Returns
    -------
    dict
        Keys "Validation MAE", "Validation RMSE" and "Validation R2".
    """
    week_of_row = X_val[week_column].to_numpy()

    # Start from NaN so a row that never gets a prediction (e.g. a missing
    # week value) is not scored silently.
    y_val_pred = np.full(len(X_val), np.nan)

    for current_week in sorted(X_val[week_column].dropna().unique()):
        in_week = week_of_row == current_week
        y_val_pred[in_week] = np.ravel(model.predict(X_val[in_week]))

    mae, rmse, r2 = evaluate_metrics(y_val, y_val_pred)
    return {"Validation MAE": mae, "Validation RMSE": rmse, "Validation R2": r2}


def evaluate_test(model, X_test, y_test, week_column):
    """Score a fitted ``model`` on the test split, one week at a time.

    The weeks found in ``X_test[week_column]`` are visited in ascending
    order. For each week only that week's rows are predicted, and each
    prediction is written to the position of its source row. The predictions
    therefore stay aligned with ``y_test`` even when ``X_test`` is not
    ordered by week, and no row is predicted more than once.

    Parameters
    ----------
    model
        Fitted estimator with a ``predict`` method.
    X_test : pandas.DataFrame
        Test features; must contain ``week_column``.
    y_test
        Test targets, in the same row order as ``X_test``.
    week_column : str
        Name of the column identifying the week of each row.

    Returns
    -------
    dict
        Keys "Test MAE", "Test RMSE" and "Test R2".
    """
    week_of_row = X_test[week_column].to_numpy()

    # Start from NaN so a row that never gets a prediction (e.g. a missing
    # week value) is not scored silently.
    y_test_pred = np.full(len(X_test), np.nan)

    for current_week in sorted(X_test[week_column].dropna().unique()):
        in_week = week_of_row == current_week
        y_test_pred[in_week] = np.ravel(model.predict(X_test[in_week]))

    mae, rmse, r2 = evaluate_metrics(y_test, y_test_pred)
    return {"Test MAE": mae, "Test RMSE": rmse, "Test R2": r2}


def evaluate_models(
    models, X_train, y_train, X_val, y_val, X_test, y_test, week_column
):
    """Fit each model on the training split and collect its scores in a table.

    Every estimator in ``models`` (display name -> estimator) is fitted on the
    training data and scored with ``evaluate_train``, ``evaluate_validation``
    and ``evaluate_test``; the last two receive ``week_column``.

    Returns a DataFrame with one row per model, ordered by ascending
    "Test RMSE".
    """

    def _score(label, estimator):
        # Fit once, then gather the three score dicts into a single row.
        estimator.fit(X_train, y_train)
        row = {"Model": label}
        row.update(evaluate_train(estimator, X_train, y_train))
        row.update(evaluate_validation(estimator, X_val, y_val, week_column))
        row.update(evaluate_test(estimator, X_test, y_test, week_column))
        return row

    rows = [_score(label, estimator) for label, estimator in models.items()]

    # Best test RMSE first.
    return pd.DataFrame(rows).sort_values("Test RMSE")

# 4. Pick best one --> Optimize with grid search

In [None]:
# Get feature importances from the model.
# NOTE(review): `best_model` is only assigned in a later cell (the grid
# search) and `plt` is not imported in any visible cell — confirm both exist
# before running this cell.
feature_importances = best_model.get_feature_importance(prettified=False)

# Get feature names after the column transformation.
# NOTE(review): assumes the model was trained on the preprocessor's output,
# so names and importances line up — confirm.
feature_names = preprocessor.get_feature_names_out()

# Sort importances and names together by importance. argsort is ascending,
# which puts the most important feature at the top of a horizontal bar chart.
sorted_idx = np.argsort(feature_importances)
feature_importances = feature_importances[sorted_idx]
feature_names = feature_names[sorted_idx]

# Define plot size and create a horizontal bar chart.
plt.figure(figsize=(12, 6))
plt.barh(range(len(feature_names)), feature_importances, align="center")
plt.yticks(range(len(feature_names)), feature_names)
# Labels use plain spaces (the originals contained tab characters), and the
# title now names this project's target instead of an unrelated one.
plt.xlabel("Feature Importance")
plt.ylabel("Feature Names")
plt.title("Feature Importance for Unit Sales Prediction")
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for CatBoost: 3 x 3 x 3 = 27 combinations.
param_grid = {
    "depth": [4, 6, 8],
    "learning_rate": [0.05, 0.1, 0.2],
    "iterations": [50, 100, 200],
}

# Unfitted estimator handed to the grid search as a template.
# NOTE(review): unlike the entry in `models`, verbose=False is not set here.
best_model = CatBoostRegressor()

# Exhaustive search over param_grid with cv=5, fitted on the training split.
# NOTE(review): the data is time-ordered — confirm that cv=5 splits it in a
# way that respects time order.
grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [None]:
# Show the best hyper-parameter combination found by the grid search
# (the variable assigned there is `best_params`, with a trailing "s").
best_params

In [None]:
# Grid search over C and gamma for the "svm" step of a scaler + SVC pipeline.
# NOTE(review): `pipeline`, `MinMaxScaler` and `SVC` are not imported in any
# visible cell, and this reassigns the CatBoost `param_grid` — the cell looks
# like it was copied from another example; confirm it belongs here.
# NOTE(review): SVC is presumably a classifier, while the target here is
# unit sales — confirm.
param_grid = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "svm__gamma": [0.001, 0.01, 0.1, 1, 10, 100],
}
pipe = pipeline.Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

Nested cross-validation
https://ml-course.github.io/master/notebooks/Tutorial%203%20-%20Machine%20Learning%20in%20Python.html#evaluate

In [None]:
# Nested cross-validation: an inner 5-fold grid search wrapped in an outer
# 5-fold cross_val_score.
# NOTE(review): `cross_val_score`, `SVC` and `iris` are not defined in any
# visible cell; the cell appears to come from the linked tutorial and does
# not use this notebook's data — confirm.
scores = cross_val_score(
    GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5
)

In [None]:
# Same nested cross-validation call as the previous cell, followed by a
# printout of the per-fold scores and their mean.
# NOTE(review): `cross_val_score`, `SVC` and `iris` are not defined in any
# visible cell — confirm this cell is meant to run in this notebook.
scores = cross_val_score(
    GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5
)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())

In [None]:
# Write the dataset to CSV without the index column.
# NOTE(review): `df_eda` is not defined in any visible cell — confirm which
# DataFrame should be saved.
df_eda.to_csv("final_model.csv", index=False)
# Save the trained model.
# NOTE(review): `lr_model` is not defined in any visible cell, and the file
# name suggests a CatBoost model — confirm which object should be saved.
lr_model.save_model("catboost_model.cbm")

To-do: Residual analysis?
--> Check whether the errors are randomly distributed in a scatter plot (point cloud)