# Corporacion Favorita - New Superb Forecasting Model - 

## Split and Model Pipeline

#codi

Made by 4B Consultancy (Janne Heuvelmans, Georgi Duev, Alexander Engelage, Sebastiaan de Bruin) - 2024

In this data pipeline, 

The following steps are made within this notebook:  

>-0. Import Packages 

>-1. Load final dataset and aggregate dataset to weekly level
    -1.1 Load final dataset made in Data Preperation Pipeline Notebook
    -1.2 Aggregate dataset to weekly level

>-2. Column transformers and Train, Test, Validation Split

>-3. Models

>-4. Pick best model one and optimize with grid search

## 0. Import Packages

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import polars as pl
import os
import sys
import altair as alt
import vegafusion as vf
import sklearn
import time
from datetime import date, datetime, timedelta
from sklearn.pipeline import Pipeline, make_pipeline

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import statsmodels.api as sm

In [3]:
from sklearn.model_selection import train_test_split

## 1. Load final dataset, Inpute Stockouts and Aggregate dataset to weekly level

### 1.1. Functions - Import raw data from local PATH
Create import data function and give basic information function within the importing function.

Return basic information on each dataframe:  
- a) Information on the number of observation and features.  
- b) Information on the size of the dataframe. 

TO-DO: Import via polars, and use polars dataframe?

In [4]:
def f_get_data_and_info(import_path, file_name="df_final"):

    print(f"\nReading file {file_name}\n")

    # Load data.
    df = pd.read_parquet(import_path + file_name + ".parquet")

    # Getting the basic information of the dataframe (number of observations and features, and size)
    print(
        f"The '{file_name}' dataframe contains: {df.shape[0]:,}".replace(",", ".")
        + f" observations and {df.shape[1]} features."
    )
    print(
        f"Prepared and transformed dataframe has optimized size of {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB."
    )

    return df

### 1.2. Importing raw data
Importing parquet files with importing function (giving basic information)

In [6]:
import_path = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/processed/"


# import_path = "C:/Users/sebas/OneDrive/Documenten/GitHub/Supermarketcasegroupproject/Group4B/data/raw/


# Importing final df


df_final = f_get_data_and_info(import_path, file_name="Prepped_data_20241004")


Reading file Prepped_data_20241004

The 'Prepped_data_20241004' dataframe contains: 67.834.270 observations and 19 features.
Prepared and transformed dataframe has optimized size of 2.72 GB.


To-do: include null_count into importing OR make basic descrption function with features, size, null_count

In [7]:
df_final.info()
# Count nulls per column
null_counts = df_final.isnull().sum()

# Print results
for column, count in null_counts.items():
    print(f"Column '{column}' has {count} null values.")

<class 'pandas.core.frame.DataFrame'>
Index: 67834270 entries, 0 to 67834269
Data columns (total 19 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   store_nbr               uint8         
 1   item_nbr                int32         
 2   date                    datetime64[ns]
 3   unit_sales              float32       
 4   onpromotion             bool          
 5   holiday_local_count     int8          
 6   holiday_national_count  int8          
 7   holiday_regional_count  int8          
 8   store_type              category      
 9   store_cluster           uint8         
 10  item_family             category      
 11  item_class              uint16        
 12  perishable              uint8         
 13  store_status            int8          
 14  item_status             int8          
 15  year                    int16         
 16  weekday                 int8          
 17  week_nbr                int8          
 18  week_

In [8]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67834270 entries, 0 to 67834269
Data columns (total 19 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   store_nbr               uint8         
 1   item_nbr                int32         
 2   date                    datetime64[ns]
 3   unit_sales              float32       
 4   onpromotion             bool          
 5   holiday_local_count     int8          
 6   holiday_national_count  int8          
 7   holiday_regional_count  int8          
 8   store_type              category      
 9   store_cluster           uint8         
 10  item_family             category      
 11  item_class              uint16        
 12  perishable              uint8         
 13  store_status            int8          
 14  item_status             int8          
 15  year                    int16         
 16  weekday                 int8          
 17  week_nbr                int8          
 18  week_

## 2.0 Train test val split

SKtime

ExpandingWindowSplitter


#TO-DO: Selecting on weeks or via data?

In [10]:
features = [
    "store_nbr",
    "item_nbr",
    "onpromotion",
    "holiday_local_count",
    "holiday_national_count",
    "holiday_regional_count",
    "store_type",
    "store_cluster",
    "item_family",
    "item_class",
    "perishable",
    "store_status",
    "item_status",
    "year",
    # "week_nbr",
    "week_number_cum",
]

target_variable = ["unit_sales"]

# X = df[features]
# y = df[target_variable]

In [23]:
import sktime
from sktime.forecasting.model_selection import SingleWindowSplitter
import pandas as pd


def train_test_val_split(
    df, features, target_variable, train_week_end, test_week_end, window_length=52
):

    # Ensure the df is sorted by store_nbr, item_nbr, and week_number_cum

    df = df.sort_values(["store_nbr", "item_nbr", "week_number_cum"])

    # Create X and y
    X = df[features]
    y = df[target_variable]

    # Training data: From the first week to the `train_week_end`
    X_train = X[X["week_number_cum"] <= train_week_end]
    y_train = y[X["week_number_cum"] <= train_week_end]

    # Testing data: Sliding window of 12 months (52 weeks) from `train_week_end` to `test_week_end`
    test_fh = df[
        (df["week_number_cum"] > train_week_end)
        & (df["week_number_cum"] <= test_week_end)
    ]["week_number_cum"].nunique()

    test_splitter = SingleWindowSplitter(fh=test_fh, window_length=window_length)

    test_train_idx, test_test_idx = next(
        test_splitter.split(
            X[
                (X["week_number_cum"] > train_week_end)
                & (X["week_number_cum"] <= test_week_end)
            ]
        )
    )

    # Get the test set indices
    X_test = X.iloc[test_test_idx]
    y_test = y.iloc[test_test_idx]

    # Validation data: Sliding window of 12 months (52 weeks) after `test_week_end`
    val_fh = df[df["week_number_cum"] > test_week_end]["week_number_cum"].nunique()
    val_splitter = SingleWindowSplitter(fh=val_fh, window_length=window_length)

    val_train_idx, val_test_idx = next(
        val_splitter.split(X[X["week_number_cum"] > test_week_end])
    )

    # Get the validation set indices
    X_val = X.iloc[val_test_idx]
    y_val = y.iloc[val_test_idx]

    # Function to print split information
    def print_split_info(split_name, X_split, y_split):

        print(f"\n{split_name} set:")
        print(f"X_{split_name.lower()} shape: {X_split.shape}")
        print(f"y_{split_name.lower()} shape: {y_split.shape}")
        print(f"{split_name} Min Week: {X_split['week_number_cum'].min()}")
        print(f"{split_name} Max Week: {X_split['week_number_cum'].max()}")
        print(f"{split_name} number of weeks: {X_split['week_number_cum'].nunique()}")
        print(f"Number of stores: {X_split['store_nbr'].nunique()}")
        print(f"Number of items: {X_split['item_nbr'].nunique()}")

    # Print information about the splits
    print_split_info("Train", X_train, y_train)
    print_split_info("Test", X_test, y_test)
    print_split_info("Validation", X_val, y_val)

    return X_train, y_train, X_test, y_test, X_val, y_val

ImportError: cannot import name 'SingleWindowSplitter' from 'sktime.forecasting.model_selection' (c:\Users\alexander\Documents\0. Data Science and AI for Experts\EAISI_4B_Supermarket\venv_case_project\Lib\site-packages\sktime\forecasting\model_selection\__init__.py)

In [25]:
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(
    df_final, features, target_variable, train_week_end=156, test_week_end=208
)

TypeError: train_test_val_split() got an unexpected keyword argument 'train_week_end'

In [None]:
# i am still not content with the train,test,val timeseries splitter using sktime. I want a function where i can select train_week_end=156, test_week_end=208 within the function. The train df should run from first week to train_week_end. The test dataframe should be a sliding window of 12 months, to test only the performance of train_week_end week 156> until week test_week_end=208. The  valdataframe should be a sliding window of 12 months, to test only the performance of week 208 until end dataframe

In [None]:
def print_split_info(split_name, X_split, y_split):
    print(f"\n{split_name} set:")
    print(f"X_{split_name.lower()} shape: {X_split.shape}")
    print(f"y_{split_name.lower()} shape: {y_split.shape}")
    print(f"{split_name} Min Week: {X_split['week_number_cum'].min()}")
    print(f"{split_name} Max Week: {X_split['week_number_cum'].max()}")
    print(f"{split_name} number of weeks: {X_split['week_number_cum'].nunique()}")
    print(f"Number of stores: {X_split['store_nbr'].nunique()}")
    print(f"Number of items: {X_split['item_nbr'].nunique()}")

In [22]:
def train_test_val_split(df, train_end, test_end):

    # Ensure the df is sorted by date
    df = df.sort_index()

    # Split the df in train, test en validation subsets
    train = df[:train_end]
    test = df[train_end:test_end]
    val = df[test_end:]

    # Create X and y for each subset
    X_train, y_train = train[features], train[target_variable]
    X_test, y_test = test[features], test[target_variable]
    X_val, y_val = val[features], val[target_variable]

    # Print information about the splits
    print("Train set:")
    print("X_train shape: {}".format(X_train.shape))
    print("y_train shape: {}".format(y_train.shape))
    print("Training Min Date: {}".format(X_train.index.min()))
    print("Training Max Date: {}".format(X_train.index.max()))

    print("\nTest set:")
    print("X_test shape: {}".format(X_test.shape))
    print("y_test shape: {}".format(y_test.shape))
    print("Test Min Date: {}".format(X_test.index.min()))
    print("Test Max Date: {}".format(X_test.index.max()))

    print("\nValidation set:")
    print("X_val shape: {}".format(X_val.shape))
    print("y_val shape: {}".format(y_val.shape))
    print("Validation Min Date: {}".format(X_val.index.min()))
    print("Validation Max Date: {}".format(X_val.index.max()))

    return X_train, y_train, X_test, y_test, X_val, y_val

To-do: do we split based on dates or based on weeks since start?

In [21]:
X_train, y_train, X_test, y_test, X_val, y_val = train_test_val_split(
    df_final, train_end="2016-06-01", test_end="2017-01-01"
)

NameError: name 'train_test_val_split' is not defined

## 3.0 Functions - Impute stockouts and Aggregate dataset to weekly level


#### 3.1. Impute stockouts

Stockout on store level

•      Perishable good: when there are missing values for two consecutive days for a given item per individual store 

•      Nonperishable goods: when there are missing values for 7 consecutive days for a given item and per individual store

•      Action: Impute with Rolling Mean with defeault window of 7 days 

------------------------------------

In [None]:
def impute_stockouts_polars(df_pandas, window_size=7):

    # Convert the input Pandas DataFrame to a Polars DataFrame for efficient processing

    df = pl.from_pandas(df_pandas)

    # Sort the DataFrame by store number, item number, and date for consistent ordering

    df = df.sort(["store_nbr", "item_nbr", "date"])

    # Nested function calc_missing_count to calculate the count of consecutive missing values in unit_sales

    def calc_missing_count(unit_sales):

        return (
            unit_sales.is_null()  # Check for null values
            .cast(pl.Int32)  # Cast to integer (1 for null, 0 for not null)
            .cum_sum()  # Cumulative sum to count sequential nulls
            .over(["store_nbr", "item_nbr"])  # Group by store_nbr and item_nbr
        )

    # Nested function to Inpute with rolling mean for missing values
    def rolling_mean_imputation(unit_sales, window_size):

        return (
            unit_sales.rolling_mean(
                window_size=window_size, min_periods=1
            )  # Impute strategy based on rolling mean
            .shift(
                1
            )  # Shift window by one day, to prevent taking the same day into account
            .over(["store_nbr", "item_nbr"])  # Group by store_nbr and item_nbr
        )

    # Apply the imputation logic based on the perishable status of the items

    df = df.with_columns(
        [
            pl.when(pl.col("perishable") == 1)  # Check if the item is perishable = 1
            .then(
                pl.when(
                    calc_missing_count(pl.col("unit_sales")) == 1
                )  # 1 missing value
                .then(0)  # --> Impute with 0
                .when(
                    calc_missing_count(pl.col("unit_sales")) > 2
                )  # More than 2 missing values
                .then(0)  # --> Impute with 0
                .when(
                    calc_missing_count(pl.col("unit_sales")) == 2
                )  # = 2 missing values
                .then(
                    rolling_mean_imputation(pl.col("unit_sales"), window_size)
                )  # --> Inpute with rolling mean for 2 missing days
                .otherwise(pl.col("unit_sales"))  # Otherwise keep original value
            )
            .when(pl.col("perishable") == 0)  # If the item is not perishable = 0
            .then(
                pl.when(
                    calc_missing_count(pl.col("unit_sales")) > 7
                )  # More than 7 missing values
                .then(0)  # --> Impute with 0
                .when(
                    calc_missing_count(pl.col("unit_sales")) <= 7
                )  # if less 7 missing values
                .then(
                    rolling_mean_imputation(pl.col("unit_sales"), window_size)
                )  # --> Inpute with rolling mean for missing 7 or less days
                .otherwise(pl.col("unit_sales"))  # Otherwise keep original value
            )
            .otherwise(pl.col("unit_sales"))  # For any other case not covered
            .alias("unit_sales")  # Alias the new column as 'unit_sales'
        ]
    )

    # Convert Polars df back to Pandas df
    df = df.to_pandas()

    return df

In [None]:
# def impute_stockouts_(df):

#     df = impute_stockouts_polars(df, window_size=7)

#     return df

## 3.2. Aggregate dataset to weekly level

- Group the DataFrame by store number, item number, year, and week_cum_number, then aggregate the columns
--> "unit_sales","onpromotion", "holiday_local_count","holiday_regional_count","holiday_national_count",


In [17]:
def aggregate_week(df):

    # Pre-sort the DataFrame
    df = df.sort_values(["store_nbr", "item_nbr", "year", "week_nbr"])

    # Group by the specified columns and aggregate
    df = (
        df.groupby(
            [
                "store_nbr",
                "item_nbr",
                "year",
                "week_number_cum",  # Aggregating by week_number_cum
            ]
        )
        .agg(
            {
                "unit_sales": "sum",
                "onpromotion": "sum",
                "holiday_local_count": "sum",
                "holiday_regional_count": "sum",
                "holiday_national_count": "sum",
                "date": "first",  # Keep the first day of week, needed to run Timeseries models from SKtime
                "store_type": "first",  # Keep the first occurrence of store_type
                "store_cluster": "first",  # Keep the first occurrence of store_cluster
                "item_family": "first",  # Keep the first occurrence of item_family
                "item_class": "first",  # Keep the first occurrence of item_class
                "perishable": "first",  # Keep the first occurrence of perishable
                "store_status": "last",  # Keep the last occurrence of store_status
                "item_status": "last",  # Keep the last occurrence of item_status
            }
        )
        .reset_index()
    )

    # Define the columns weekday' to drop
    # columns_to_drop = ["weekday"]
    # df = df.drop(columns=columns_to_drop)

    return df

In [18]:
df_agg = aggregate_week(df_final)

df_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9730820 entries, 0 to 9730819
Data columns (total 17 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   store_nbr               uint8         
 1   item_nbr                int32         
 2   year                    int16         
 3   week_number_cum         int16         
 4   unit_sales              float32       
 5   onpromotion             int64         
 6   holiday_local_count     int8          
 7   holiday_regional_count  int8          
 8   holiday_national_count  int8          
 9   date                    datetime64[ns]
 10  store_type              category      
 11  store_cluster           uint8         
 12  item_family             category      
 13  item_class              uint16        
 14  perishable              uint8         
 15  store_status            int8          
 16  item_status             int8          
dtypes: category(2), datetime64[ns](1), float32(1),

In [19]:
df_agg.head(10)

Unnamed: 0,store_nbr,item_nbr,year,week_number_cum,unit_sales,onpromotion,holiday_local_count,holiday_regional_count,holiday_national_count,date,store_type,store_cluster,item_family,item_class,perishable,store_status,item_status
0,1,96995,2013,1,0.0,0,0,0,1,2013-01-02,D,13,GROCERY I,1093,0,0,3
1,1,96995,2013,2,2.0,0,0,0,1,2013-01-07,D,13,GROCERY I,1093,0,0,1
2,1,96995,2013,3,3.0,0,0,0,0,2013-01-14,D,13,GROCERY I,1093,0,0,1
3,1,96995,2013,4,2.0,0,0,0,0,2013-01-21,D,13,GROCERY I,1093,0,0,1
4,1,96995,2013,5,2.0,0,0,0,0,2013-01-28,D,13,GROCERY I,1093,0,0,1
5,1,96995,2013,6,1.0,0,0,0,0,2013-02-04,D,13,GROCERY I,1093,0,0,1
6,1,96995,2013,7,4.0,0,0,0,2,2013-02-11,D,13,GROCERY I,1093,0,0,1
7,1,96995,2013,8,2.0,0,0,0,0,2013-02-18,D,13,GROCERY I,1093,0,0,1
8,1,96995,2013,9,2.0,0,0,0,0,2013-02-25,D,13,GROCERY I,1093,0,0,1
9,1,96995,2013,10,5.0,0,0,0,0,2013-03-04,D,13,GROCERY I,1093,0,0,1


In [None]:
def aggregate_week_polars(df):

    # Convert to a Polars DataFrame
    df = pl.DataFrame(df)

    # Group by and aggregate, use .alias ['column'] to specify column-name
    df = df.groupby(["store_nbr", "item_nbr", "year", "week_number_cum"]).agg(
        [
            pl.col("unit_sales").sum().alias("unit_sales"),
            pl.col("onpromotion").sum().alias("onpromotion"),
            pl.col("holiday_local_count").sum().alias("holiday_local_count"),
            pl.col("holiday_regional_count").sum().alias("holiday_regional_count"),
            pl.col("holiday_national_count").sum().alias("holiday_national_count"),
            pl.col("date")
            .first()  # Keep the first day of week, needed to run Timeseries models from SKtime
            .alias("date"),
            pl.col("store_type")
            .first()  # Keep the first occurrence of store_type
            .alias("store_type"),
            pl.col("store_cluster")
            .first()  # Keep the first occurrence of store_cluster
            .alias("store_cluster"),
            pl.col("item_family")
            .first()  # Keep the first occurrence of item_family
            .alias("item_family"),
            pl.col("item_class")
            .first()  # Keep the first occurrence of item_class
            .alias("item_class"),
            pl.col("perishable")
            .first()  # Keep the first occurrence of perishable
            .alias("perishable"),
            pl.col("store_status")
            .last()  # Keep the last occurrence of store_status
            .alias("store_status"),
            pl.col("item_status")
            .last()  # Keep the last occurrence of item_status
            .alias("item_status"),
        ]
    )

    # Convert Polars df back to Pandas df
    df = df.to_pandas

    return df

## 4. Column transformers

### 4.1. Column transformers

In [None]:
features = [
    "store_nbr",
    "item_nbr",
    "onpromotion",
    "holiday_local_count",
    "holiday_national_count",
    "holiday_regional_count",
    "store_type",
    "store_cluster",
    "item_family",
    "item_class",
    "perishable",
    "store_status",
    "item_status",
    "year",
    "week_nbr",
    "week_number_cum",
]

target_variable = ["unit_sales"]

X = df[features]
y = df[target_variable]

To-do: Do we need onehotencoder? --> then needed to seperate between timeseries en ML models

To-do: Change catagory dtypes from store_type and item_family just to numbers in prep pipeline?

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

# Create your transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        (
            "ImputeStockouts",  # Impute stockouts
            FunctionTransformer(impute_stockouts_polars),
            num_features.tolist() + cat_features.tolist(),
        ),
        (
            "AggregateWeek",  # Aggregate dataset to weekly level
            FunctionTransformer(aggregate_week_polars),
            num_features.tolist() + cat_features.tolist(),
        ),
        ("StandardScaler", StandardScaler(), num_features),  # To-do: --> needed?????
    ]
)

# Fit and transform the data
X_transformed = preprocessor.fit_transform(X)

In [None]:
# 3.1.1. ForecastingGridSearchCV

## 5. Models

### 5.1. Models list to compare in model

In [None]:
# <PATH>.\venv_case_project\Scripts\activate

# source venv_macbook/bin/activate #for Georgi ;)

# pip install sktime
# pip install statsmodels
# pip install xgboost

In [None]:
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.compose import make_reduction
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

models = {
    "Naive": NaiveForecaster(strategy="last"),
    "Simple Moving Average": NaiveForecaster(strategy="mean", window_length=6),
    "Holt-Winters": ExponentialSmoothing(trend="add", seasonal="add", sp=52),
    "Random Forest Regressor": make_reduction(
        RandomForestRegressor(), window_length=13, strategy="recursive"
    ),
    "XGBoost": make_reduction(XGBRegressor(), window_length=13, strategy="recursive"),
}

### 5.2. Evaulation Metrics and Evaluate Model functions

In [1]:
from sklearn.metrics import (
    mean_absolute_percentage_error,
    mean_absolute_error,
    r2_score,
)


def forecast_accuracy(y_true, y_pred):

    # Calculate the absolute differences between true and predicted values
    absolute_errors = np.abs(y_true - y_pred)

    # Calculate 10% of the absolute true values
    tolerance = 0.1 * np.abs(y_true)

    # Check if the absolute errors are within 10% of the true values
    within_tolerance = absolute_errors <= tolerance

    # Calculate and return the mean of the boolean array
    # (True is treated as 1 and False as 0), which gives the proportion of accurate forecasts
    return np.mean(within_tolerance)


def evaluate_model(y_true, y_pred):

    mape = mean_absolute_percentage_error(y_true, y_pred)
    accuracy = forecast_accuracy(y_true, y_pred)
    bias = np.mean(y_true - y_pred)

    return mape, accuracy, bias

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score


def evaluate_forecast(
    y_true, y_pred, scaler=None, tolerance=0.1
):  # scaler=StandardScaler()

    y_true, y_pred = np.array(y_true), np.array(y_pred)

    # # If scaler is provided --> Inverse transform the data, because of the StandardScaler()
    if scaler:
        y_true = scaler.inverse_transform(y_true.reshape(-1, 1)).flatten()
        y_pred = scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

    # Calculate absolute errors and tolerance
    absolute_errors = np.abs(y_true - y_pred)
    tolerance_values = tolerance * np.abs(y_true)

    # Calculate metrics
    mape = mean_absolute_percentage_error(y_true, y_pred)
    bias = np.mean(y_true - y_pred)

    # Calculate accuracy (within tolerance)
    within_tolerance = absolute_errors <= tolerance_values
    accuracy = np.mean(within_tolerance)

    # Calculate direction accuracy
    direction_correct = np.sign(y_true[1:] - y_true[:-1]) == np.sign(
        y_pred[1:] - y_pred[:-1]
    )
    direction_accuracy = np.mean(direction_correct)

    return {
        "MAPE": mape,
        "Bias": bias,
        "Accuracy (within {}% tolerance)".format(tolerance * 100): accuracy,
        "Direction Accuracy": direction_accuracy,
    }

Run / Fit model

In [None]:
def evaluate_models(models, X_train, y_train, X_val, y_val, X_test, y_test):

    # Initialize an empty list to store results
    results = []

    for model_name, model in models.items():
        model.fit(X_train, y_train)

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)

        # Evaluate Train, Validation, and Test dataset
        model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(
            y_train, y_train_pred
        )
        model_val_mae, model_val_rmse, model_val_r2 = evaluate_model(y_val, y_val_pred)
        model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(
            y_test, y_test_pred
        )

        # Append results to list
        results.append(
            {
                "Model": model_name,
                "Train RMSE": model_train_rmse,
                "Train MAE": model_train_mae,
                "Train R2": model_train_r2,
                "Validation RMSE": model_val_rmse,
                "Validation MAE": model_val_mae,
                "Validation R2": model_val_r2,
                "Test RMSE": model_test_rmse,
                "Test MAE": model_test_mae,
                "Test R2": model_test_r2,
            }
        )

    # Create DataFrame from results list
    results_df = pd.DataFrame(
        results,
        columns=[
            "Model",
            "Train RMSE",
            "Train MAE",
            "Train R2",
            "Validation RMSE",
            "Validation MAE",
            "Validation R2",
            "Test RMSE",
            "Test MAE",
            "Test R2",
        ],
    )

    # Sort results by Test RMSE
    results_df = results_df.sort_values("Test RMSE")

    return results_df

To-do: how to make test forecast per week, so 26 iternations of weeks 1 for one?

we now have a good function. however the test (26 weeks) and validation (~26 weeks) shoukd not be done all at once for all the weks. But iterative week for week, as it is for forecasting store sales, and the sales of next week are in reality not known in advance. So we think we want it to loop the test and val set week for week. But dont want do add the val or test set to the training data, to prevent data drift

Test Claude

In [None]:
from sktime.performance_metrics.forecasting import (
    mean_absolute_percentage_error,
    mean_absolute_scaled_error,
)


def evaluate_models(models, y_train, y_val, y_test, fh):
    results = []

    for model_name, model in models.items():
        model.fit(y_train)

        y_val_pred = model.predict(fh[: len(y_val)])
        y_test_pred = model.predict(fh)

        val_mape, val_accuracy, val_bias = evaluate_model(y_val, y_val_pred)
        test_mape, test_accuracy, test_bias = evaluate_model(y_test, y_test_pred)

        results.append(
            {
                "Model": model_name,
                "Validation MAPE": val_mape,
                "Validation Accuracy": val_accuracy,
                "Validation Bias": val_bias,
                "Test MAPE": test_mape,
                "Test Accuracy": test_accuracy,
                "Test Bias": test_bias,
            }
        )

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values("Test MAPE")

    return results_df


# Usage example:
# Assuming you have your data split into y_train, y_val, y_test
# and a forecast horizon fh defined

# models = create_models()
# results = evaluate_models(models, y_train, y_val, y_test, fh)
# print(results)

## X. Pick best one --> Optimize with grid search

In [None]:
# 	Get	feature	importances	from	the	model
feature_importances = best_model.get_feature_importance(prettified=False)

# 	Get	feature	names	(considering	potential	transformation)
feature_names = preprocessor.get_feature_names_out()  # 	After	column	transformation

# 	Sort	feature	importances	and	names	together	by	importance	(descending)
sorted_idx = np.argsort(feature_importances)
feature_importances = feature_importances[sorted_idx]
feature_names = feature_names[sorted_idx]

# 	Define	plot	size	and	create	a	bar	chart
plt.figure(figsize=(12, 6))
plt.barh(range(len(feature_names)), feature_importances, align="center")
plt.yticks(range(len(feature_names)), feature_names)
plt.xlabel("Feature	Importance")
plt.ylabel("Feature	Names")
plt.title("Feature	Importance	for	Electricity	Demand-Supply	Prediction")
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "depth": [4, 6, 8],
    "learning_rate": [0.05, 0.1, 0.2],
    "iterations": [50, 100, 200],
}

best_model = CatBoostRegressor()

grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [None]:
best_param

In [None]:
param_grid = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "svm__gamma": [0.001, 0.01, 0.1, 1, 10, 100],
}
pipe = pipeline.Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

Nested cross-validation
https://ml-course.github.io/master/notebooks/Tutorial%203%20-%20Machine%20Learning%20in%20Python.html#evaluate

In [None]:
scores = cross_val_score(
    GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5
)

In [None]:
scores = cross_val_score(
    GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5
)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())

In [None]:
df_eda.to_csv("final_model.csv", index=False)
# 	Save	the	trained	model
lr_model.save_model("catboost_model.cbm")

To-do: Residual analysis?
--> Check if errors are randomly distributed in pointcloud