# Data Preperation Pipeline

### Hamilton Framework
https://hamilton.dagworks.io/en/latest/how-tos/use-in-jupyter-notebook/ 

### 0. Import Packages

In [1]:
import pandas as pd
import numpy as np

import sys
import altair as alt

import vegafusion as vf

import sklearn

from datetime import datetime, timedelta
from sklearn.pipeline import Pipeline, make_pipeline

### 1.1 Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [2]:
def standardize_column_names(s):
    """Removes spaces from the column names."""
    return s.replace(" ", "")


def optimize_memory(df):
    """Optimize memory usage of a DataFrame by converting object columns to categorical
    and downcasting numeric columns to smaller types."""

    # Change: Objects to Categorical.
    object_cols = df.select_dtypes(include="object").columns
    if not object_cols.empty:
        print("Change: Objects to Categorical")
        df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest signed or unsigned integer and floats to smallest.
    for col in df.select_dtypes(include=["int"]).columns:
        if (df[col] >= 0).all():  # Check if all values are non-negative
            df[col] = pd.to_numeric(
                df[col], downcast="unsigned"
            )  # Downcast to unsigned
        else:
            df[col] = pd.to_numeric(df[col], downcast="integer")  # Downcast to signed

    # Downcast float columns
    for col in df.select_dtypes(include=["float"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    return df


def month_year_to_int(df, i):

    # Change: Month and Year to integer.

    if i == 0:

        print("Change: Month and Year to integer")

        df = df.astype({"month": int, "year": int})

    return df


# Transform date-related columns to datetime format.


def transform_date_to_datetime(df, i):

    if i == 0:

        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")

        df["date"] = pd.to_datetime(df[["year", "month", "day"]], unit="us")

    else:
        if "date" in df.columns:

            print("Change: Transformed 'date' column to Datetime Dtype")

            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    return df

In [3]:
def df_basic_info_before(df):
    print(
        f"-> Contains:                {df.shape[0]} observations and {df.shape[1]} features."
    )
    print(
        f"-> Has original size of    {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB."
    )


def df_basic_info_after(df):
    print(
        f"-> Contains:                {df.shape[0]} observations and {df.shape[1]} features."
    )
    print(
        f"-> Has optimized size of    {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB."
    )

### 1.2 Import data from local PATH
Import data trough pipeline to downcast the data and transformation to datetime dtype

In [4]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/sebas/OneDrive/Documenten/GitHub/Supermarketcasegroupproject/Group4B/data/raw/"

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "holidays_events",  # 1
        "items",  # 2
        "stores",  # 3
    )

    print(f"\nReading file {i}\n")

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### 1.3 Importing data

In [5]:
# To-do: write this in function. But where executed? In the end ?

# Sales History per year

df_sales = f_get_data(0)

df_basic_info_after(df_sales)


# Holidays

df_holidays = f_get_data(1)

df_basic_info_after(df_holidays)


# Items

df_items = f_get_data(2)

df_basic_info_after(df_items)


# Stores

df_stores = f_get_data(3)

df_basic_info_after(df_stores)


Reading file 0

Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature
-> Contains:                125497040 observations and 9 features.
-> Has optimized size of    3.74 GB.

Reading file 1

Change: Objects to Categorical
Change: Transformed 'date' column to Datetime Dtype
-> Contains:                350 observations and 6 features.
-> Has optimized size of    0.0 GB.

Reading file 2

Change: Objects to Categorical
-> Contains:                4100 observations and 4 features.
-> Has optimized size of    0.0 GB.

Reading file 3

Change: Objects to Categorical
-> Contains:                54 observations and 5 features.
-> Has optimized size of    0.0 GB.


## 2.0 Exclude Stores + Vulcano Eruption holiday + Items

#### 2.1 Return list containing stores with less then 1670 operational days with sales

parameter: store_exclusion_cutoff_number = 1670 days

In [6]:
def stores_exclude_sales_days(df_sales, df_stores, store_exclusion_cutoff_number=1670):

    # Group the sales date by store and item
    df_sales_grouped = (
        df_sales.groupby(["store_nbr", "date"]).agg({"unit_sales": "sum"}).reset_index()
    )

    # Merge the grouped sales data with the store data
    df_sales_stores_merged = df_sales_grouped.merge(
        df_stores, left_on="store_nbr", right_on="store_nbr", how="inner"
    )

    # Count the number of daily sale records per store
    store_count = df_sales_stores_merged["store_nbr"].value_counts()

    # Get stores with counts less than the exclusion cutoff
    store_count_exclusion = store_count[store_count < store_exclusion_cutoff_number]

    # Get the list of store numbers to be excluded
    list_excluded_stores_sales_days = store_count_exclusion.index.tolist()

    return list_excluded_stores_sales_days

In [7]:
stores_exclude_sales_days(
    df_sales, df_stores, store_exclusion_cutoff_number=1670
)  # --> [30, 14, 12, 25, 24, 18, 36, 53, 20, 29, 21, 42, 22, 52]

[30, 14, 12, 25, 24, 18, 36, 53, 20, 29, 21, 42, 22, 52]

#### 2.2 Return list containing stores with cluster=10 in stores df

In [8]:
def stores_exclude_cluster(df_stores, cluster_number=10):

    # Get the list of store numbers that belong to cluster 10

    list_stores_cluster_10 = df_stores[df_stores["cluster"] == cluster_number][
        "store_nbr"
    ].tolist()

    return list_stores_cluster_10

In [9]:
stores_exclude_cluster(df_stores, cluster_number=10)  # --> [26, 28, 29, 31, 36, 43]

[26, 28, 29, 31, 36, 43]

##### 2.3 Function to exclude stores with less then 1670 sales days and related to cluster 10 

In [10]:
def df_sales_cleaned_stores(df_sales, store_exclusion_cutoff_number=1670):

    # Excluded less then 1670 salesdays
    list_excluded_stores_sales_days = stores_exclude_sales_days(
        df_sales, df_stores, store_exclusion_cutoff_number
    )

    df_sales = df_sales.drop(
        df_sales[df_sales["store_nbr"].isin(list_excluded_stores_sales_days)].index
    )

    # Cluster 10
    list_stores_cluster_10 = stores_exclude_cluster(df_stores, cluster_number=10)

    df_sales = df_sales.drop(
        df_sales[df_sales["store_nbr"].isin(list_stores_cluster_10)].index
    )

    return df_sales

In [11]:
# Sebastiaan -> eigen stukje code

list_stores_cluster_10e = stores_exclude_cluster(df_stores, cluster_number=10)

type(list_stores_cluster_10e)

list

In [12]:
# Execution of final function --> In pipeline
df_sales = df_sales_cleaned_stores(df_sales)

##### 2.4 Filter Vulcano Eruption from holiday df

In [13]:
# Sebastiaan code

# with pd.option_context('display.max_rows', None):
#     display(df_holidays)


In [14]:
def holiday_filter_vulcano_event(df_holidays, event_substring="Terremoto Manabi"):

    # Filter the DataFrame where 'description' contains the event_substring
    df_vulcano_event_filtered = df_holidays[
        df_holidays["description"].str.contains(event_substring)
    ]

    return df_vulcano_event_filtered

In [15]:
def df_holidays_cleaned(df_holidays):

    # Exclude holiday_filter_vulcano_event function to return filtered df
    df_vulcano_event_filtered = holiday_filter_vulcano_event(df_holidays)

    # Filter the specific holiday events from the holiday DataFrame
    df_holidays = df_holidays.loc[
        ~df_holidays.index.isin(df_vulcano_event_filtered.index)
    ]

    return df_holidays

In [16]:
# Execution of final function --> In pipeline?
df_holidays = df_holidays_cleaned(df_holidays)

2.5 Filter and exclude of Items

In [17]:
##Orginal, first try and check on item level


def item_check(start_date, x_days):

    # start_date = "2013-02-01"
    # x_days = 31

    # Convert start_date to datetime
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    print(start_date)

    # Calculate end_date
    end_date = start_date + timedelta(days=x_days)
    print(x_days)
    print(end_date)

    # Filter the DataFrame based on the date range
    df_sales_filtered = df_sales[
        (df_sales["date"] >= start_date) & (df_sales["date"] <= end_date)
    ]

    # Group by item_nbr and sum unit_sales, so this will be the same criteria for all stores.
    df_sales_item = (
        df_sales_filtered.groupby("item_nbr").agg({"unit_sales": "sum"}).reset_index()
    )

    # Get the list of store numbers to stay included, as they have sales within the first 28 days
    list_sales_items = df_sales_item["item_nbr"].tolist()
    unique_values_count = df_sales_item["item_nbr"].nunique()
    print(unique_values_count)

    # If sum_sales = 0 --> drop item

    # first 4 weeks
    # last 4 weeks

    # sum_total per item for these two months

In [18]:
# item_check("2016-01-01", 31)

# TO-DO- WRITE ITEM EXLUSION PART

If no sum_sales of unique item_nbr per unique store_nbr after 30-07-2017 --> exclude

In [19]:
cutoff_date = "30-07-2017"

# # Convert start_date to datetime
# start_date = datetime.strptime(start_date, "%Y-%m-%d")
# print(start_date)

# # Calculate end_date
# end_date = start_date + timedelta(days=x_days)
# print(x_days)
# print(end_date)

# # Filter the DataFrame based on the date range
# df_sales_filtered = df_sales[
#     (df_sales["date"] >= start_date) & (df_sales["date"] <= end_date)
# ]

# # Change the dtype for item_nbr from uint32 to int32
# df_sales["item_nbr"] = df_sales["item_nbr"].astype(int)
# df_items["item_nbr"] = df_items["item_nbr"].astype(int)

# # Merge the filtered sales data with the items data
# df_sales_items_merged = df_sales.merge(df_items, on="item_nbr", how="left")

# # print(df_sales_items_merged.info())
# # print(df_sales_items_merged.sample(5))

# df_sales_items_merged["class"] = df_sales_items_merged["class"].astype(str)

# # Group by item_nbr and sum unit_sales, so this will be the same criteria for all stores.
# df_sales_item = (
#     df_sales_filtered.groupby("class").agg({"unit_sales": "sum"}).reset_index()
# )

# # Get the list of store numbers to stay included, as they have sales within the first 28 days
# list_sales_items = df_sales_item["class"].tolist()
# unique_values_count = df_sales_item["class"].nunique()
# print(unique_values_count)

In [20]:
# test function for items
def get_unique(df, column_name):
    """Get the all values and the count for specific column"""

    unique_values_count = df[column_name].nunique()

    unique_values = df[column_name].unique()


    # Convert unique values to a single string to print

    unique_values_str = ", ".join(map(str, unique_values))


    print(f"Number of unique values in {column_name}: {unique_values_count}")

    print("Unique values:")

    print(unique_values_str)

    return

### 3.0 Prepare and Merge df_sales + df_items + df_stores + df_holidays

3.1 Prepare and clean df_sales

Drop of columns "id", "year", "month", "day"

In [21]:
# Prepare df_sales by cleaning up df for merging with holidays by dropping unneeded columns
def sales_cleaned(df_sales):

    df_sales = df_sales.drop(columns=["id", "year", "month", "day"])

    return df_sales

3.2 Prepare, clean and rename df_items

Rename of columns: "family" to "item_family" and  "class" to "item_class"

In [22]:
# Prepare df_items by cleaning up df by dropping unneeded columns and rename columns for clearity in final df
def items_cleaned_renamed(df_items):

    df_items = df_items.rename(columns={"family": "item_family", "class": "item_class"})

    return df_items

3.3 Prepare, clean and rename df_stores

Drop of columns "state"

Rename of columns "city" to "store_city", "cluster" to "store_cluster" and "type" to "store_type"


In [23]:
# Prepare df_stores by cleaning up df by dropping unneeded columns and rename columns for clearity in final df
def stores_cleaned_renamed(df_stores):

    df_stores = df_stores.drop(columns=["state"])

    df_stores = df_stores.rename(
        columns={"city": "store_city", "cluster": "store_cluster", "type": "store_type"}
    )

    return df_stores

3.2 Prepare df_holidays

In [24]:
# Prepare df_holiday and df_stores by cleaning up df for merging with holidays by dropping unneeded columns
def clean_holidays_stores_prep(df_holidays, df_stores):

    df_holidays_cleaned = df_holidays.drop(
        columns=[
            "description",
            "transferred",
        ]
    )

    df_stores_cleaned = df_stores.drop(columns=["cluster", "type"])

    return df_holidays_cleaned, df_stores_cleaned

In [25]:
def holidays_prep_local(df_holidays, df_stores):

    df_holidays_cleaned, df_stores_cleaned = clean_holidays_stores_prep(
        df_holidays, df_stores
    )

    # select locale 'Local' from holiday df and merge with city stores df
    df_holidays_local = df_holidays_cleaned[df_holidays_cleaned["locale"] == "Local"]

    df_holidays_prep_local = df_holidays_local.merge(
        df_stores_cleaned, left_on="locale_name", right_on="city", how="left"
    )

    return df_holidays_prep_local

In [26]:
#Sebastiaan code
df_holidays_prep_local = holidays_prep_local(df_holidays, df_stores)

df_holidays_prep_local_filtered = df_holidays_prep_local[df_holidays_prep_local["store_nbr"] == 51]

df_holidays_prep_local_filtered.head(30)

Unnamed: 0,date,type,locale,locale_name,store_nbr,city,state
92,2013-07-24,Additional,Local,Guayaquil,51,Guayaquil,Guayas
100,2013-07-25,Holiday,Local,Guayaquil,51,Guayaquil,Guayas
175,2014-07-24,Additional,Local,Guayaquil,51,Guayaquil,Guayas
183,2014-07-25,Holiday,Local,Guayaquil,51,Guayaquil,Guayas
258,2015-07-24,Holiday,Local,Guayaquil,51,Guayaquil,Guayas
266,2015-07-25,Holiday,Local,Guayaquil,51,Guayaquil,Guayas
341,2016-07-24,Additional,Local,Guayaquil,51,Guayaquil,Guayas
349,2016-07-24,Transfer,Local,Guayaquil,51,Guayaquil,Guayas
357,2016-07-25,Holiday,Local,Guayaquil,51,Guayaquil,Guayas
435,2017-07-24,Additional,Local,Guayaquil,51,Guayaquil,Guayas


In [27]:
unique_types = df_holidays_prep_local['type'].unique().tolist()

unique_types

['Holiday', 'Additional', 'Transfer']

In [28]:
def holidays_prep_regional(df_holidays, df_stores):

    df_holidays_cleaned, df_stores_cleaned = clean_holidays_stores_prep(
        df_holidays, df_stores
    )

    # select locale 'Regional' from holiday df and merge with state stores df
    df_holidays_regional = df_holidays_cleaned[
        df_holidays_cleaned["locale"] == "Regional"
    ]

    df_holidays_prep_regional = df_holidays_regional.merge(
        df_stores_cleaned, left_on="locale_name", right_on="state", how="left"
    )

    return df_holidays_prep_regional

In [29]:
#Sebastiaan code
df_holidays_prep_regional = holidays_prep_regional(df_holidays, df_stores)

df_holidays_prep_regional_filtered = df_holidays_prep_regional[df_holidays_prep_regional["store_nbr"] == 27]

df_holidays_prep_regional_filtered.head(30)

Unnamed: 0,date,type,locale,locale_name,store_nbr,city,state


In [30]:
def holidays_prep_national(df_holidays, df_stores):

    df_holidays_cleaned, df_stores_cleaned = clean_holidays_stores_prep(
        df_holidays, df_stores
    )

    # Select locale 'Regional' from holiday df and merge with national stores df
    df_holidays_national = df_holidays_cleaned[
        df_holidays_cleaned["locale"] == "National"
    ]

    # Create extra column for merge on "Ecuador"
    df_stores_cleaned["national_merge"] = "Ecuador"

    df_holidays_prep_national = df_holidays_national.merge(
        df_stores_cleaned, left_on="locale_name", right_on="national_merge", how="left"
    )

    # Drop newly created column national_merge, not needed further
    df_holidays_prep_national = df_holidays_prep_national.drop(
        columns=["national_merge"]
    )

    return df_holidays_prep_national

In [31]:
# Sebastiaan code

df_holidays_prep_national = holidays_prep_national(df_holidays, df_stores)

df_holidays_prep_national_filtered = df_holidays_prep_national[df_holidays_prep_national["store_nbr"] == 54]

df_holidays_prep_national_filtered.sort_values(by="date")

df_holidays_prep_national_filtered.head(50)

Unnamed: 0,date,type,locale,locale_name,store_nbr,city,state
53,2012-08-10,Holiday,National,Ecuador,54,El Carmen,Manabi
107,2012-10-09,Holiday,National,Ecuador,54,El Carmen,Manabi
161,2012-10-12,Transfer,National,Ecuador,54,El Carmen,Manabi
215,2012-11-02,Holiday,National,Ecuador,54,El Carmen,Manabi
269,2012-11-03,Holiday,National,Ecuador,54,El Carmen,Manabi
323,2012-12-21,Additional,National,Ecuador,54,El Carmen,Manabi
377,2012-12-22,Additional,National,Ecuador,54,El Carmen,Manabi
431,2012-12-23,Additional,National,Ecuador,54,El Carmen,Manabi
485,2012-12-24,Bridge,National,Ecuador,54,El Carmen,Manabi
539,2012-12-24,Additional,National,Ecuador,54,El Carmen,Manabi


In [32]:
# Sebastiaan code
df_holidays_prep_local = holidays_prep_local(df_holidays, df_stores)
df_holidays_prep_regional = holidays_prep_regional(df_holidays, df_stores)
df_holidays_prep_national = holidays_prep_national(df_holidays, df_stores)

In [33]:
def holidays_prep_merged(df_holidays, df_stores):

    # Load prep functions from local, Regional and National df's
    df_holidays_prep_local = holidays_prep_local(df_holidays, df_stores)

    df_holidays_prep_regional = holidays_prep_regional(df_holidays, df_stores)

    df_holidays_prep_national = holidays_prep_national(df_holidays, df_stores)

    # Combine local, regional and national dataframes into 1 merged dataframe
    df_holidays_merged = pd.concat(
        [df_holidays_prep_local, df_holidays_prep_regional, df_holidays_prep_national]
    )

    # Clean df_holidays_merged by dropping locale_name", "city", "state"
    df_holidays_merged = df_holidays_merged.drop(
        columns=["locale_name", "city", "state"]
    )

    # Rename 'type' of holiday to 'holiday_type'
    df_holidays_merged = df_holidays_merged.rename(
        columns={"type": "holiday_type", "locale": "holiday_locale"}
    )

    return df_holidays_merged

In [34]:
df_testholidays = holidays_prep_merged(df_holidays, df_stores)

df_testholidays.head(3)

Unnamed: 0,date,holiday_type,holiday_locale,store_nbr
0,2012-03-02,Holiday,Local,52
1,2012-03-02,Holiday,Local,53
2,2012-04-12,Holiday,Local,37


## 3X FIX FOR DUPLICATE ROWS FROM DF_HOLIDAYS_MERGED OUTPUT - BY SEBASTIAAN 16082024

Fix contains 2 possible options:  
Option 1: makes from df_holidays_merged a pivot with 3 columns counting for the amount of holidays per locale of holiday (local, regional or national).  
Option 2: makes from df_holidays_merged a pivot following the columns of option 1 but additionally adds the type of holiday (but therefore increasing the amount of columns).  

I was first thinking about adjusting the processing from local, regional en national dataframes and then doing the union by grouping them first but that would still make it possible to have duplicates over multiple 'locales' (you could have a regional and national holiday on the same day). Thus, the way it's been done now is the way to go. After the union we do the trick to fix the duplicate rows for further processing.

In [35]:
# Sebastiaan code - Option 1 to group all dates in df_holidays_merged after the union of the three dataframes and count the number of holidays per date per store

def holidays_prep_merged_grouped(df_holidays, df_stores):

    # Merge the holiday dataframes and clean the merged dataframe
    df_holidays_merged = holidays_prep_merged(df_holidays, df_stores)

    # Group by date and store_nbr and count the number of holidays per date per store
    df_holidays_merged_grouped = df_holidays_merged.pivot_table(
        index=['date', 'store_nbr'],
        columns='holiday_locale',
        values='holiday_type', 
        aggfunc='count'
    ).reset_index()

    # The nature of the pivot function causes it to append date and store_nbrs for all possible combinations and thus not only the date and store combinations that we originally had in our data, we will conduct an inner join with the original data to get the original date and store_nbr combinations back

    # Remove the name of the columns
    df_holidays_merged_grouped.columns.name = None

    # Rename the columns to countoflocalholidays, countofregionalholidays, countofnationalholidays
    df_holidays_merged_grouped = df_holidays_merged_grouped.rename(columns={
        'Local': 'countoflocalholidays', 
        'Regional': 'countofregionalholidays', 
        'National': 'countofnationalholidays'
    })

    # Fill NaN values with 0
    df_holidays_merged_grouped = df_holidays_merged_grouped.fillna(0)

    # Let's do an inner join with the original data to get the original date and store_nbr combinations back. Therefore we need to make another dataframe.

    df_holidays_merged_grouped_inner = holidays_prep_merged(df_holidays, df_stores)
    df_holidays_merged_grouped_inner = df_holidays_merged_grouped_inner.groupby(['date', 'store_nbr']).size().reset_index().drop(columns=0)

    df_holidays_merged_grouped = df_holidays_merged_grouped.merge(df_holidays_merged_grouped_inner, on=['date', 'store_nbr'], how='inner')

    # Convert the count columns to integer
    df_holidays_merged_grouped = df_holidays_merged_grouped.astype({'countoflocalholidays': int, 'countofregionalholidays': int, 'countofnationalholidays': int})

    print(f'In the orignal unioned holiday dataframe, df_holidays_merged we found (including duplicates) {df_holidays_merged.shape[0]} rows')
    print(f'In our new adjusted dataframe we have {df_holidays_merged_grouped.shape[0]} rows')
    print(f'Thus, we have removed {df_holidays_merged.shape[0] - df_holidays_merged_grouped.shape[0]} rows')

    # Might want to filter out the holiday dates that will never be in de salesdate range. However, they will be left out anyway when joining with the sales data.
    return df_holidays_merged_grouped

In [36]:
# Sebastiaan code - See if the function works

df_test_holidays_grouped = holidays_prep_merged_grouped(df_holidays, df_stores)

df_test_holidays_grouped.head(3)

In the orignal unioned holiday dataframe, df_holidays_merged we found (including duplicates) 8276 rows
In our new adjusted dataframe we have 8091 rows
Thus, we have removed 185 rows


  df_holidays_merged_grouped = df_holidays_merged.pivot_table(


Unnamed: 0,date,store_nbr,countoflocalholidays,countofnationalholidays,countofregionalholidays
0,2012-03-02,52,1,0,0
1,2012-03-02,53,1,0,0
2,2012-04-01,12,0,0,1


In [37]:
# Sebastiaan code # To see if the code works as designed we will check it with store 54 and only for the year 2012 (as i know we have holidays on the same day that year and don't want to bother you with to much data)

df_holidays_prep_national54 = holidays_prep_national(df_holidays, df_stores)
df_holidays_prep_national_filtered54 = df_holidays_prep_national54[df_holidays_prep_national54["store_nbr"] == 54]
df_holidays_prep_national_filtered54 = df_holidays_prep_national_filtered54[df_holidays_prep_national_filtered54["date"].dt.year == 2012]
df_holidays_prep_national_filtered54.sort_values(by="date")

print('In the original holiday dataframe for store 54 we found 2 duplicate holidays in 2012, on the 24th and the 31st of December')
df_holidays_prep_national_filtered54.head(50)

In the original holiday dataframe for store 54 we found 2 duplicate holidays in 2012, on the 24th and the 31st of December


Unnamed: 0,date,type,locale,locale_name,store_nbr,city,state
53,2012-08-10,Holiday,National,Ecuador,54,El Carmen,Manabi
107,2012-10-09,Holiday,National,Ecuador,54,El Carmen,Manabi
161,2012-10-12,Transfer,National,Ecuador,54,El Carmen,Manabi
215,2012-11-02,Holiday,National,Ecuador,54,El Carmen,Manabi
269,2012-11-03,Holiday,National,Ecuador,54,El Carmen,Manabi
323,2012-12-21,Additional,National,Ecuador,54,El Carmen,Manabi
377,2012-12-22,Additional,National,Ecuador,54,El Carmen,Manabi
431,2012-12-23,Additional,National,Ecuador,54,El Carmen,Manabi
485,2012-12-24,Bridge,National,Ecuador,54,El Carmen,Manabi
539,2012-12-24,Additional,National,Ecuador,54,El Carmen,Manabi


In [38]:
# Sebastiaan code # This should be fixed in our new dataframe, let's see
df_holidays_merged_grouped54 = holidays_prep_merged_grouped(df_holidays, df_stores)
df_holidays_merged_grouped54 = df_holidays_merged_grouped54[df_holidays_merged_grouped54["store_nbr"] == 54]
df_holidays_merged_grouped54 = df_holidays_merged_grouped54[df_holidays_merged_grouped54["date"].dt.year == 2012]

print('In our new holiday dataframe for store 54 we nicely removed the duplicate rows but do account for multiple holidays on the same day by making use of the count columns')
df_holidays_merged_grouped54.head(50)

In the orignal unioned holiday dataframe, df_holidays_merged we found (including duplicates) 8276 rows
In our new adjusted dataframe we have 8091 rows
Thus, we have removed 185 rows
In our new holiday dataframe for store 54 we nicely removed the duplicate rows but do account for multiple holidays on the same day by making use of the count columns


  df_holidays_merged_grouped = df_holidays_merged.pivot_table(


Unnamed: 0,date,store_nbr,countoflocalholidays,countofnationalholidays,countofregionalholidays
19,2012-07-03,54,1,0,0
75,2012-08-10,54,0,1,0
134,2012-10-09,54,0,1,0
188,2012-10-12,54,0,1,0
242,2012-11-02,54,0,1,0
296,2012-11-03,54,0,1,0
396,2012-12-21,54,0,1,0
450,2012-12-22,54,0,1,0
504,2012-12-23,54,0,1,0
558,2012-12-24,54,0,2,0


In [39]:
# Sebastiaan code - Altered version 23082024
#  Fill newly created NaN columns, due to holiday join, with 'no' on thates where there are now holidays
def holidays_fill_zero_normal(df,df_holidays_merged_grouped):
    """
    Fills the NaN values with 0 for all columns with type "int32" in the combined dataframe.
    It will only fill the columns that are in the original dataframe and not in the holiday dataframe.
    """ 
    columns_to_fill = df_holidays_merged_grouped.columns.intersection(df.columns)
    int32_columns_to_fill = [col for col in columns_to_fill if df_holidays_merged_grouped[col].dtype == 'int32']
    df[int32_columns_to_fill] = df[int32_columns_to_fill].fillna(0).astype('int32')
    
    return df

In [40]:
def filling_dates_cartesian(df_sales):

    # Sales prep
    df_sales_cleaned = sales_cleaned(df_sales)

    # Create new df to include all daily dates in the range, filling missing dates with NaNs
    df = df_sales_cleaned.copy()

    # Print first and last date of df
    print(f'First date in df: {df["date"].min()}')
    print(f'Last date in df:  {df["date"].max()}')
    print("-" * 71)

    # Calculate memory size and shape size of start df
    df_mem_start = sys.getsizeof(df)
    df_shape_start = df.shape[0] / 1e6
    print(
        f"Start size of df:     {round(df_mem_start/1024/1024/1024, 2)} GB and start observations:     {round(df_shape_start, 1)} million."
    )

    # Create a complete date range for the entire dataset, it's a datetimeindex object 
    all_dates = pd.date_range(start=df["date"].min(), end=df["date"].max(), freq="D")

    # Create a multi-index from all possible combinations of 'item_nbr' and 'date'
    all_combinations = pd.MultiIndex.from_product(
        [df["store_nbr"].unique(), df["item_nbr"].unique(), all_dates],
        names=["store_nbr", "item_nbr", "date"],
    )

    print(
        f'The multi-index (all_combinations of store, date and item for the minimum and maximum dates found result in {round(all_combinations.shape[0]/1e6,1)} million rows, this is the amount of rows we expect in the final dataframe.'
    )

    # -----------------------------------------------------------------------------------------------------
    # Check for duplicates in the combination of 'store_nbr', 'item_nbr', and 'date'
    # This method is based on boolean indexing, when there's a true value for the duplicated method, it will return those rows to the duplicate_rows variable
    duplicate_rows = df[
        df.duplicated(subset=["store_nbr", "item_nbr", "date"], keep=False)
    ]
    if not duplicate_rows.empty:
        print(
            "Warning: Duplicate entries found in the combination of 'store_nbr', 'item_nbr', and 'date'."
        )
        print(f"Total dublicate rows {duplicate_rows.shape[0]}")
        print("-" * 71)

    # -----------------------------------------------------------------------------------------------------

    # Reindex the original DataFrame to include all combinations of 'store_nbr', 'item_nbr', and 'date'
    df_reindexed = df.set_index(["store_nbr", "item_nbr", "date"]).reindex(
        all_combinations
    )

    # Reset the index to turn the multi-index back into regular columns
    df_sales_cartesian = df_reindexed.reset_index()

    # Calculate memory size and shape size of final end df
    df_mem_end = sys.getsizeof(df_sales_cartesian)
    df_mem_change_perc = ((df_mem_end - df_mem_start) / df_mem_start) * 100
    df_mem_change = df_mem_end - df_mem_start

    df_shape_end = df_sales_cartesian.shape[0] / 1e6
    df_shape_change_perc = ((df_shape_end - df_shape_start) / df_shape_start) * 100
    df_shape_change = df_shape_end - df_shape_start

    print(
        f"Final size of df:     {round(df_mem_end/1024/1024/1024, 2)} GB and end observations:       {round(df_shape_end, 1)} million."
    )
    print(
        f"Change in size of df: {round(df_mem_change_perc, 2)} % and observations:           {round(df_shape_change_perc, 2)}     %."
    )
    print(
        f"Increased size of df: {round(df_mem_change/1024/1024/1024, 2)} GB and increased observations: {round(df_shape_change, 1)} million."
    )

    print("-" * 71)

    return df_sales_cartesian

3.3 Merge datasets

In [41]:
# Merge datasets
def merge_datasets(df_sales, df_items, df_stores, df_holidays):
    
    # Sales prep   
    print("Step 1 - Cleaning sales data and making a cartesian product of the sales data and the minimum and maximum dates found in the data.")
    df_sales_cartesian = filling_dates_cartesian(df_sales)
    print("-" * 71)

    # Holidays prep
    print("Step 2 - Cleaning holiday data and counting the number of holidays per date per store for each type of holiday (national, regional, local).")
    df_holidays_merged_grouped = holidays_prep_merged_grouped(df_holidays, df_stores)
    print("-" * 71)

    # Stores prep
    print("Step 3 - Cleaning stores data (read: dropping unnecessary columns and renaming columns for clarity).")    
    df_stores = stores_cleaned_renamed(df_stores)
    print("-" * 71)

    # Items prep
    print("Step 4 - Cleaning items data (read: dropping unnecessary columns and renaming columns for clarity).")  
    df_items = items_cleaned_renamed(df_items)
    print("-" * 71)

    # Holidays merge on sales
    print("Step 5 - Adding holiday data to our cartesian product of sales data (with store, item and date combinations) and cleaning up null values for count of holiday columns.")  
    df_merged = df_sales_cartesian.merge(df_holidays_merged_grouped, on=["date", "store_nbr"], how="left")
    df_merged = holidays_fill_zero_normal(df_merged,df_holidays_merged_grouped)
    print("-" * 71)

    # Stores merged with sales+holidays
    print("Step 6 - Adding holiday data to our cartesian product of sales data (with store, item and date combinations) and cleaning up null values for count of holiday columns.")      
    df_merged = df_merged.merge(df_stores, on="store_nbr", how="left")
    print("-" * 71)

    
    # -------------------------------------------------------------------
    # To-do: Check if problem is in dtype of item_nbr --> in df_merged or in df_items -> 
    # Sebastiaan 26082024 -> Remove in next version update, put the transition to int32 in the cleaning function for sales and items and remove it here.

    # print(df_merged["item_nbr"].dtype)
    # print(df_items["item_nbr"].dtype)

    # # Change the dtype for item_nbr from uint32 to int32
    df_merged["item_nbr"] = df_merged["item_nbr"].astype(int)
    df_items["item_nbr"] = df_items["item_nbr"].astype(int)
    # print("-" * 30)
    # print(df_merged["item_nbr"].dtype)
    # print(df_items["item_nbr"].dtype)
    # print("-" * 30)

    # -------------------------------------------------------------------

    # Items merged with sales+holidays+stores
    print("Step 7 - Adding items data to our cartesian product of sales data (with store, item and date combinations) and cleaning up null values for count of holiday columns. Remember, in our last step we added a lot of store information as well")   
    df_final = df_merged.merge(df_items, on="item_nbr", how="left")
    print("-" * 71)

    # Print some referential integrity checks to make sure we have the same amount of rows
    print(f'The amount of rows in the sales dataframe was {df_sales.shape[0]}')
    print(f'After making a cartesian product with date, store and item we had a total of {df_sales_cartesian.shape[0]} rows')
    print(f'After mergin with the holidays, stores, and items we have {df_final.shape[0]} rows')
    print(f'The difference between the incoming and outgoing data from this function is {df_sales.shape[0] - df_final.shape[0]} rows')
    print(f'If we compare the outgoing dataframe called "df_final" with the cartesian product of sales data and dates we see that the difference is {df_sales_cartesian.shape[0] - df_final.shape[0]} rows')
    print(f'If the difference is 0, we have a perfect match and we can continue with the next steps.')

    return df_final

In [42]:
df_final = merge_datasets(df_sales, df_items, df_stores, df_holidays)  # --> 2.44 GB

Step 1 - Cleaning sales data and making a cartesian product of the sales data and the minimum and maximum dates found in the data.
First date in df: 2013-01-02 00:00:00
Last date in df:  2017-08-15 00:00:00
-----------------------------------------------------------------------
Start size of df:     2.35 GB and start observations:     93.3 million.
The multi-index (all_combinations of store, date and item for the minimum and maximum dates found result in 245.1 million rows, this is the amount of rows we expect in the final dataframe.
Final size of df:     4.34 GB and end observations:       245.1 million.
Change in size of df: 84.74 % and observations:           162.52     %.
Increased size of df: 1.99 GB and increased observations: 151.7 million.
-----------------------------------------------------------------------
-----------------------------------------------------------------------
Step 2 - Cleaning holiday data and counting the number of holidays per date per store for each typ

  df_holidays_merged_grouped = df_holidays_merged.pivot_table(


-----------------------------------------------------------------------
Step 6 - Adding holiday data to our cartesian product of sales data (with store, item and date combinations) and cleaning up null values for count of holiday columns.
-----------------------------------------------------------------------
Step 7 - Adding items data to our cartesian product of sales data (with store, item and date combinations) and cleaning up null values for count of holiday columns. Remember, in our last step we added a lot of store information as well
-----------------------------------------------------------------------
The amount of rows in the sales dataframe was 93346220
After making a cartesian product with date, store and item we had a total of 245053620 rows
After mergin with the holidays, stores, and items we have 245053620 rows
The difference between the incoming and outgoing data from this function is -151707400 rows
If we compare the outgoing dataframe called "df_final" with the carte

In [43]:
# Sebastiaan code -
# What about the "onpromotion" column, seems that it has a lot of NaN values. Are these quality issues or is just that there's no promotion. 
# This issue didn't arrive after merging, it was there from the beginning (in the df_sales dataframe).
# You would expect that if there's no promotion going on the value to be "False"
df_sales1 = sales_cleaned(df_sales)

df_sales1_unique = df_sales1['onpromotion'].unique()

# 4 Data Manipulation

4.X Brainstorm ideas for imputing missing values


4.1.1 Create a date for all missing values/dates and keep the value of sales as NA

Action: Create all daily dates in the date range. Date range starts from first available date in df to last available date in df. Then filling missing dates with NaNs for per unique item per unique store


Troubleshooting of filling_dates_NaN function

In [44]:
df_final.info()
# Count nulls per column
null_counts = df_final.isnull().sum()

# Print results
for column, count in null_counts.items():
    print(f"Column '{column}' has {count} null values.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245053620 entries, 0 to 245053619
Data columns (total 14 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   store_nbr                uint8         
 1   item_nbr                 int32         
 2   date                     datetime64[ns]
 3   unit_sales               float32       
 4   onpromotion              boolean       
 5   countoflocalholidays     int32         
 6   countofnationalholidays  int32         
 7   countofregionalholidays  int32         
 8   store_city               category      
 9   store_type               category      
 10  store_cluster            uint8         
 11  item_family              category      
 12  item_class               uint16        
 13  perishable               uint8         
dtypes: boolean(1), category(3), datetime64[ns](1), float32(1), int32(4), uint16(1), uint8(3)
memory usage: 8.7 GB
Column 'store_nbr' has 0 null values.
Column 'it

In [45]:
df_final_store_54_item_129296 = df_final[(df_final["store_nbr"] == 54) & (df_final["item_nbr"] == 129296)]

df_final_store_54_item_129296.tail(50)

Unnamed: 0,store_nbr,item_nbr,date,unit_sales,onpromotion,countoflocalholidays,countofnationalholidays,countofregionalholidays,store_city,store_type,store_cluster,item_family,item_class,perishable
238329188,54,129296,2017-06-27,,,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329189,54,129296,2017-06-28,1.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329190,54,129296,2017-06-29,3.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329191,54,129296,2017-06-30,1.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329192,54,129296,2017-07-01,3.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329193,54,129296,2017-07-02,3.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329194,54,129296,2017-07-03,2.0,True,1,0,0,El Carmen,C,3,GROCERY I,1032,0
238329195,54,129296,2017-07-04,4.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329196,54,129296,2017-07-05,1.0,True,0,0,0,El Carmen,C,3,GROCERY I,1032,0
238329197,54,129296,2017-07-06,1.0,False,0,0,0,El Carmen,C,3,GROCERY I,1032,0


In [46]:
# This is very memory costly!!! --> will result in huge df
df_merged_full_nan = filling_dates_NaN(df_merged)

NameError: name 'filling_dates_NaN' is not defined

4.1.2 Fill newly created dates for non-sales columns using forward fill and backward fill --> items, stores, holidays columns

In [None]:
# Fill missing values for non-sales columns using forward fill and backward fill --> items, stores, holidays
non_sales_columns = [
    "store_city",
    "store_type",
    "store_cluster",
    "item_family",
    "item_class",
    "perishable",
]

# To-do: test more in individual item level how this works
df_merged_full_nan[non_sales_columns] = df_merged_full_nan.groupby(
    ["item_nbr", "store_nbr"]
)[non_sales_columns].transform(lambda group: group.ffill().bfill())

In [None]:
df_final_store_54_item_129296 = df_merged_full_nan[(df_merged_full_nan["store_nbr"] == 54) & (df_merged_full_nan["item_nbr"] == 129296)]

df_final_store_54_item_129296.tail(50)

Unnamed: 0,store_nbr,item_nbr,date,unit_sales,onpromotion,countoflocalholidays,countofnationalholidays,countofregionalholidays,store_city,store_type,store_cluster,item_family,item_class,perishable
238329188,54,129296,2017-06-27,,,,,,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329189,54,129296,2017-06-28,1.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329190,54,129296,2017-06-29,3.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329191,54,129296,2017-06-30,1.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329192,54,129296,2017-07-01,3.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329193,54,129296,2017-07-02,3.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329194,54,129296,2017-07-03,2.0,True,1.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329195,54,129296,2017-07-04,4.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329196,54,129296,2017-07-05,1.0,True,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0
238329197,54,129296,2017-07-06,1.0,False,0.0,0.0,0.0,El Carmen,C,3.0,GROCERY I,1032.0,0.0


4.2: Detect negative values

•	Action: Delete unit_sales if values are lower than zero --> N/A

In [None]:
def negative_sales_cleaned(df):

    # Check the number of negative values before replacement
    before_replacement = (df["unit_sales"] < 0).sum()
    print(f"Number of negative values before replacement: {before_replacement}")

    # Create a boolean mask for the negative sales rows to create a 'boolean flag-list' containing all negative rows, used to filter full df_sales df
    negative_sales_mask = df["unit_sales"] < 0

    # Use the mask to update the flagged 'unit_sales' column in the original DataFrame
    df.loc[negative_sales_mask, "unit_sales"] = df.loc[
        negative_sales_mask, "unit_sales"
    ].where(df.loc[negative_sales_mask, "unit_sales"] >= 0, np.nan)

    # Check the number of negative values after replacement
    after_replacement = (df["unit_sales"] < 0).sum()
    print(f"Number of negative values after replacement: {after_replacement}")

    return df

In [None]:
# Full merged df_merged_full

# df_sales_nan = negative_sales_cleaned(df_merged_full)

In [None]:
# Check function with df_sales
df_sales_nan = negative_sales_cleaned(df_sales)

df_sales_nan_check = df_sales_nan[df_sales_nan["unit_sales"].isna()].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

df_sales_nan_check.head(30)

Number of negative values before replacement: 5806
Number of negative values after replacement: 0


Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
10655,10655,10,456875,,,2,2013,1,2013-01-02
46867,46867,5,559044,,,3,2013,1,2013-01-03
50970,50970,9,365138,,,3,2013,1,2013-01-03
71807,71807,41,812716,,,3,2013,1,2013-01-03
71992,71992,41,1004551,,,3,2013,1,2013-01-03
75255,75255,46,208530,,,3,2013,1,2013-01-03
91163,91163,9,457574,,,4,2013,1,2013-01-04
106658,106658,34,586824,,,4,2013,1,2013-01-04
111811,111811,41,956012,,,4,2013,1,2013-01-04
111813,111813,41,956014,,,4,2013,1,2013-01-04


4.3 Define new, old and closed stores

•	Condition: sales for all items a given store and date are NA

•	Action: Impute with 0


In [None]:
# Sum/Agg all sales group  by store, date
# --> Sum_sales > 0 then store_opened
# else --> closed --> inputed with 0


# TO-do: discuss about closed_store --> inpute with 0 or N/A?

4.4 New product

•	Before the very first sale of an item, all observations are kept as NA

•	After the very first sale of an item, we go to step 3:  


In [None]:
# Sum/Agg all sales group  by item, date
# --> Sum_sales > 0 then first_sales_day of product
# else <first_sales_day of product --> delete unit_sales --> N/A

4.8  Stockout on store level

•      Perishable good: when there are missing values for two consecutive days for a given item per individual store 

•      Nonperishable goods: when there are missing values for 7 consecutive days for a given item and per individual store

•      Action: Impute with algorithm 


In [None]:
#perishable good
if ['perishable'] == 1 and item_missing_count > 2 #-->  inpute with 0?

if ['perishable'] == 1 and item_missing_count <= 2 #-->  inpute with mean? or intrepolate?




    
#non-perishable good  
if ['perishable'] == 0 and and item_missing_count > 7 #-->  inpute with 0?

if ['perishable'] == 1 and item_missing_count <= 7 #-->  inpute with mean? or intrepolate?

SyntaxError: expected ':' (3791032638.py, line 2)

In [None]:
# Interpolate between missing datapoints --> sales

fillna(method="mean")

df["column_name"].interpolate(method="linear", inplace=True)

df["column_name"].interpolate(method="time", inplace=True)

df["column_name"].interpolate(method="polynomial", order=2, inplace=True)

# Interpolate missing values for the 'unit_sales' column
df["unit_sales"] = df.groupby(["store_nbr", "item_nbr"])["unit_sales"].apply(
    lambda group: group.interpolate(method="linear")
)

4.4 Missing sales data: Zero sales

•	All other cases

•	Action: Impute with 0

4.X Negative values imputing to 0

4.5 Promotional Data 

•   All missing values are interpreted a day with no promotion

•   Action: Inpute onpromotion N/A with False

In [None]:
# Fill missing N/A values in boolean columns with False
def sales_fill_onpromotion(df):

    df["onpromotion"] = df["onpromotion"].fillna(False)

    return df


# To-do: when perform this function? Before filling_dates_NaN?

In [None]:
# df_merged_test = sales_fill_onpromotion(df_merged)

# df_merged_test.head(10)

# 5 Feature construction

5.X Extracting datetime features

In [None]:
def extract_datetime_features(df):
    """
    Extracting datetime features
    year, month, day of month, weekday (1-7), week number-year, week_year_date
    """
    df = df.copy()

    # Ensure the date column is sorted
    df = df.sort_values("date")

    # df["year"] = df["date"].dt.year
    # df["month"] = df["date"].dt.month
    # df["day"] = df["date"].dt.day

    # Adjusting weekday to start from 1 (Monday) to 7 (Sunday)
    df["weekday"] = df["date"].dt.dayofweek + 1

    # Adding week number-year feature
    df["week_number"] = df["date"].dt.isocalendar().week
    df["week_year"] = df["week_number"].astype(str).str.zfill(2) + df["year"].astype(
        str
    )

    # Convert week_year to datetime with monday as startdate of week
    df["week_year_date"] = pd.to_datetime(
        df["year"].astype(str) + df["week_number"].astype(str).str.zfill(2) + "1",
        format="%Y%W%w",
    )

    # Adding trend feature: number of weeks since the start of the dataset
    start_date = df["date"].min()
    df["weeks_since_start"] = ((df["date"] - start_date).dt.days / 7).astype(int)

    return df

In [None]:
def extract_datetime_features(df):
    """
    Extracting datetime features:
    year, month, day of month, weekday (1-7), week number-year, and trend (weeks since start, starting at 1)
    """
    # Ensure the date column is sorted
    df = df.copy().sort_values("date")

    # Use isocalendar for consistent week-based calculations

    iso_calendar = df["date"].dt.isocalendar()

    # Year, Month, Day
    # df["year"] = iso_calendar.year
    # df["month"] = df["date"].dt.month
    # df["day"] = df["date"].dt.day

    # Weekday (1 = Monday, 7 = Sunday)
    df["weekday"] = iso_calendar.day

    # Week number
    df["week_number"] = iso_calendar.week

    # Week-year
    df["week_year"] = df["week_number"].astype(str).str.zfill(2) + df["year"].astype(
        str
    )

    # Convert week_year to datetime with monday as startdate of week
    df["week_year_date"] = pd.to_datetime(
        df["year"].astype(str) + df["week_number"].astype(str).str.zfill(2) + "1",
        format="%Y%W%w",
    )

    # First day of the ISO year containing the start date

    start_date = df["date"].min()
    start_year_first_day = datetime(start_date.year, 1, 1)

    # 'search' for first monday of year
    while start_year_first_day.isocalendar()[1] != 1:

        start_year_first_day = start_year_first_day + pd.Timedelta(days=1)

    ##Itemweek number
    # Weeks since start (aligned with ISO week numbers)
    df["weeks_since_start"] = (
        iso_calendar.week + (iso_calendar.year - start_year_first_day.year) * 52
    )

    # Adjust weeks_since_start to start from 1
    df["weeks_since_start"] = (
        df["weeks_since_start"] - df["weeks_since_start"].min() + 1
    )

    return df

In [None]:
# df = extract_datetime_features(df)

5.X Promotion

The number of days a item was on promotion 

In [None]:
# COPY FROM OLD NOTEBOOK
# TO-DO 1: transform with new df names
# TO-DO 2: total promotion days month --> week


def onpromotion_month_count(df):

    if "onpromotion" in df.columns:

        df["onpromotion_month_count"] = df.groupby(
            ["item_nbr", "store_nbr", "day", "month", "year"]
        )["onpromotion"].transform("sum")

        print(
            "Change: 'onpromotion' column transformed to 'onpromotion_month_count' feature."
        )
    else:

        print("The DataFrame does not contain an 'onpromotion' column.")

    return df

In [None]:
df_0_agg = (
    onpromotion_month_count(df_0)  # Transformation to 'onpromotion_month_count' feature
    .drop(
        columns=["id", "date", "onpromotion"]
    )  # Drop unnecessary columns "id", "date", "onpromotion"
    .groupby(["month", "year", "store_nbr", "item_nbr"])
    .agg({"unit_sales": "sum", "onpromotion_month_count": "sum"})
    .reset_index()
)

5.X Store closed on 25-12 and 01-01 

STore closed in between when inputed with 0

--> can we also use this feature to include the excluded stores with >9 days data, due to closing or later openings?