### Supermarket data science case study - Exploring first data


### >> Importing packages

In [1]:
import pandas as pd
import numpy as np

import sys
import altair as alt

import vegafusion as vf

import sklearn

from datetime import datetime
from sklearn.pipeline import Pipeline, make_pipeline

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

### >> Functions

In [None]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)

    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len - 1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len - 1]

    # Return result.
    return dummy

In [None]:
def f_describe(df_input, n_top=10):

    print("First " + str(n_top) + " rows in de data:")
    display(df_input.head(n_top))

    df_numeric = df_input.select_dtypes(
        include=[
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "int8",
            "int16",
            "int32",
            "int64",
            "float16",
            "float32",
            "float64",
        ]
    )

    if len(df_numeric.columns):
        print("Numerical data:")
        display(df_numeric.describe())

    df_textual = df_input.select_dtypes(include=["category", "object", "bool"])

    if len(df_textual.columns):
        print("Textual data:")
        display(df_textual.describe())

    v_na = [
        col
        + " ("
        + str(df[col].isna().sum())
        + ", "
        + str(round(100 * df[col].isna().sum() / df.shape[0], 1))
        + "%)"
        for col in df.columns
        if df[col].isna().sum() > 0
    ]

    if len(v_na) > 0:
        print("Features and their number of missing values:")
        display(f_concat(v_na))

In [2]:
def extract_datetime_features(df):
    """
    Extracting datetime features
    year, month, day of month, weekday (1-7), week number-year, week_year_date
    """
    df = df.copy()

    # Ensure the date column is sorted
    df = df.sort_values("date")

    # df["year"] = df["date"].dt.year
    # df["month"] = df["date"].dt.month
    # df["day"] = df["date"].dt.day

    # Adjusting weekday to start from 1 (Monday) to 7 (Sunday)
    df["weekday"] = df["date"].dt.dayofweek + 1

    # Adding week number feature
    df["week_number"] = df["date"].dt.isocalendar().week

    # Adding week number-year feature
    df["week_year"] = df["week_number"].astype(str).str.zfill(2) + df["year"].astype(
        str
    )

    # Convert week_year to datetime with monday as startdate of week
    df["week_year_date"] = pd.to_datetime(
        df["year"].astype(str) + df["week_number"].astype(str).str.zfill(2) + "1",
        format="%Y%W%w",
    )

    # Adding trend feature: number of weeks since the start of the dataset
    start_date = df["date"].min()

    df["weeks_since_start"] = ((df["date"] - start_date).dt.days / 7).astype(int) + 1

    return df

In [17]:
def extract_datetime_features(df):
    """
    Extracting datetime features:
    year, month, day of month, weekday (1-7), week number-year, and trend (weeks since start, starting at 1)
    """
    # Ensure the date column is sorted
    df = df.copy().sort_values("date")

    # Use isocalendar for consistent week-based calculations

    iso_calendar = df["date"].dt.isocalendar()

    # Year, Month, Day
    # df["year"] = iso_calendar.year
    # df["month"] = df["date"].dt.month
    # df["day"] = df["date"].dt.day

    # Weekday (1 = Monday, 7 = Sunday)
    df["weekday"] = iso_calendar.day


    # Week number
    df["week_number"] = iso_calendar.week

    # Week-year
    df["week_year"] = df["week_number"].astype(str).str.zfill(2) + df["year"].astype(
        str
    )

    # Convert week_year to datetime with monday as startdate of week
    df["week_year_date"] = pd.to_datetime(
        df["year"].astype(str) + df["week_number"].astype(str).str.zfill(2) + "1",
        format="%Y%W%w",
    )

    # First day of the ISO year containing the start date

    start_date = df["date"].min()
    start_year_first_day = datetime(start_date.year, 1, 1)

    # 'search' for first monday of year
    while start_year_first_day.isocalendar()[1] != 1:

        start_year_first_day = start_year_first_day + pd.Timedelta(days=1)

    ##Itemweek number
    # Weeks since start (aligned with ISO week numbers)
    df["weeks_since_start"] = (
        iso_calendar.week + (iso_calendar.year - start_year_first_day.year) * 52
    )

    # Adjust weeks_since_start to start from 1
    df["weeks_since_start"] = (
        df["weeks_since_start"] - df["weeks_since_start"].min() + 1
    )

    return df

In [25]:
df_0 = extract_datetime_features(df_0)

df_0.info()

NameError: name 'extract_datetime_features' is not defined

In [23]:
df_0.sample(10)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
54352484,45870764,49,229368,7.0,False,1,2015,5,2015-05-01
27313701,20792848,34,871511,42.0,,18,2014,3,2014-03-18
55010902,46529182,50,368213,3.0,False,11,2015,5,2015-05-11
33859383,27338530,27,1489660,4.356,False,17,2014,7,2014-07-17
54943542,46461822,50,326951,9.0,False,10,2015,5,2015-05-10
29101777,22580924,27,262991,10.0,False,19,2014,4,2014-04-19
24785462,18264609,6,1160872,13.0,,1,2014,2,2014-02-01
32485491,25964638,51,866927,3.0,False,25,2014,6,2014-06-25
47742689,65368135,46,114790,59.0,False,19,2015,12,2015-12-19
21598019,35414924,26,308924,1.0,False,17,2014,11,2014-11-17


In [None]:
df_0 = f_get_data(0)
df_0 = df_0[df_0["year"].isin([2014, 2015])]

In [None]:
df_0_36_648313 = df_0[
    (df_0["store_nbr"] == 53)
    & (
        df_0["item_nbr"].isin(
            [
                627887,
                759890,
            ]
        )
    )
].drop(
    columns=[
        "weekday",
        "week_number",
        "week_year",
        "weeks_since_start",
        "weeks_since_start_2",
    ]
)
# 53 759890 --> starts 2014-05-29, next date 2024-05-31

In [16]:
df_0_36_759890 = df_0[(df_0["store_nbr"] == 53) & (df_0["item_nbr"] == 759890)]

In [45]:
df_0_36_3items_3stores = df_0[
    (df_0["store_nbr"].isin([53, 6]))
    & (df_0["item_nbr"].isin([627887, 759890, 1160872]))
].drop(columns=["id"])

In [49]:
df_0_36_3items_3stores.tail(40).sort_values(by=["date", "store_nbr", "item_nbr"])

Unnamed: 0,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
65526694,6,627887,1.0,False,20,2015,9,2015-09-20
65526804,6,759890,2.0,False,20,2015,9,2015-09-20
65527371,6,1160872,12.0,False,20,2015,9,2015-09-20
65604559,53,627887,1.0,False,20,2015,9,2015-09-20
65604651,53,759890,2.0,False,20,2015,9,2015-09-20
65605110,53,1160872,3.0,False,20,2015,9,2015-09-20
65617336,6,759890,1.0,False,21,2015,9,2015-09-21
65617859,6,1160872,12.0,False,21,2015,9,2015-09-21
65690752,53,1160872,1.0,False,21,2015,9,2015-09-21
65702624,6,759890,4.0,False,22,2015,9,2015-09-22


In [51]:
df_0_36_3items_3stores = filling_dates_NaN(df_0_36_3items_3stores)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [43]:
df_final_check = df_0_36_3items_3stores[
    (df_0_36_3items_3stores["date"] > "2015-09-12")
].sort_values(by=["date", "store_nbr", "item_nbr"])

df_final_check.head(60)

Unnamed: 0,item_nbr,date,store_nbr,unit_sales,onpromotion,day,year,month
620,627887,2015-09-13,53.0,1.0,False,13.0,2015.0,9.0
1368,759890,2015-09-13,,,,,,
2116,1160872,2015-09-13,53.0,9.0,False,13.0,2015.0,9.0
621,627887,2015-09-14,,,,,,
1369,759890,2015-09-14,,,,,,
2117,1160872,2015-09-14,53.0,7.0,False,14.0,2015.0,9.0
622,627887,2015-09-15,,,,,,
1370,759890,2015-09-15,53.0,1.0,False,15.0,2015.0,9.0
2118,1160872,2015-09-15,53.0,9.0,False,15.0,2015.0,9.0
623,627887,2015-09-16,,,,,,


In [17]:
df_0_36_759890.head(10)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
18327131,32144036,53,759890,2.0,False,1,2014,10,2014-10-01
18397946,32214851,53,759890,2.0,False,2,2014,10,2014-10-02
18468861,32285766,53,759890,1.0,False,3,2014,10,2014-10-03
18543842,32360747,53,759890,1.0,False,4,2014,10,2014-10-04
18687325,32504230,53,759890,1.0,False,6,2014,10,2014-10-06
18755971,32572876,53,759890,1.0,False,7,2014,10,2014-10-07
18962864,32779769,53,759890,1.0,False,10,2014,10,2014-10-10
19107950,32924855,53,759890,1.0,False,12,2014,10,2014-10-12
19244557,33061462,53,759890,2.0,False,14,2014,10,2014-10-14
19381380,33198285,53,759890,1.0,False,16,2014,10,2014-10-16


In [18]:
df_0_36_648313.tail(10)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date
64822633,56340913,53,759890,2.0,False,11,2015,9,2015-09-11
64913288,56431568,53,759890,3.0,False,12,2015,9,2015-09-12
65171744,56690024,53,759890,1.0,False,15,2015,9,2015-09-15
65256770,56775050,53,759890,1.0,False,16,2015,9,2015-09-16
65339216,56857496,53,759890,1.0,False,17,2015,9,2015-09-17
65422664,56940944,53,759890,1.0,False,18,2015,9,2015-09-18
65604651,57122931,53,759890,2.0,False,20,2015,9,2015-09-20
65938725,57457005,53,759890,1.0,False,24,2015,9,2015-09-24
66022061,57540341,53,759890,2.0,False,25,2015,9,2015-09-25
66456745,57975025,53,759890,2.0,False,30,2015,9,2015-09-30


In [50]:
def filling_dates_NaN(df):
    # Create new df to include all daily dates in the range, filling missing dates with NaNs
    df = df.copy()

    # Create a complete date range for the entire dataset
    all_dates = pd.date_range(start="2014-01-01", end="2016-01-18", freq="D")

    # Create a multi-index from all possible combinations of 'item_nbr' and 'date'
    all_combinations = pd.MultiIndex.from_product(
        df["store_nbr"].unique()[df["item_nbr"].unique(), all_dates],
        names=["item_nbr", "date"],
    )

    # Reindex the original DataFrame to include all combinations of 'item_nbr' and 'date'
    df_reindexed = df.set_index(["store_nbr", "item_nbr", "date"]).reindex(
        all_combinations
    )

    # Reset the index to turn the multi-index back into regular columns
    df_final = df_reindexed.reset_index()

    df_final.info()

    return df_final

In [31]:
df_final_check = df_final[(df_final["date"] > "2014-05-01")]

df_final_check.head(60)

Unnamed: 0,item_nbr,date,id,store_nbr,unit_sales,onpromotion,day,year,month
121,759890,2014-05-02,,,,,,,
122,759890,2014-05-03,,,,,,,
123,759890,2014-05-04,,,,,,,
124,759890,2014-05-05,,,,,,,
125,759890,2014-05-06,,,,,,,
126,759890,2014-05-07,,,,,,,
127,759890,2014-05-08,,,,,,,
128,759890,2014-05-09,,,,,,,
129,759890,2014-05-10,,,,,,,
130,759890,2014-05-11,,,,,,,


In [12]:
df_final.head()

Unnamed: 0,item_nbr,date,id,store_nbr,unit_sales,onpromotion,day,year,month
0,759890,2014-01-01,,,,,,,
1,759890,2014-01-02,,,,,,,
2,759890,2014-01-03,,,,,,,
3,759890,2014-01-04,,,,,,,
4,759890,2014-01-05,,,,,,,


In [13]:
df_final.tail()

Unnamed: 0,item_nbr,date,id,store_nbr,unit_sales,onpromotion,day,year,month
743,759890,2016-01-14,,,,,,,
744,759890,2016-01-15,,,,,,,
745,759890,2016-01-16,,,,,,,
746,759890,2016-01-17,,,,,,,
747,759890,2016-01-18,,,,,,,


In [14]:
df_final.sample(30)

Unnamed: 0,item_nbr,date,id,store_nbr,unit_sales,onpromotion,day,year,month
323,759890,2014-11-20,,,,,,,
603,759890,2015-08-27,,,,,,,
636,759890,2015-09-29,,,,,,,
198,759890,2014-07-18,27436841.0,53.0,2.0,False,18.0,2014.0,7.0
453,759890,2015-03-30,43889452.0,53.0,3.0,False,30.0,2015.0,3.0
249,759890,2014-09-07,30480636.0,53.0,2.0,False,7.0,2014.0,9.0
460,759890,2015-04-06,,,,,,,
722,759890,2015-12-24,65878007.0,53.0,3.0,False,24.0,2015.0,12.0
512,759890,2015-05-28,,,,,,,
258,759890,2014-09-16,,,,,,,


In [None]:
# -----------------------------------------------------------

df = df_0_36_648313.copy()

# Create a complete date range for the entire dataset
all_dates = pd.date_range(start="2014-01-01", end="2016-01-18", freq="D")

# Create a multi-index from all possible combinations of 'store_nbr', 'item_nbr', and 'date'
all_combinations = pd.MultiIndex.from_product(
    [df["store_nbr"].unique(), df["item_nbr"].unique(), all_dates],
    names=["store_nbr", "item_nbr", "date"],
)

# Reindex the original DataFrame to include all combinations of 'store_nbr', 'item_nbr', and 'date'
df_reindexed = df.set_index(["store_nbr", "item_nbr", "date"]).reindex(all_combinations)

# Reset the index to turn the multi-index back into regular columns
df_final = df_reindexed.reset_index()

In [33]:
df_final.head(10)

Unnamed: 0,store_nbr,item_nbr,date,id,unit_sales,onpromotion,day,year,month
0,53,759890,2014-05-29,24574047.0,2.0,False,29.0,2014.0,5.0
1,53,759890,2014-05-30,,,,,,
2,53,759890,2014-05-31,24679208.0,3.0,False,31.0,2014.0,5.0
3,53,759890,2014-06-01,24734687.0,4.0,False,1.0,2014.0,6.0
4,53,759890,2014-06-02,24787413.0,2.0,False,2.0,2014.0,6.0
5,53,759890,2014-06-03,24838853.0,1.0,False,3.0,2014.0,6.0
6,53,759890,2014-06-04,24889969.0,2.0,False,4.0,2014.0,6.0
7,53,759890,2014-06-05,,,,,,
8,53,759890,2014-06-06,,,,,,
9,53,759890,2014-06-07,25044631.0,3.0,False,7.0,2014.0,6.0


In [None]:
df_final.tail(10)

In [None]:
df_final.sample(10)

In [None]:
# Fill missing values for non-sales columns using forward fill and backward fill
non_sales_columns = [
    "onpromotion",
    "day",
    "year",
    "month",
    "item_family",
    "item_class",
    "store_cluster",
]
df[non_sales_columns] = df.groupby(["store_nbr", "item_nbr"])[non_sales_columns].apply(
    lambda group: group.ffill().bfill()
)

# Interpolate missing values for the 'unit_sales' column
df["unit_sales"] = df.groupby(["store_nbr", "item_nbr"])["unit_sales"].apply(
    lambda group: group.interpolate(method="linear")
)

In [20]:
df_0.sample(30)

Unnamed: 0,id,store_nbr,item_nbr,unit_sales,onpromotion,day,year,month,date,weekday,week_number,week_year,weeks_since_start,weeks_since_start_2
56191838,47710118,30,1447815,3.0,False,29,2015,5,2015-05-29,5,22,222015,74,73
56676480,48194760,41,1457341,5.0,False,4,2015,6,2015-06-04,4,23,232015,75,74
38964817,38964817,8,1146974,4.0,False,8,2015,1,2015-01-08,4,2,22015,54,53
24249948,38066853,53,759890,2.0,False,23,2014,12,2014-12-23,2,52,522014,52,50
44800966,62426412,18,507457,1.0,False,18,2015,11,2015-11-18,3,47,472015,99,98
42565951,60191397,3,979199,4.0,False,25,2015,10,2015-10-25,7,43,432015,95,94
42947126,60572572,16,949298,8.0,False,29,2015,10,2015-10-29,4,44,442015,96,95
39332132,39332132,28,847863,19.0,False,14,2015,1,2015-01-14,3,3,32015,55,54
36443914,29923061,48,812751,15.0,False,30,2014,8,2014-08-30,6,35,352014,35,34
39637700,39637700,40,1152355,4.0,False,19,2015,1,2015-01-19,1,4,42015,56,54


In [None]:
def get_unique(df, column_name):
    """Get the all values and the count for specific column"""
    unique_values_count = df[column_name].nunique()
    unique_values = df[column_name].unique()

    # Convert unique values to a single string to print
    unique_values_str = ", ".join(map(str, unique_values))

    print(f"Number of unique values in {column_name}: {unique_values_count}")
    print("Unique values:")
    print(unique_values_str)

    return

### >> Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [2]:
def standardize_column_names(s):
    return s.replace(" ", "")


def optimize_memory(df):
    # Change: Objects to Categorical.
    object_cols = df.select_dtypes(include="object").columns

    if not object_cols.empty:
        print("Change: Objects to Categorical")
        df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest unsigned integer and floats to smallest.
    for old, new in [("integer", "unsigned"), ("float", "float")]:
        print("Change: " + old + " --> " + new)
        for col in df.select_dtypes(include=old).columns:
            df[col] = pd.to_numeric(df[col], downcast=new)

    return df


def month_year_to_int(df, i):
    # Change: Month and Year to integer.
    if i == 0:
        print("Change: Month and Year to integer")
        df = df.astype({"month": int, "year": int})

    return df

### Transform date-related columns to datetime format.

In [3]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        df["date"] = pd.to_datetime(df[["year", "month", "day"]], unit="us")

        # print(
        #     "Change: Dropped 'year', 'month', 'day' columns and transformed to Datetime64[us] feature"
        # )
        # df.drop(columns=["day", "month", "year"], inplace=True)

    else:
        if "date" in df.columns:
            print("Change: Transformed 'date' column to Datetime Dtype")
            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None)

    return df

### Import data from local PATH
Import data trough pipeline to downcast the data and transformations

In [4]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/raw/"

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### >> Importing data

In [5]:
# Sales History per year
df_0 = f_get_data(0)

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature


In [6]:
df_0 = df_0[df_0["year"].isin([2014, 2015])]

In [None]:
# Holidays
# df_2 = f_get_data(2)

In [None]:
# Items
# df_3 = f_get_data(3)

In [None]:
# Stores
df_5 = f_get_data(5)

### Aggregation of Sales per Month and join of df_0 History sales + df_3 Items + df_5 Stores Datasets

In [None]:
def onpromotion_month_count(df):

    if "onpromotion" in df.columns:

        df["onpromotion_month_count"] = df.groupby(
            ["item_nbr", "store_nbr", "day", "month", "year"]
        )["onpromotion"].transform("sum")

        print(
            "Change: 'onpromotion' column transformed to 'onpromotion_month_count' feature."
        )
    else:

        print("The DataFrame does not contain an 'onpromotion' column.")

    return df

In [None]:
df_0_agg = (
    onpromotion_month_count(df_0)  # Transformation to 'onpromotion_month_count' feature
    .drop(
        columns=["id", "date", "onpromotion"]
    )  # Drop unnecessary columns "id", "date", "onpromotion"
    .groupby(["month", "year", "store_nbr", "item_nbr"])
    .agg({"unit_sales": "sum", "onpromotion_month_count": "sum"})
    .reset_index()
)

In [None]:
df_0_agg.head(20)

In [None]:
df_0_agg.sample(20)

In [None]:
# Merge of df_0_agg with df_3 and df_5

df_03_join = df_0_agg.merge(df_3, on="item_nbr", how="left")

df_035_join = df_03_join.merge(df_5, on="store_nbr", how="left")

In [None]:
print("The transformed and joined data\n")
print(
    f"-> Contains:                {round(df_035_join.shape[0]/1e6, 1)} million observations and {df_035_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_035_join.shape[0]} observations and {df_035_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_035_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_035_join)/1024/1024/1024, 2)} GB."
)

In [None]:
# Sample of final merged dataframe 'df_02_join'

df_035_join.sample(10)

### df_0 History sales join with df_2 Holidays 

In [None]:
print(
    f"-> Contains:                {round(df_2.shape[0]/1e6, 1)} million observations and {df_2.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_2.shape[0]} observations and {df_2.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_2.columns)}.\n")
print(f"-> Has optimized size of    {round(sys.getsizeof(df_2)/1024/1024/1024, 2)} GB.")

df_2.sample(10)

In [None]:
get_unique(df_2, "locale_name")

In [None]:
# drop unnecessary columns 'description'
df_2_cleaned = df_2.drop(columns=["description"])

df_2_cleaned.sample(10)

In [None]:
# Added feature to indicate whether a date is a holiday

df_2_cleaned["is_holiday"] = df_2_cleaned["type"] == "Holiday"
df_2_cleaned.loc[df_2_cleaned["transferred"], "is_holiday"] = (
    False  # Handle transferred holidays
)

In [None]:
# drop unnecessary columns 'id' to save memory
df_0_cleaned = df_0.drop(
    columns=[
        "id",
        "day",
        "year",
        "month",
        "onpromotion",
        # , "onpromotion_month_count"
    ]
)

In [None]:
# Merge of df_0_cleaned with df_2

df_02_join = df_0_cleaned.merge(df_2_cleaned, on="date", '',how="left")

In [None]:
print(
    f"-> Contains:                {round(df_02_join.shape[0]/1e6, 1)} million observations and {df_02_join.shape[1]} features.\n"
)
print(
    f"-> Contains:                {df_02_join.shape[0]} observations and {df_02_join.shape[1]} features.\n"
)
print(f"-> Have feature names:      {f_concat(df_02_join.columns)}.\n")
print(
    f"-> Has optimized size of    {round(sys.getsizeof(df_02_join)/1024/1024/1024, 2)} GB."
)

In [None]:
# Sample of final merged dataframe 'df_02_join'

df_02_join.sample(10)

In [None]:
df_02_join.sample(10)

df_02_join = pd.get_dummies(
    df_02_join, columns=["type"], dummy_na=False, prefix="holidayType"
)
df_02_join = pd.get_dummies(
    df_02_join, columns=["locale"], dummy_na=False, prefix="holidayLocale"
)
df_02_join = pd.get_dummies(
    df_02_join, columns=["transferred"], dummy_na=False, prefix="holidayTransferred"
)

In [None]:
df_02_cleaned = df_02_join.drop(["transferred"])

In [None]:
# Set the row limit to a higher value
alt.data_transformers.disable_max_rows()

scatter_plot = (
    alt.Chart(df_02_join)
    .mark_point(filled=True, size=60)
    .encode(
        x=alt.X("date:T", title="Date"),
        y=alt.Y("sales:Q", title="Sales"),
        color=alt.condition(
            alt.datum.condition,
            alt.value("blue"),  # Non-holidays sales
            alt.value("red"),  # Holidays sales
        ),
        tooltip=["date", "sales", "condition"],
    )
    .properties(title="Sales by the Date", width=1200, height=600)
    .interactive()
)

scatter_plot.display()

In [None]:
get_unique(df_02_join, "type")

## >>  Merge holidays with stores

In [None]:
# Clean df_2 and df_5 by dropping unneeded columns to same memory and prepare for prossessing

df_2_cleaned = df_2.drop(
    columns=[
        "description"
        # , "transferred"
    ]
)

df_5_cleaned = df_5.drop(columns=["cluster", "type"])

In [None]:
# select locale 'Local' from holiday df and merge with city stores df
df_2_local = df_2_cleaned[df_2_cleaned["locale"] == "Local"]


df_25_local = df_2_local.merge(
    df_5_cleaned, left_on="locale_name", right_on="city", how="left"
)

In [None]:
# df_25_local_45 = df_25_local[df_25_local["store_nbr"] == 43]

# df_25_local_45.sample()

In [None]:
# select locale 'Regional' from holiday df and merge with state stores df
df_2_regional = df_2_cleaned[df_2_cleaned["locale"] == "Regional"]


df_25_regional = df_2_regional.merge(
    df_5_cleaned, left_on="locale_name", right_on="state", how="left"
)

In [None]:
# select locale 'National' from holiday df and merge with national stores df
df_2_national = df_2_cleaned[df_2_cleaned["locale"] == "National"]

# create extra column for merge on "Ecuador"
df_5_cleaned["national_merge"] = "Ecuador"


df_25_national = df_2_national.merge(
    df_5_cleaned,
    left_on="locale_name",
    right_on="national_merge",
    how="left",
)


df_25_national = df_25_national.drop(columns=["national_merge"])

# Combine local, regional and national dataframes into 1 merged dataframe

df_25_union = pd.concat([df_25_local, df_25_regional, df_25_national])

# clean df by dropping locale_name", "city", "state to save memory

df_25_union_cleaned = df_25_union.drop(columns=["locale_name", "city", "state"])

In [None]:
get_unique(df_2_national, "locale_name")

In [None]:
print(
    f"-> Local Contains:                {df_25_local.shape[0]} observations and {df_25_local.shape[1]} features.\n"
)
print(
    f"-> Regional Contains:             {df_25_regional.shape[0]} observations and {df_25_regional.shape[1]} features.\n"
)
print(
    f"-> National Contains:             {df_25_national.shape[0]} observations and {df_25_national.shape[1]} features.\n"
)

In [None]:
# Combine local, regional and national dataframes into 1 merged dataframe

df_25_union = pd.concat([df_25_local, df_25_regional, df_25_national])

In [None]:
# Check and compare individual df's with union dataframe
total_observations = (
    df_25_local.shape[0] + df_25_regional.shape[0] + df_25_national.shape[0]
)


print(f"-> Union should Contain:            {total_observations} observations\n")


print(
    f"-> Union Contains:                  {df_25_union.shape[0]} observations and {df_25_union.shape[1]} features.\n"
)

In [None]:
# df_25_union.sample(20)
# get_unique(df_25_union, "locale_name")

In [None]:
# clean df by dropping locale_name", "city", "state to save memory

df_25_union_cleaned = df_25_union.drop(columns=["locale_name", "city", "state"])

In [None]:
# prepare seperate df to add store type back to full df

df_5_store_type = df_5.rename(columns={"type": "store_type"}).drop(
    columns=["city", "state", "cluster"]
)

In [None]:
def fill_normal_holidays(df):
    cat_col = df.select_dtypes(include=["category"]).columns

    for col in cat_col:
        if "No" not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories("No")

    df = df.fillna("No")

    return df

In [None]:
def merge_df_02(df):

    df = df.drop(columns=columns_to_drop)

    df = df.merge(df_25_union_cleaned, on=["date", "store_nbr"], how="left")

    df = df.merge(df_5_store_type, on=["store_nbr"], how="left")

    df = fill_normal_holidays(df)

    return df


# drop unnecessary columns to save memory in df's

columns_to_drop = [
    "id",
    "day",
    # "year",
    "month",
    "onpromotion",
]

In [None]:
df_02_full = merge_df_02(df_0)

df_02_full.info()

In [None]:
df_02_full.sample(10)

## Try-out code

In [None]:
df_0_2012 = df_0[(df_0["year"] == 2012)]

df_0_2012

In [None]:
def df_year_split_cleaned(df, year):

    df = df[(df["year"] == year)]

    df = df.drop(columns=columns_to_drop)

    return df


# drop unnecessary columns to save memory in df's

columns_to_drop = [
    "id",
    "day",
    "year",
    "month",
    "onpromotion",
]

In [None]:
df_year_split_cleaned(df_0, 2013)

In [None]:
# df_0_2013 = df_0[(df_0["year"] == 2013)]
# df_0_2014 = df_0[(df_0["year"] == 2014)]
# df_0_2015 = df_0[(df_0["year"] == 2015)]
# df_0_2016 = df_0[(df_0["year"] == 2016)]
# df_0_2017 = df_0[(df_0["year"] == 2017)]

In [None]:
# # drop unnecessary columns to save memory in df's

# columns_to_drop = [
#     "id",
#     "day",
#     "year",
#     "month",
#     "onpromotion",
# ]


# def clean_df_0(df):

#     df_cleaned = df.drop(columns=columns_to_drop)

#     return df_cleaned

In [None]:
# df_0_cleaned_2012 = clean_df_0(df_0_2012)
# df_0_cleaned_2013 = clean_df_0(df_0_2013)
# df_0_cleaned_2014 = clean_df_0(df_0_2014)
# df_0_cleaned_2015 = clean_df_0(df_0_2015)
# df_0_cleaned_2016 = clean_df_0(df_0_2016)
# df_0_cleaned_2017 = clean_df_0(df_0_2017)

In [None]:
df_0_cleaned_2013 = df_year_split_cleaned(df_0, 2013)

In [None]:
df_0_cleaned_2013.head(10)

In [None]:
df_25_union_cleaned.head(10)

In [None]:
def merge_df_02(df):

    df = df.drop(columns=columns_to_drop)

    df = df.merge(df_25_union_cleaned, on=["date", "store_nbr"], how="left")

    df = fill_normal_holidays(df)

    return df


# drop unnecessary columns to save memory in df's

columns_to_drop = [
    "id",
    "day",
    # "year",
    "month",
    "onpromotion",
]

df_02_full = merge_df_02(df_0)

df_02_full.info()

In [None]:
df_02_join_2013 = fill_normal_holidays(df_02_join_2013)

df_02_join_2013.tail(10)

In [None]:
df_02_join_2013.info()

In [None]:
df_02_join_2013.sample(10)

In [None]:
get_unique(df_02_join_2013, "type")

In [None]:
get_unique(df_02_join_2013, "locale")

## Plots

In [None]:
# Timeseries of Sales by Locale

aggregated_data = (
    df_02_join_2013.groupby(["date", "locale"])["unit_sales"].sum().reset_index()
)

# Timeseries of Sales by Locale
chart = (
    alt.Chart(aggregated_data)
    .mark_bar()
    .encode(
        x="date:T",
        y="unit_sales:Q",
        color="locale:N",
        tooltip=["date", "unit_sales", "locale"],
    )
    .properties(
        title="Holiday's - Timeseries of Sales by Locale", width=800, height=400
    )
    .interactive()
)

# Display the chart
chart.display()

In [None]:
# Timeseries of Sales by type

aggregated_data = (
    df_02_join_2013.groupby(["date", "type"])["unit_sales"].sum().reset_index()
)

# Timeseries of Sales by Locale
chart = (
    alt.Chart(aggregated_data)
    .mark_bar()
    .encode(
        x="date:T",
        y="unit_sales:Q",
        color="type:N",
        tooltip=["date", "unit_sales", "type"],
    )
    .properties(
        title="Holiday's - Timeseries of Sales by Holiday", width=800, height=400
    )
    .interactive()
)

chart.display()

In [None]:
# Timeseries of Sales by Type and store_nbr

aggregated_data = (
    df_02_join_2013.groupby(["date", "store_nbr", "type"])["unit_sales"]
    .sum()
    .reset_index()
)

# Timeseries of Sales by Locale
chart = (
    alt.Chart(aggregated_data)
    .mark_circle()
    .encode(
        x="date:T",
        y="unit_sales:Q",
        color="type:N",
        tooltip=["date", "unit_sales", "type", "store_nbr"],
    )
    .properties(
        title="Holiday's - Timeseries of Sales by type and store_nbr ",
        width=800,
        height=400,
    )
    .interactive()
)

chart.display()

In [None]:
## Timeseries of Sales by locale and store_nbr

aggregated_data = (
    df_02_join_2013.groupby(["date", "store_nbr", "locale"])["unit_sales"]
    .sum()
    .reset_index()
)

chart = (
    alt.Chart(aggregated_data)
    .mark_bar()
    .encode(
        x="date:T",
        y="unit_sales:Q",
        color=alt.Color(
            "store_nbr:N",
        ),
        tooltip=["date", "unit_sales", "locale", "store_nbr"],
    )
    .properties(
        title="Holiday's - Timeseries of Sales by locale and store_nbr",
        width=800,
        height=400,
    )
    .interactive()
)

chart.display()

# To-do: brainstorm on how to show effect of holidays on indv stores

In [None]:
# Timeseries of Sales by type

aggregated_data = (
    df_02_join_2013.groupby(["date"])
    .agg({"type": "first", "date": "first", "unit_sales": "sum"})
    .reset_index(drop=True)
)

# Base chart
base_aggregated_data = (
    df_02_join_2013.groupby(["date"])["unit_sales"].sum().reset_index()
)

line_chart = (
    alt.Chart(base_aggregated_data)
    .mark_line()
    .encode(
        x="date:T",
        y="unit_sales:Q",
    )
    .properties(width=800, height=400)
)

# Overlay points for types
legend_data = aggregated_data[aggregated_data["type"] != "No"]

point_chart = (
    alt.Chart(legend_data)
    .mark_point()
    .encode(
        x="date:T",
        y="unit_sales:Q",
        shape=alt.Shape("type:N", legend=alt.Legend(title="Type")),
        color="type:N",
        tooltip=["date", "unit_sales", "type"],
    )
)

# Combine charts
combined_chart = line_chart + point_chart

combined_chart = combined_chart.properties(
    title="Timeseries of Total Sales by Type",
).interactive()

combined_chart.display()

In [None]:
df_02_join_2013.sample(20)


def holiday_sales_stores_1item(df, item_nbr):

    df = df[(df["item_nbr"] == item_nbr)]

    return df

In [None]:
df_502331_2013 = holiday_sales_stores_1item(df_02_join_2013, 502331)

df_502331_2013.head(10)

##Note: item 502331 has sales on 01-01-2013 for store 25 with holiday remark, rest stores start sales on 02-01-2013

In [None]:
df_617763_2013 = holiday_sales_stores_1item(df_02_join_2013, 617763)

df_617763_2013.head(10)

df_617763_2013.sample(10)
##Note: item 617763 has no sales on 01-01-2013 for none of the stores, all stores start sales on 02-01-2013.

In [None]:
def holiday_chart_item(df, item_nbr):

    df = df[(df["item_nbr"] == item_nbr)]

    line_chart = (
        alt.Chart(df)
        .mark_line()
        .encode(
            x="date:T",
            y="unit_sales:Q",
        )
        .properties(width=800, height=400)
    )

    # Overlay points for type or locale
    legend_data = df[df["locale"] != "No"]

    point_chart = (
        alt.Chart(legend_data)
        .mark_point()
        .encode(
            x="date:T",
            y="unit_sales:Q",
            shape=alt.Shape("locale:N", legend=alt.Legend(title="Locale")),
            color="locale:N",
            tooltip=["date", "unit_sales", "locale"],
        )
    )

    # Combine charts
    combined_chart = line_chart + point_chart

    combined_chart = combined_chart.properties(
        title="Timeseries of unit Sales for item {item_nbr}",
    ).interactive()

    combined_chart.display()

In [None]:
holiday_chart_item(df_02_join_2013, 617763)

In [None]:
def holiday_chart_item(df, item_nbr):

    # Filter the DataFrame for the specific item number
    df = df[df["item_nbr"] == item_nbr]

    # Define the base chart with the data at the top level
    base = (
        alt.Chart(df)
        .encode(x="date:T", y="unit_sales:Q"),
        .properties(
            width=300,
            height=200
            )
    )

    # Create the line chart
    line_chart = base.mark_line().encode(color="store_nbr:N")

    # Overlay points for type or locale
    point_chart = (
        base.mark_point()
        .encode(
            shape=alt.Shape("locale:N", legend=alt.Legend(title="Locale")),
            color="locale:N",
            tooltip=["date", "unit_sales", "locale", "store_nbr"],
        )
        .transform_filter(
            (
                alt.datum.locale != "No"
            )  # filter out rows where locale is 'No' for the point chart
        )
    )

    # Combine line and point charts
    combined_chart = alt.layer(line_chart, point_chart)

    # Facet by store number
    facet_chart = (
        combined_chart.facet(
            facet="store_nbr:N", columns=3  # Number of columns in the facet grid
        )
        .properties(title=f"Timeseries of Unit Sales for Item {item_nbr} per store")
        .interactive()
    )

    # Display the chart
    facet_chart.display()

In [None]:
holiday_chart_item(df_02_join_2013, 617763)

In [None]:
holiday_chart_item(df_02_join_2013, 502331)

In [None]:
# def merge_df_02(df):

#     df = df.drop(columns=columns_to_drop)

#     df = df.merge(df_25_union_cleaned, on=["date", "store_nbr"], how="left")

#     df = fill_normal_holidays(df)

#     return df


# # drop unnecessary columns to save memory in df's

# columns_to_drop = [
#     "id",
#     "day",
#     # "year",
#     "month",
#     "onpromotion",
# ]

In [None]:
df_02_full = merge_df_02(df_0)


df_02_full.info()

In [None]:
+df_02_full.sample(10)

In [None]:
def holiday_chart_item_years(df, item_nbr):

    # Filter the DataFrame for the specific item number
    df = df[df["item_nbr"] == item_nbr]

    # Define the base chart with the data at the top level
    base = (
        alt.Chart(df)
        .encode(
            x="date:T",
            y="unit_sales:Q",
            color="year:N",
        )
        .properties(width=300, height=200)
    )

    # Create the line chart
    # line_chart = base.mark_line()

    line_chart = base.mark_line().encode(
        color=alt.Color("year:N", legend=alt.Legend(title="Year"))
    )

    # Overlay points for type or locale
    point_chart = (
        base.mark_point()
        .encode(
            shape=alt.Shape("locale:N", legend=alt.Legend(title="Locale")),
            color="locale:N",
            tooltip=["date", "unit_sales", "locale", "store_nbr", "year"],
        )
        .transform_filter(
            (
                alt.datum.locale != "No"
            )  # filter out rows where locale is 'No' for the point chart
        )
    )

    # Combine line and point charts
    combined_chart = alt.layer(line_chart, point_chart)

    # Facet by store number
    facet_chart = (
        combined_chart.facet(
            facet="store_nbr:N",
            columns=3,
            title=f"Timeseries of Unit Sales for Item {item_nbr} per Store per Year",
        )
        .resolve_scale(y="independent")
        .interactive()
    )

    # Display the chart
    facet_chart.display()

In [None]:
get_unique(df_02_full, "item_nbr")

In [None]:
# Plot Timeseries of Unit Sales for Item 502331 per Store per Year
holiday_chart_item_years(df_02_full, 119024)

##note: lot of spikes!

In [None]:
holiday_chart_item_years(df_02_full, 119024)

In [None]:
get_unique(df_02_full, "type")

In [None]:
df_02_full["month"] = df_02_full["date"].dt.month
df_02_full["month"] = pd.to_datetime(df_02_full["date"]).dt.month

df_02_full["type"] = df_02_full["type"].replace("No", np.nan)

# data
df_y_m_ht = (
    df_02_full.groupby(["year", "month", "type"])
    .agg({"unit_sales": "mean"})
    .reset_index()
)
df_y_m_ht["unit_sales"] = round(df_y_m_ht["unit_sales"], 2)

# month mapping
month_mapping = {
    1: "Jan",
    2: "Feb",
    3: "Mar",
    4: "Apr",
    5: "May",
    6: "Jun",
    7: "Jul",
    8: "Aug",
    9: "Sep",
    10: "Oct",
    11: "Nov",
    12: "Dec",
}
df_y_m_ht["month"] = df_y_m_ht["month"].map(month_mapping)

# base chart
base = (
    alt.Chart(df_y_m_ht)
    .mark_point(filled=True)
    .encode(
        x=alt.X(
            "month:O",
            title="",
            sort=list(month_mapping.values()),
            axis=alt.Axis(labelAngle=0),
        ),
        y=alt.Y("type:N", title=""),
        color=alt.Color("unit_sales:Q", scale=alt.Scale(scheme="inferno")),
        size=alt.Size("unit_sales:Q"),
        tooltip=["year", "month", "type", "unit_sales"],
    )
    .properties(
        width=600,
        height=200,
    )
)


# facet chart
chart = base.facet(row="year:N").properties(
    title=alt.TitleParams(
        "Average Sales: Holiday_type Vs Year (Month)",
    ),
)

chart.show()

In [None]:
df_02_full["month"] = df_02_full["date"].dt.month
df_02_full["month"] = pd.to_datetime(df_02_full["date"]).dt.month

df_02_full["type"] = df_02_full["type"].replace("No", np.nan)


# Data aggregation
df_y_m_st = (
    df_02_full.groupby(["year", "month", "store_type"])
    .agg({"unit_sales": "sum"})
    .reset_index()
)

df_y_m_st["unit_sales"] = round(df_y_m_st["unit_sales"], 2)

# Month mapping
month_mapping = {
    1: "Jan",
    2: "Feb",
    3: "Mar",
    4: "Apr",
    5: "May",
    6: "Jun",
    7: "Jul",
    8: "Aug",
    9: "Sep",
    10: "Oct",
    11: "Nov",
    12: "Dec",
}
df_y_m_st["month"] = df_y_m_st["month"].map(month_mapping)

# Base chart
base = (
    alt.Chart(df_y_m_st)
    .mark_point(filled=True)
    .encode(
        x=alt.X(
            "month:O",
            title="Month",
            sort=list(month_mapping.values()),
            axis=alt.Axis(labelAngle=0, grid=True),
        ),
        y=alt.Y("store_type:N", title="Store Type", axis=alt.Axis(grid=True)),
        color=alt.Color("unit_sales:Q", scale=alt.Scale(scheme="inferno")),
        size=alt.Size("unit_sales:Q"),
        tooltip=["year", "month", "store_type", "unit_sales"],
    )
    .properties(
        width=600,
        height=200,
    )
)

# Facet chart
chart = base.facet(row="year:N").properties(
    title=alt.TitleParams(
        "Average Sales: Store Type vs Month  ",
    )
)

# Display chart
chart.show()

In [None]:
df_02_full[df_02_full["store_nbr"] == 45].sample(5)

In [None]:
# df_test = df_02_full[df_02_full["store_nbr"] == 45]

# # aggregated_df = (
# #         df_test.groupby(["date", "type"]).agg({"unit_sales": "sum"}).reset_index()
# #     )

# aggregated_df = (
#     df_test.groupby(["date"])
#     .agg({"type": "first", "date": "first", "unit_sales": "sum"})
#     .reset_index(drop=True)
# )

# aggregated_df.tail(10)

In [None]:
df = df_02_full
store_nbr = 45
x_days = 5

# Select data from specific store
df = df[df["store_nbr"] == store_nbr]

# Aggegrate unit_sales summed up per day
aggregated_df = (
    df.groupby(["date"])
    .agg({"type": "first", "date": "first", "unit_sales": "sum"})
    .reset_index(drop=True)
)

# Filter the data to exclude 'No' in the 'type' column and select the 'date' column
holiday_dates = aggregated_df[aggregated_df["type"] != "No"]["date"]

# Initialize an empty DataFrame to collect the results
result_df = pd.DataFrame()

# Iterate through each holiday date and extract the required range
for holiday_date in holiday_dates:
    date_range = pd.date_range(
        start=holiday_date - pd.Timedelta(days=x_days),
        end=holiday_date + pd.Timedelta(days=x_days),
    )

    temp_df = aggregated_df.loc[
        aggregated_df.index.intersection(date_range)
    ].reset_index()
    temp_df["holiday_date"] = holiday_date
    temp_df["days_from_holiday"] = (temp_df["date"] - holiday_date).dt.days
    result_df = pd.concat([result_df, temp_df])

    # Reset the index for the combined DataFrame
    result_df.reset_index(drop=True, inplace=True)

    # Create the Altair plot
    chart = (
        alt.Chart(result_df)
        .mark_line(point=True)
        .encode(
            x=alt.X("days_from_holiday:Q", title=f"{x_days} Days from Holiday"),
            y=alt.Y("unit_sales:Q", title="Unit Sales"),
            color=alt.Color("type:N", title="Holiday Type"),
            tooltip=[
                "unit_sales",
                "date",
                "type",
                "store_type",
                "holiday_date",
                "days_from_holiday",
            ],
        )
        .properties(
            width=800,
            height=400,
            title=f"Sales for {x_days} Days Before and After Holiday for Store {store_nbr}",
        )
    )

    chart.show()

In [None]:
df = df_02_full
store_nbr = 45
x_days = 5

# Select data from specific store
df = df[df["store_nbr"] == store_nbr]

# Aggegrate unit_sales summed up per day
aggregated_df = (
    df.groupby(["date"])
    .agg({"type": "first", "date": "first", "unit_sales": "sum"})
    .reset_index(drop=True)
)

# Set date column as Index
aggregated_df.set_index("date", inplace=True)

# Filter the data to exclude 'No' in the 'type' column and select the 'date' column
holiday_dates = aggregated_df[aggregated_df["type"] != "No"]["date"]

holiday_dates.head(20)

# Initialize an empty DataFrame to collect the results
result_df = pd.DataFrame()

# Iterate through each holiday date and extract the required range
for holiday_date in holiday_dates:
    date_range = pd.date_range(
        start=holiday_date - pd.Timedelta(days=x_days),
        end=holiday_date + pd.Timedelta(days=x_days),
    )

    print(date_range)  # --> geeft list met juiste dates

    print(aggregated_df.head(10))  # --> aggregated_df bevat juiste data

    # Use boolean indexing to filter the date range
    temp_df = aggregated_df.loc[date_range].reset_index()

    temp_df["holiday_date"] = holiday_date

    print(holiday_date)  # --> Shows the correct holiday date

    print(temp_df.head(10))  # --> temp_df should no longer be empty

    temp_df["days_from_holiday"] = (temp_df["date"] - holiday_date).dt.days

    print(temp_df["days_from_holiday"])

    result_df = pd.concat([result_df, temp_df])

    print(temp_df.head(10))

    # Reset the index for the combined DataFrame
    # result_df.reset_index(drop=True, inplace=True)

result_df.head(20)

In [None]:
# Assuming df_02_full is already defined
df = df_02_full
store_nbr = 45
x_days = 5

# Select data from specific store
df = df[df["store_nbr"] == store_nbr]

# Aggregate unit_sales summed up per day
aggregated_df = (
    df.groupby("date").agg({"type": "first", "unit_sales": "sum"}).reset_index()
)

# Set date column as Index
aggregated_df.set_index("date", inplace=True)

# Filter the data to exclude 'No' in the 'type' column and select the 'date' column
holiday_dates = aggregated_df[aggregated_df["type"] != "No"].index

# Initialize an empty DataFrame to collect the results
result_df = pd.DataFrame()

# Iterate through each holiday date and extract the required range
for holiday_date in holiday_dates:
    # Create a date range with a buffer of x_days around the holiday_date
    date_range = pd.date_range(
        start=holiday_date - pd.Timedelta(days=x_days),
        end=holiday_date + pd.Timedelta(days=x_days),
    )

    # Filter aggregated_df for dates within the date_range
    # temp_df = aggregated_df.loc[date_range].reset_index()

    temp_df = aggregated_df.loc[
        aggregated_df.index.intersection(date_range)
    ].reset_index()

    # Ensure the 'date' column is accessible and not renamed to 'index'
    temp_df.rename(columns={"index": "date"}, inplace=True)

    # Filter aggregated_df for dates within the date_range
    #

    # print(temp_df.head(10))

    # Ensure the 'date' column is accessible
    if "index" not in temp_df.columns:
        raise KeyError("'date' column not found in temp_df after reset_index()")

    temp_df["holiday_date"] = holiday_date
    temp_df["days_from_holiday"] = (temp_df["date"] - holiday_date).dt.days

    # Append temp_df to result_df
    result_df = pd.concat([result_df, temp_df], ignore_index=True)

    print(result_df.info())

    print("-" * 60)

    print(result_df.head(10))

    # Create the Altair plot
    chart = (
        alt.Chart(result_df)
        .mark_line(point=True)
        .encode(
            x=alt.X("days_from_holiday:Q", title=f"Date as {x_days} Days from Holiday"),
            y=alt.Y("unit_sales:Q", title="Unit Sales"),
            color=alt.Color("type:N", title="Holiday Type"),
            tooltip=[
                "unit_sales",
                "date",
                "type",
                # "store_type",
                "holiday_date",
                "days_from_holiday",
            ],
        )
        .properties(
            width=800,
            height=400,
            title=f"Sales for {x_days} Days Before and After Holiday for Store {store_nbr}",
        )
    )

    chart.show()

    print("-+" * 30)

# Reset the index for the combined DataFrame
result_df.reset_index(drop=True, inplace=True)

In [None]:
df = df_02_full
store_nbr = 45
x_days = 5

# Select data from specific store
df = df[df["store_nbr"] == store_nbr]

# Aggegrate unit_sales summed up per day
aggregated_df = (
    df.groupby(["date"])
    .agg({"type": "first", "date": "first", "unit_sales": "sum"})
    .reset_index(drop=True)
)

# Filter the data to exclude 'No' in the 'type' column and select the 'date' column
# holiday_dates = aggregated_df[aggregated_df["type"] != "No"]["date"]

# holiday_dates.describe()
# --> for store 45 there are 146 holiday sales-dates


aggregated_df = aggregated_df[aggregated_df["type"] != "No"]

aggregated_df.head(10)

In [None]:
# Aggregate data and set index
aggregated_df = df.groupby(["date", "type"]).agg({"unit_sales": "sum"}).reset_index()

# Filter the data to exclude 'No' in the 'type' column and select the 'date' column
test2 = aggregated_df[aggregated_df["type"] != "No"]["date"]

# Display the first 10 entries
test2.head(10)

In [None]:
def Xdays_holiday_sales(df, store_nbr, x_days):

    # Select data from specific store
    df = df[df["store_nbr"] == store_nbr]

    # Aggegrate unit_sales summed up per day
    aggregated_df = (
        df.groupby(["date"])
        .agg({"type": "first", "date": "first", "unit_sales": "sum"})
        .reset_index(drop=True)
    )

    # Set date as index for easier date range operations
    aggregated_df.set_index("date", inplace=True)

    # Extract the relevant dates for holidays
    holiday_dates = aggregated_df[aggregated_df["type"].notna()]["date"]

    # Initialize an empty DataFrame to collect the results
    result_df = pd.DataFrame()

    # Iterate through each holiday date and extract the required range
    for holiday_date in holiday_dates:
        date_range = pd.date_range(
            start=holiday_date - pd.Timedelta(days=x_days),
            end=holiday_date + pd.Timedelta(days=x_days),
        )

        temp_df = aggregated_df.loc[
            aggregated_df.index.intersection(date_range)
        ].reset_index()
        temp_df["holiday_date"] = holiday_date
        temp_df["days_from_holiday"] = (temp_df["date"] - holiday_date).dt.days
        result_df = pd.concat([result_df, temp_df])

    # Reset the index for the combined DataFrame
    result_df.reset_index(drop=True, inplace=True)

    # Create the Altair plot
    chart = (
        alt.Chart(result_df)
        .mark_line(point=True)
        .encode(
            x=alt.X("days_from_holiday:Q", title=f"{x_days} Days from Holiday"),
            y=alt.Y("unit_sales:Q", title="Unit Sales"),
            color=alt.Color("type:N", title="Holiday Type"),
            tooltip=[
                "unit_sales",
                "date",
                "type",
                "locale",
                "store_type",
                "holiday_date",
                "days_from_holiday",
            ],
        )
        .properties(
            width=800,
            height=400,
            title=f"Sales for {x_days} Days Before and After Holiday for Store {store_nbr}",
        )
    )

    chart.show()

    return chart

In [None]:
Xdays_holiday_sales(df_02_full, store_nbr=45, x_days=5)