### Supermarket data science case study - Test en junk notebook for writing data prep pipeline


# 0. Always run step 0

### >> Importing packages

In [1]:
import pandas as pd
import numpy as np

import sys
import altair as alt

import vegafusion as vf

import sklearn

from datetime import datetime
from sklearn.pipeline import Pipeline, make_pipeline

import time

### >> Functions

In [2]:
def f_concat(l_input):

    # Initialize.
    dummy = ""
    n_len = len(l_input)

    if n_len == 1:
        return l_input[0]

    # Loop through text elements.
    for i in range(n_len - 1):
        dummy = dummy + l_input[i] + ", "

    # Append last element.
    dummy = dummy + "and " + l_input[n_len - 1]

    # Return result.
    return dummy

In [3]:
def basic_description(df):
    print(
        f"-> Contains:                {round(df.shape[0]/1e6, 4)} million observations and has {df.shape[1]} Feature names: {f_concat(df.columns)}.\n"
    )
    print(
        f"-> Has size of              {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB."
    )

### >> Downcast and transform data
Update formatting of features to optimize memory and standardize column names.

In [4]:
def standardize_column_names(s):
    return s.replace(" ", "")


def optimize_memory(df):
    # Change: Objects to Categorical.
    object_cols = df.select_dtypes(include="object").columns

    if not object_cols.empty:
        print("Change: Objects to Categorical")
        df[object_cols] = df[object_cols].astype("category")

    # Change: Convert integers to smallest unsigned integer and floats to smallest.
    for old, new in [("integer", "unsigned"), ("float", "float")]:
        print("Change: " + old + " --> " + new)
        for col in df.select_dtypes(include=old).columns:
            df[col] = pd.to_numeric(df[col], downcast=new)

    return df


def month_year_to_int(df, i):
    # Change: Month and Year to integer.
    if i == 0:
        print("Change: Month and Year to integer")
        df = df.astype({"month": int, "year": int})

    return df

### >> Transform date-related columns to datetime format.

In [5]:
# Convert datasets to time series
def transform_date_to_datetime(df, i):
    if i == 0:
        print("Change: Transformed 'year', 'month', 'day' columns to Datetime feature")
        # Convert year, month, and day to a single datetime column
        df["date"] = pd.to_datetime(df[["year", "month", "day"]]).dt.floor("D")

        print(
            "Change: Dropped 'id', 'month', 'day' columns and transformed to Datetime feature"
        )
        df.drop(columns=["day", "month", "id"], inplace=True)

    else:
        if "date" in df.columns:
            print("Change: Transformed 'date' column to Datetime Dtype")
            df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None).dt.floor("D")

    return df

### >> Import data from local PATH
Import data trough pipeline to downcast the data and transformations

In [6]:
def f_get_data(i=0):

    # Define path.
    c_path = "C:/Users/alexander/Documents/0. Data Science and AI for Experts/EAISI_4B_Supermarket/data/raw/"

    # Identify file.
    v_file = (
        "history-per-year",  # 0
        "history_aggregated",  # 1
        "holidays_events",  # 2
        "items",  # 3
        "oil",  # 4
        "stores",  # 5
        "transactions",  # 6
    )

    # Load data.
    df = (
        pd.read_parquet(c_path + v_file[i] + ".parquet")
        .rename(columns=standardize_column_names)
        .pipe(optimize_memory)
        .pipe(month_year_to_int, i)
        .pipe(transform_date_to_datetime, i)
    )

    # Return data.
    return df

### >> Importing data

In [7]:
# Sales History per year
df_sales = f_get_data(0)

Change: integer --> unsigned
Change: float --> float
Change: Month and Year to integer
Change: Transformed 'year', 'month', 'day' columns to Datetime feature
Change: Dropped 'id', 'month', 'day' columns and transformed to Datetime feature


In [8]:
# Holidays
df_holidays = f_get_data(2)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float
Change: Transformed 'date' column to Datetime Dtype


In [9]:
# Items
df_items = f_get_data(3)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float


In [10]:
# Stores
df_stores = f_get_data(5)

Change: Objects to Categorical
Change: integer --> unsigned
Change: float --> float


# 1. Imputing sales data

In [None]:
df_sales = f_get_data(0)

# df_0 = df_0[df_0["year"].isin([2014, 2015])].drop(columns=['id', 'day','month' ])
df_sales = df_sales.drop(columns=["id", "day", "month"])
print("-" * 50)
df_sales.info()

In [None]:
# To check how big df wil get after NaN inputing per year
df_sales_2013 = df_sales[(df_sales["year"] == 2013)]
df_sales_2014 = df_sales[(df_sales["year"] == 2014)]
df_sales_2015 = df_sales[(df_sales["year"] == 2015)]
df_sales_2016 = df_sales[(df_sales["year"] == 2016)]
df_sales_2017 = df_sales[(df_sales["year"] == 2017)]

In [None]:
basic_description(df_sales_2017)  # --> 23.8 million observations and 0.69 GB

In [None]:
basic_description(df_sales)  # orginal = 2.7GB and after NaN inputing ~7GB

In [None]:
df_0_36_648313 = df_0[
    (df_0["store_nbr"] == 53)
    & (
        df_0["item_nbr"].isin(
            [
                627887,
                759890,
            ]
        )
    )
].drop(
    columns=[
        "weekday",
        "week_number",
        "week_year",
        "weeks_since_start",
        "weeks_since_start_2",
    ]
)
# 53 759890 --> starts 2014-05-29, next date 2024-05-31

In [None]:
df_0_36_759890 = df_0[(df_0["store_nbr"] == 53) & (df_0["item_nbr"] == 759890)]

In [None]:
df_0_36_3items_3stores = df_0[
    (df_0["store_nbr"].isin([53, 6]))
    & (df_0["item_nbr"].isin([627887, 759890, 1160872]))
].drop(columns=["id"])

df_final_check2 = df_sales_nan[(df_sales_nan["date"] > "2015-31-12")].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

df_final_check2.head(60)

In [None]:
df_0_36_3items_3stores.tail(40).sort_values(by=["date", "store_nbr", "item_nbr"])

In [None]:
df_0_36_3items_3stores = filling_dates_NaN(df_0_36_3items_3stores)

In [None]:
df_final_check = df_0_36_3items_3stores[
    (df_0_36_3items_3stores["date"] > "2015-09-12")
].sort_values(by=["date", "store_nbr", "item_nbr"])

df_final_check.head(60)

In [None]:
df_0_36_759890.head(10)

In [None]:
df_0_36_648313.tail(10)

In [None]:
def filling_dates_NaN(df):

    # Create new df to include all daily dates in the range, filling missing dates with NaNs
    df = df.copy()

    # Print first and last date of df
    print(f'First date in df: {df["date"].min()}')
    print(f'Last date in df:  {df["date"].max()}')
    print("-" * 71)

    # Calculate memory size and shape size of start df
    df_mem_start = sys.getsizeof(df)
    df_shape_start = df.shape[0] / 1e6
    print(
        f"Start size of df:     {round(df_mem_start/1024/1024/1024, 2)} GB and start observations:     {round(df_shape_start, 1)} million."
    )

    # Create a complete date range for the entire dataset
    all_dates = pd.date_range(start=df["date"].min(), end=df["date"].max(), freq="D")

    # Create a multi-index from all possible combinations of 'item_nbr' and 'date'
    all_combinations = pd.MultiIndex.from_product(
        [df["store_nbr"].unique(), df["item_nbr"].unique(), all_dates],
        names=["store_nbr", "item_nbr", "date"],
    )

    # Reindex the original DataFrame to include all combinations of 'store_nbr', 'item_nbr', and 'date'
    df_reindexed = df.set_index(["store_nbr", "item_nbr", "date"]).reindex(
        all_combinations
    )

    # Reset the index to turn the multi-index back into regular columns
    df_final = df_reindexed.reset_index()

    # Calculate memory size and shape size of final end df
    df_mem_end = sys.getsizeof(df_final)
    df_mem_change_perc = ((df_mem_end - df_mem_start) / df_mem_start) * 100
    df_mem_change = df_mem_end - df_mem_start

    df_shape_end = df_final.shape[0] / 1e6
    df_shape_change_perc = ((df_shape_end - df_shape_start) / df_shape_start) * 100
    df_shape_change = df_shape_end - df_shape_start

    print(
        f"Final size of df:     {round(df_mem_end/1024/1024/1024, 2)} GB and end observations:       {round(df_shape_end, 1)} million."
    )
    print(
        f"Change in size of df: {round(df_mem_change_perc, 2)} % and observations:           {round(df_shape_change_perc, 2)}     %."
    )
    print(
        f"Increased size of df: {round(df_mem_change/1024/1024/1024, 2)} GB and increased observations: {round(df_shape_change, 1)} million."
    )

    print("-" * 71)

    return df_final

In [None]:
print(f"--> df_sales_2013 Contains:")
df_sales_2013_nan = filling_dates_NaN(df_sales_2013)

In [None]:
print(f"--> df_sales_2014 Contains:")
df_sales_2014_nan = filling_dates_NaN(df_sales_2014)

In [None]:
print(f"--> df_sales_2015 Contains:")
df_sales_2015_nan = filling_dates_NaN(df_sales_2015)

In [None]:
print(f"--> df_sales_2016 Contains:")
df_sales_2016_nan = filling_dates_NaN(df_sales_2016)

In [None]:
print(f"--> df_sales_2017 Contains:")
df_sales_2017_nan = filling_dates_NaN(df_sales_2017)

In [None]:
df_final_check = df_sales_nan[(df_sales_nan["date"] > "2015-30-12")].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

df_final_check.head(60)

In [None]:
df_final_check2 = df_sales_nan[(df_sales_nan["date"] > "2015-31-12")].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

df_final_check2.head(60)

In [None]:
df_sales_nan.head(30).sort_values(by=["date", "store_nbr", "item_nbr"])

In [None]:
df_sales_nan.tail(20)

In [None]:
df_sales_nan.sample(40)

In [None]:
# -----------------------------------------------------------

df = df_0_36_648313.copy()

# Create a complete date range for the entire dataset
all_dates = pd.date_range(start="2014-01-01", end="2016-01-18", freq="D")

# Create a multi-index from all possible combinations of 'store_nbr', 'item_nbr', and 'date'
all_combinations = pd.MultiIndex.from_product(
    [df["store_nbr"].unique(), df["item_nbr"].unique(), all_dates],
    names=["store_nbr", "item_nbr", "date"],
)

# Reindex the original DataFrame to include all combinations of 'store_nbr', 'item_nbr', and 'date'
df_reindexed = df.set_index(["store_nbr", "item_nbr", "date"]).reindex(all_combinations)

# Reset the index to turn the multi-index back into regular columns
df_final = df_reindexed.reset_index()

In [None]:
df_final.head(10)

In [None]:
df_final.tail(10)

In [None]:
df_final.sample(10)

In [None]:
# Note: Fill missing values for non-sales columns using forward fill and backward fill
non_sales_columns = [
    "onpromotion",
    "day",
    "year",
    "month",
    "item_family",
    "item_class",
    "store_cluster",
]
df[non_sales_columns] = df.groupby(["store_nbr", "item_nbr"])[non_sales_columns].apply(
    lambda group: group.ffill().bfill()
)

# Interpolate missing values for the 'unit_sales' column
df["unit_sales"] = df.groupby(["store_nbr", "item_nbr"])["unit_sales"].apply(
    lambda group: group.interpolate(method="linear")
)

In [None]:
df_0.sample(30)

In [None]:
def get_unique(df, column_name):
    """Get the all values and the count for specific column"""
    unique_values_count = df[column_name].nunique()
    unique_values = df[column_name].unique()

    # Convert unique values to a single string to print
    unique_values_str = ", ".join(map(str, unique_values))

    print(f"Number of unique values in {column_name}: {unique_values_count}")
    print("Unique values:")
    print(unique_values_str)

    return

## Detect negative values

•	Action: Delete unit_sales if values are lower than zero --> N/A

In [None]:
df_sales = f_get_data(0)
df_sales = df_sales.drop(columns=["id", "day", "month"])
print("-" * 80)
df_sales.info()

In [None]:
df_sales_negative = df_sales[df_sales["unit_sales"] < 0].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)
df_sales_negative.info()

Some CLAUDE.ai generated plots, to get a bit of an idea how the negative values look like

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def visualize_negative_sales(df_sales_negative):

    # Check if the DataFrame is empty
    if df_sales_negative.empty:
        print("No negative sales data to visualize.")
        return

    # Create a line plot of negative sales over time
    plt.figure(figsize=(12, 6))
    sns.lineplot(x="date", y="unit_sales", data=df_sales_negative)
    plt.title("Negative Sales Over Time")
    plt.xlabel("Date")
    plt.ylabel("Unit Sales")
    plt.show()

    # # Create a bar plot of negative sales by store and item
    # plt.figure(figsize=(12, 6))
    # sns.barplot(x="store_nbr", y="unit_sales", hue="item_nbr", data=df_sales_negative)
    # plt.title("Negative Sales by Store and Item")
    # plt.xlabel("Store Number")
    # plt.ylabel("Unit Sales")
    # plt.xticks(rotation=90)
    # plt.show()

In [None]:
df_sales_negative = df_sales[df_sales["unit_sales"] < 0].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)
visualize_negative_sales(df_sales_negative)

In [None]:
plt.figure(figsize=(8, 6))
sns.histplot(data=df_sales_negative, x="unit_sales", bins=200)
plt.title("Distribution of Negative Sales Amounts")
plt.xlabel("Unit Sales")
plt.ylabel("Count")
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_sales_negative, x="store_nbr", y="unit_sales")
plt.title("Negative Sales by Store Number")
plt.xlabel("Store Number")
plt.ylabel("Unit Sales")
plt.show()

In [None]:
# Calculate the 20th percentile of the negative sales values
negative_sales_threshold = df_sales_negative["unit_sales"].quantile(0.001)

# Filter the 20% most negative sales
df_sales_negative_filtered = df_sales_negative[
    df_sales_negative["unit_sales"] <= negative_sales_threshold
]

# Create the boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_sales_negative_filtered, x="item_nbr", y="unit_sales")
plt.title("Negative Sales by Item Number (Top 20% Most Negative)")
plt.xlabel("Item Number")
plt.ylabel("Unit Sales")
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.figure(figsize=(10, 8))
pivot_table = df_sales_negative.pivot_table(
    index="month", columns="store_nbr", values="unit_sales"
)
sns.heatmap(pivot_table, annot=True, cmap="RdBu_r")
plt.title("Negative Sales by Month and Store Number")
plt.xlabel("Store Number")
plt.ylabel("Month")
plt.show()

Testing and research negative values

In [None]:
df_sales_negative = df_sales[df_sales["unit_sales"] < 0].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

# --> 7795 records with negative values

In [None]:
df_sales = f_get_data(0)
df_sales = df_sales.drop(columns=["id", "day", "month"])
df_sales.info()

In [None]:
# Check the number of negative values before replacement
before_replacement = (df_sales["unit_sales"] < 0).sum()
print(f"Number of negative values before replacement: {before_replacement}")

# df_sales = df_sales[df_sales["unit_sales"] < 0]

# Detect and replace negative values with NAs --> .transform() --> should more efficient for larger df's then apply() --> takes 20 min to run for full df_sales and then crashes....
df_sales["unit_sales"] = df_sales.groupby(["date", "store_nbr", "item_nbr"])[
    "unit_sales"
].transform(lambda x: x.where(x >= 0, np.nan))

# Check the number of negative values after replacement
after_replacement = (df_sales["unit_sales"] < 0).sum()

print(f"Number of negative values after replacement: {after_replacement}")

In [None]:
# Check the number of negative values before replacement
before_replacement = (df_sales["unit_sales"] < 0).sum()
print(f"Number of negative values before replacement: {before_replacement}")

# Create a boolean mask for the negative sales rows to create df containing all negative rows, used to filter full df_sales df
negative_sales_mask = df_sales["unit_sales"] < 0

# Use the mask to update the 'unit_sales' column in the original DataFrame
df_sales.loc[negative_sales_mask, "unit_sales"] = df_sales.loc[
    negative_sales_mask, "unit_sales"
].where(df_sales.loc[negative_sales_mask, "unit_sales"] >= 0, np.nan)

# Check the number of negative values after replacement
after_replacement = (df_sales["unit_sales"] < 0).sum()
print(f"Number of negative values after replacement: {after_replacement}")

In [None]:
def negative_sales_cleaned(df):

    # Check the number of negative values before replacement
    before_replacement = (df["unit_sales"] < 0).sum()
    print(f"Number of negative values before replacement: {before_replacement}")

    # Create a boolean mask for the negative sales rows to create a 'boolean flag-list' containing all negative rows, used to filter full df_sales df
    negative_sales_mask = df["unit_sales"] < 0

    # Use the mask to update the flagged 'unit_sales' column in the original DataFrame
    df.loc[negative_sales_mask, "unit_sales"] = df.loc[
        negative_sales_mask, "unit_sales"
    ].where(df.loc[negative_sales_mask, "unit_sales"] >= 0, np.nan)

    # Check the number of negative values after replacement
    after_replacement = (df["unit_sales"] < 0).sum()
    print(f"Number of negative values after replacement: {after_replacement}")

    return df

In [None]:
negative_sales_mask

In [None]:
df_sales_nan = negative_sales_cleaned(df_sales)

df_sales_nan.info()

# 2. NaN data inputing testing

In [None]:
df_sales_nan_check = df_sales_nan[df_sales_nan["unit_sales"].isna()].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

df_sales_nan_check.head(30)

In [None]:
df_sales_nan.tail(30)

In [None]:
df_sales_nan.sample(50)

# 3. Fill missing values for non-sales columns using forward fill and backward fill --> items, stores, holidays 

In [None]:
# Items
df_items = f_get_data(3)

In [None]:
df_sales_2013 = df_sales[(df_sales["year"] == 2013)]
df_sales_2014 = df_sales[(df_sales["year"] == 2014)]
df_sales_2015 = df_sales[(df_sales["year"] == 2015)]
df_sales_2016 = df_sales[(df_sales["year"] == 2016)]
df_sales_2017 = df_sales[(df_sales["year"] == 2017)]

In [None]:
print(f"--> df_sales_2013 Contains:")

print(df_sales_2013["item_nbr"].dtype)
print(df_items["item_nbr"].dtype)

# # Change the dtype for item_nbr from uint32 to int32
df_sales_2013["item_nbr"] = df_sales_2013["item_nbr"].astype(int)
df_items["item_nbr"] = df_items["item_nbr"].astype(int)
print("-" * 30)
print(df_sales_2013["item_nbr"].dtype)
print(df_items["item_nbr"].dtype)
print("-" * 30)


df_sales_2013_stores = df_sales_2013.merge(df_stores, on="store_nbr", how="left")

df_sales_2013_stores_items = df_sales_2013_stores.merge(
    df_items, on="item_nbr", how="left"
)

df_sales_2013_nan = filling_dates_NaN(df_sales_2013_stores_items)

In [None]:
print(f"--> df_sales_2017 Contains:")
df_sales_2017_nan = filling_dates_NaN(df_sales_2017)

# 4 Check Results of NaN inputing : 
# TO-DO: Check grouppy item-nbr, store-nbr or both #    ["item_nbr", "store_nbr"]

to-do check same item for different stores


to-do: check same store for different items

In [None]:
df_sales_2013_nan.head(20)

df sales for year 2013

forward and backward fill based on --> groupby()= item_nbr

In [None]:
df_sales_2013_nan_item_nbr = df_sales_2013_nan.copy()

# Fill missing values for non-sales columns using forward fill and backward fill --> items, stores, holidays


non_sales_columns = [
    "city",
    "state",
    "type",
    "cluster",
    "family",
    "class",
    "perishable",
]

# Filling missing values based on available values for the same !!item_nbr!!

df_sales_2013_nan_item_nbr[non_sales_columns] = df_sales_2013_nan_item_nbr.groupby(
    ["item_nbr"]
)[non_sales_columns].transform(lambda group: group.ffill().bfill())

df_sales_2013_nan_item_nbr.head(20)

# TO-DO: Check grouppy item-nbr, store-nbr or both #    ["item_nbr", "store_nbr"]
#

In [None]:
df_sales_2013_nan_item_nbr_627887 = df_sales_2013_nan_item_nbr[
    # (df_0["store_nbr"].isin([53, 6])) &
    (
        df_sales_2013_nan_item_nbr["item_nbr"].isin(
            [
                627887
                # , 759890, 1160872
            ]
        )
    )
].sort_values(by=["date", "store_nbr"])

df_sales_2013_nan_item_nbr_627887.tail(
    60
)  # --> item_nbr works well for 1 items, across all stores

df sales for year 2013

forward and backward fill based on --> groupby()= store_nbr

In [None]:
df_sales_2013_nan_store_nbr = df_sales_2013_nan.copy()

# Fill missing values for non-sales columns using forward fill and backward fill --> items, stores, holidays
non_sales_columns = [
    "city",
    "state",
    "type",
    "cluster",
    "family",
    "class",
    "perishable",
]

# Filling missing values based on available values for the same !!store_nbr!!
df_sales_2013_nan_store_nbr[non_sales_columns] = df_sales_2013_nan_store_nbr.groupby(
    ["store_nbr"]
)[non_sales_columns].transform(lambda group: group.ffill().bfill())

df_sales_2013_nan_store_nbr.head(20)

df sales for year 2013

forward and backward fill based on --> groupby()= item_nbr AND! store_nbr

In [None]:
df_sales_2013_nan_item_store_nbr = df_sales_2013_nan.copy()

# Fill missing values for non-sales columns using forward fill and backward fill --> items, stores, holidays
non_sales_columns = [
    "city",
    "state",
    "type",
    "cluster",
    "family",
    "class",
    "perishable",
]

# TO-DO: Check grouppy item-nbr, store-nbr or both #    ["item_nbr", "store_nbr"]

# Filling missing values based on available values for the same !!store_nbr, item_nbr!!
df_sales_2013_nan_item_store_nbr[non_sales_columns] = (
    df_sales_2013_nan_item_store_nbr.groupby(["item_nbr", "store_nbr"])[
        non_sales_columns
    ].transform(lambda group: group.ffill().bfill())
)

df_sales_2013_nan_item_store_nbr.head(20)

# 5 Define new, old and closed stores for 4.3

•	Condition: sales for all items a given store and date are NA

•	Action: Impute with 0

----------------------------------------------------------------------------------------

Sum/Agg all sales group  by store, date

 --> Sum_sales > 0 then store_opened

 --> first date store_opened --> before keep N/A
 
 else --> closed --> inputed with 0


In [11]:
def filling_dates_NaN(df):

    # Create new df to include all daily dates in the range, filling missing dates with NaNs
    df = df.copy()

    # Print first and last date of df
    print(f'First date in df: {df["date"].min()}')
    print(f'Last date in df:  {df["date"].max()}')
    print("-" * 71)

    # Calculate memory size and shape size of start df
    df_mem_start = sys.getsizeof(df)
    df_shape_start = df.shape[0] / 1e6
    print(
        f"Start size of df:     {round(df_mem_start/1024/1024/1024, 2)} GB and start observations:     {round(df_shape_start, 1)} million."
    )

    # Create a complete date range for the entire dataset
    all_dates = pd.date_range(start=df["date"].min(), end=df["date"].max(), freq="D")

    # Create a multi-index from all possible combinations of 'item_nbr' and 'date'
    all_combinations = pd.MultiIndex.from_product(
        [df["store_nbr"].unique(), df["item_nbr"].unique(), all_dates],
        names=["store_nbr", "item_nbr", "date"],
    )

    # Reindex the original DataFrame to include all combinations of 'store_nbr', 'item_nbr', and 'date'
    df_reindexed = df.set_index(["store_nbr", "item_nbr", "date"]).reindex(
        all_combinations
    )

    # Reset the index to turn the multi-index back into regular columns
    df_final = df_reindexed.reset_index()

    # Calculate memory size and shape size of final end df
    df_mem_end = sys.getsizeof(df_final)
    df_mem_change_perc = ((df_mem_end - df_mem_start) / df_mem_start) * 100
    df_mem_change = df_mem_end - df_mem_start

    df_shape_end = df_final.shape[0] / 1e6
    df_shape_change_perc = ((df_shape_end - df_shape_start) / df_shape_start) * 100
    df_shape_change = df_shape_end - df_shape_start

    print(
        f"Final size of df:     {round(df_mem_end/1024/1024/1024, 2)} GB and end observations:       {round(df_shape_end, 1)} million."
    )
    print(
        f"Change in size of df: {round(df_mem_change_perc, 2)} % and observations:           {round(df_shape_change_perc, 2)}     %."
    )
    print(
        f"Increased size of df: {round(df_mem_change/1024/1024/1024, 2)} GB and increased observations: {round(df_shape_change, 1)} million."
    )

    print("-" * 71)

    return df_final

In [12]:
df_sales = df_sales.drop(columns=["onpromotion", "year"])

df_sales = filling_dates_NaN(df_sales)

First date in df: 2013-01-01 00:00:00
Last date in df:  2017-08-15 00:00:00
-----------------------------------------------------------------------
Start size of df:     1.99 GB and start observations:     125.5 million.
Final size of df:     5.82 GB and end observations:       367.9 million.
Change in size of df: 193.15 % and observations:           193.15     %.
Increased size of df: 3.84 GB and increased observations: 242.4 million.
-----------------------------------------------------------------------


In [13]:
# df_sales = df_sales[(df_sales["store_nbr"] == 1)]

df_sales = df_sales[
    df_sales["store_nbr"].isin(
        [
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
            23,
            24,
            25,
        ]
    )
]

In [None]:
df_sales = df_sales[(df_sales["date"] < "2015-07-01")]

# 5.1 testing merge_store_status function

idea: use the same function/logic to label a item_nbr as new product for step 4.4

To-do: discuss do we need to inpute all stores on 01-01-2013 with 0, as almost all the stores are  (only 25=open) closed by result of NYE/1jan

In [15]:
def merge_store_status(df):

    # Label Variable for atributing numbers to store status, to save memory in df
    OPEN = 0
    NEW = 2
    CLOSED = 4
    OLD = 6
    NEVER_OPENED = 8

    # Group by store and date, then sum sales
    df_grouped = (
        df.groupby(["store_nbr", "date"])["unit_sales"]
        .sum()  # --> sum or agg ?
        .reset_index()
    )

    # Sort by store and date
    df_grouped = df_grouped.sort_values(["store_nbr", "date"])

    # Create a new column for store status, label al stores as 'open' by default
    df_grouped["store_status"] = OPEN

    # Find the first and last day with sales for each store
    first_sale_date = (
        df_grouped[df_grouped["unit_sales"] > 0].groupby("store_nbr")["date"].min()
    )

    last_sale_date = (
        df_grouped[df_grouped["unit_sales"] > 0].groupby("store_nbr")["date"].max()
    )

    # Loop trhough stores by lapeling them as 'NEW', 'CLOSED', 'OLD' or 'NEVER_OPENED' based on first sale date and last sale date
    for store in df_grouped["store_nbr"].unique():
        store_data = df_grouped[df_grouped["store_nbr"] == store]

        if store in first_sale_date.index:
            first_date = first_sale_date[store]
            last_date = last_sale_date[store]

            # Mark as 'NEW' before first sale date
            df_grouped.loc[
                (df_grouped["store_nbr"] == store) & (df_grouped["date"] < first_date),
                "store_status",
            ] = NEW
            # --> To-do: Do we call this  not opened' or a 'new store'?

            # Mark as 'closed' after first sale date if sales are 0
            df_grouped.loc[
                (df_grouped["store_nbr"] == store)
                & (df_grouped["date"] > first_date)
                & (df_grouped["unit_sales"] == 0),
                "store_status",
            ] = CLOSED

            # Mark as 'OLD' after last sale date
            df_grouped.loc[
                (df_grouped["store_nbr"] == store) & (df_grouped["date"] > last_date),
                "store_status",
            ] = OLD

        else:
            # If a store never had any sales, mark all dates as 'NEVER_OPENED' --> no records?
            df_grouped.loc[df_grouped["store_nbr"] == store, "store_status"] = (
                NEVER_OPENED
            )

    # Change the data type of store_status column to int8
    df_grouped["store_status"] = df_grouped["store_status"].astype("int8")

    # Merging store_status on df_sales
    df = df.merge(
        df_grouped[["store_nbr", "date", "store_status"]],
        left_on=["store_nbr", "date"],
        right_on=["store_nbr", "date"],
        how="left",
    )

    # Using a mask to flag al 'CLOSED'=4 stores and impute 'closed' stores with 0, not opened stores stay NA/NaN
    mask = df["store_status"] == 4
    df.loc[mask, "unit_sales"] = 0

    print("-" * 72)
    print(
        f"Size of df:     {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB and end observations:       {round(df.shape[0] / 1e6, 1)} million."
    )
    print("- " * 36)
    print("df_grouped store_status value counts:")
    print(df_grouped["store_status"].value_counts())

    print("-" * 72)

    return df

To-do: discuss do we need to inpute all stores on 01-01-2013 with 0, as almost all the stores are  (only 25=open) closed by result of NYE/1jan

To-do: think if it also a possibility to label the closed stores on 25-12 and 01-01 as Closed_All

In [None]:
df_sales_grouped.sort_values(by=["date", "store_nbr"]).head(60)

In [None]:
df_sales_grouped_check = df_sales_grouped[
    (df_sales_grouped["store_status"] == "4")
].sort_values(by=["date", "store_nbr"])


df_sales_grouped_check.head(30)

#TO-DO: think about if it is needed to drop the 'store_status'-column after this inpute? or keep it as a feature?

In [None]:
df_sales_check = df_sales[(df_sales["store_status"] == "closed")].sort_values(
    by=["date", "store_nbr"]
)

df_sales_check.sample(60)

junk code to select df based on store or specific item

In [None]:
df_sales_2013_nan_item_nbr = df_0[
    (df_0["store_nbr"].isin([53, 6])) &
    & (df_0["item_nbr"].isin([627887, 759890, 1160872]))
].drop(columns=["id"])

df_final_check2 = df_sales_nan[(df_sales_nan["date"] > "2017-31-12")].sort_values(
    by=["date", "store_nbr", "item_nbr"]
)

df_final_check2.head(60)

In [None]:
df_sales.info()

### Final function of merge_store_status

IDEA: transform all to int32 instead of int64 dtype

IDea: check newly created columns if can transform int dtype to uint8 or int16 to save memory

-------------------------------

Label Variable for atributing numbers to store status, to save memory in df:
-     OPEN = 0
-     NEW = 2
-     CLOSED = 4
-     OLD = 6
-     NEVER_OPENED = 8

----------

To-do: discuss do we need to inpute all stores on 01-01-2013 with 0, as almost all the stores are  (only 25=open) closed by result of NYE/1jan

In [15]:
def merge_store_status(df):

    # Label Variable for atributing numbers to store status, to save memory in df
    OPEN = 0
    NEW = 2
    CLOSED = 4
    OLD = 6
    NEVER_OPENED = 8

    # Group by store and date, then sum sales
    df_grouped = (
        df.groupby(["store_nbr", "date"]).agg({"unit_sales": "sum"}).reset_index()
    ).reset_index()

    # Sort by store and date
    df_grouped = df_grouped.sort_values(["store_nbr", "date"])

    # Create a new column for store status, label al stores as 'open' by default and make dtype in8
    df_grouped["store_status"] = np.int8(OPEN)

    # Find the first and last day with sales for each store
    first_sale_date = (
        df_grouped[df_grouped["unit_sales"] > 0].groupby("store_nbr")["date"].min()
    )

    last_sale_date = (
        df_grouped[df_grouped["unit_sales"] > 0].groupby("store_nbr")["date"].max()
    )

    # Loop trhough stores by lapeling them as 'NEW', 'CLOSED', 'OLD' or 'NEVER_OPENED' based on first sale date and last sale date
    for store in df_grouped["store_nbr"].unique():
        store_data = df_grouped[df_grouped["store_nbr"] == store]

        if store in first_sale_date.index:
            first_date = first_sale_date[store]
            last_date = last_sale_date[store]

            # Mark as 'NEW' before first sale date
            df_grouped.loc[
                (df_grouped["store_nbr"] == store) & (df_grouped["date"] < first_date),
                "store_status",
            ] = NEW
            # --> To-do: Do we call this  not opened' or a 'new store'?

            # Mark as 'closed' after first sale date if sales are 0
            df_grouped.loc[
                (df_grouped["store_nbr"] == store)
                & (df_grouped["date"] > first_date)
                & (df_grouped["unit_sales"] == 0),
                "store_status",
            ] = CLOSED

            # Mark as 'OLD' after last sale date
            df_grouped.loc[
                (df_grouped["store_nbr"] == store) & (df_grouped["date"] > last_date),
                "store_status",
            ] = OLD

        else:
            # If a store never had any sales, mark all dates as 'NEVER_OPENED' --> no records?
            df_grouped.loc[df_grouped["store_nbr"] == store, "store_status"] = (
                NEVER_OPENED
            )

    # Merging store_status on df_sales
    df = df.merge(
        df_grouped[["store_nbr", "date", "store_status"]],
        left_on=["store_nbr", "date"],
        right_on=["store_nbr", "date"],
        how="left",
    )

    # Using a mask to flag al 'CLOSED'=4 stores and impute 'closed' stores with 0, not opened stores stay NA/NaN
    mask = df["store_status"] == 4
    df.loc[mask, "unit_sales"] = 0

    print("-" * 72)
    print(
        f"Size of df:     {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB and end observations:       {round(df.shape[0] / 1e6, 1)} million."
    )
    print("- " * 36)
    print("df_grouped store_status value counts:")
    print(df_grouped["store_status"].value_counts())

    print("-" * 72)

    return df

In [None]:
df_sales = merge_store_status(df_sales)

In [None]:
df_sales.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_store_status_over_time(df):
    # Convert 'date' column to datetime if it's not already in that format
    if df["date"].dtype == "object":
        df["date"] = pd.to_datetime(df["date"])

    # Map store_status to descriptive labels
    status_labels = {0: "OPEN", 2: "NEW", 4: "CLOSED", 6: "OLD", 8: "NEVER_OPENED"}
    df["store_status_label"] = df["store_status"].map(status_labels)

    # Plotting the results
    plt.figure(figsize=(14, 8))
    sns.lineplot(
        data=df, x="date", y="store_nbr", hue="store_status_label", palette="tab10"
    )

    plt.title("Store Status Over Time")
    plt.xlabel("Date")
    plt.ylabel("Store Number")
    plt.legend(title="Store Status", loc="upper right")
    plt.grid(True)
    plt.show()

In [None]:
plot_store_status_over_time(df_sales)

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates


def plot_store_status(df):
    # Ensure that 'date' is a datetime object
    df["date"] = pd.to_datetime(df["date"])

    # Create a figure and axis
    fig, ax = plt.subplots(figsize=(12, 6))

    # Map store status to meaningful names for better labeling
    status_labels = {0: "Open", 2: "New", 4: "Closed", 6: "Old", 8: "Never Opened"}

    # Iterating through each store to plot their status over time
    for store in df["store_nbr"].unique():
        store_data = df[df["store_nbr"] == store]
        ax.plot(
            store_data["date"],
            store_data["store_status"],
            marker="o",
            label=f"Store {store}",
        )

    # Format the x-axis to show dates properly
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
    plt.xticks(rotation=45)

    # Set labels and title
    ax.set_ylabel("Store Status")
    ax.set_xlabel("Date")
    ax.set_title("Store Status Over Time")
    ax.set_yticks(list(status_labels.keys()))
    ax.set_yticklabels(status_labels.values())

    # Add a legend
    ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

    # Show grid
    ax.grid(True)

    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
plot_store_status(df_sales)

In [21]:
import pandas as pd
import matplotlib.pyplot as plt


def plot_store_status_area(df):
    # Ensure that 'date' is a datetime object
    df["date"] = pd.to_datetime(df["date"])

    # Create a pivot table for plotting
    pivot_df = df.pivot_table(
        index="date", columns="store_nbr", values="store_status", aggfunc="first"
    )

    # Create an area plot
    plt.figure(figsize=(12, 6))
    plt.stackplot(pivot_df.index, pivot_df.T, labels=pivot_df.columns, alpha=0.5)

    # Set labels and title
    plt.ylabel("Store Status")
    plt.xlabel("Date")
    plt.title("Store Status Over Time (Area Plot)")
    plt.yticks(list(status_labels.keys()), list(status_labels.values()))

    # Add a legend
    plt.legend(title="Store Number", bbox_to_anchor=(1.05, 1), loc="upper left")

    # Show grid
    plt.grid(True)

    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
plot_store_status_area(df_sales)

In [None]:
df_sales.head(30)

In [None]:
df_sales.sample(60)

# 6 Define New product for step 4.4

•	Before the very first sale of an item, all observations are kept as NA

•	After the very first sale of an item, we go to step 3:  

 Sum/Agg all sales group  by item, date
 --> Sum_sales > 0 then first_sales_day of product
 else <first_sales_day of product --> delete unit_sales --> N/A

 -----------------------------------

# Label Variable for atributing numbers to store status, to save memory in df
-     EXISTING = 1
-     NEW = 3
-     #TEMPORARY_NOT_SOLD = 5 #---> Myabe to label delivery problems? for longer periods of stockout
-     OLD = 7
-     NEVER_SOLD = 9

In [None]:
df_sales.drop(columns=["store_status"])

In [19]:
def merge_item_status(df):

    start_time = time.time()  # Record the start time of the function

    # Label Variable for atributing numbers to store status, to save memory in df
    EXISTING = 1
    NEW = 3
    # TEMPORARY_NOT_SOLD = 5 #---> Myabe to label delivery problems? for longer periods of stockout
    OLD = 7
    NEVER_SOLD = 9

    # Sort by store, item, and date --> is this needed???
    df_grouped = df.sort_values(["store_nbr", "item_nbr", "date"])

    # Create a new column for item status, label all items as 'existing' by default
    df_grouped["item_status"] = np.int8(EXISTING)

    # Change the data type of store_status column to int8
    df_grouped["item_status"] = df_grouped["item_status"].astype("int8")

    # Find the first and last day with sales for each item per store
    first_sale_date = (
        df_grouped[df_grouped["unit_sales"] > 0]
        .groupby(["store_nbr", "item_nbr"])["date"]
        .min()
    )

    last_sale_date = (
        df_grouped[df_grouped["unit_sales"] > 0]
        .groupby(["store_nbr", "item_nbr"])["date"]
        .max()
    )

    # # Loop through store-item combinations and label them as 'new' or 'old' based on first and last sale date
    for store in df_grouped["store_nbr"].unique():

        print(
            f"Processing current store number: {store} | Elapsed time: {time.time() - start_time:.2f} seconds"
        )

        for item in df_grouped[df_grouped["store_nbr"] == store]["item_nbr"].unique():

            store_item_data = df_grouped[
                (df_grouped["store_nbr"] == store) & (df_grouped["item_nbr"] == item)
            ]

            if (store, item) in first_sale_date.index:
                first_date = first_sale_date[(store, item)]
                last_date = last_sale_date[(store, item)]

                # Mark as 'NEW' before first sale date
                df_grouped.loc[
                    (df_grouped["store_nbr"] == store)
                    & (df_grouped["item_nbr"] == item)
                    & (df_grouped["date"] < first_date),
                    "item_status",
                ] = NEW

                # Mark as 'OLD' after last sale date
                df_grouped.loc[
                    (df_grouped["store_nbr"] == store)
                    & (df_grouped["item_nbr"] == item)
                    & (df_grouped["date"] > last_date),
                    "item_status",
                ] = OLD

            else:
                # Mark as 'NEVER_SOLD' If an item never had any sales
                df_grouped.loc[
                    (df_grouped["store_nbr"] == store)
                    & (df_grouped["item_nbr"] == item),
                    "item_status",
                ] = NEVER_SOLD  # --> DELETE? ITEM FROM STORE? example item_nbr 2011451 from store_nbr 6
                # to-do: investigate if never_sold items are created beceause of NaN function

    # Merging item_status on df
    df = df.merge(
        df_grouped[["store_nbr", "item_nbr", "date", "item_status"]],
        left_on=["store_nbr", "item_nbr", "date"],
        right_on=["store_nbr", "item_nbr", "date"],
        how="left",
    )

    print("-" * 72)
    print(
        f"Size of df:     {round(sys.getsizeof(df)/1024/1024/1024, 2)} GB and end observations:       {round(df.shape[0] / 1e6, 1)} million."
    )
    print("- " * 36)
    print(f"Total execution time: {(time.time() - start_time) / 60 / 60:.2f} hours")
    print("-" * 72)

    return df

In [20]:
df_test = merge_item_status(df_sales)

Elapsed time: 181.24 seconds | LINE | df_grouped = df.sort_values([store_nbr, item_nbr, date]) |
Elapsed time: 181.41 seconds | LINE | df_grouped[item_status = np.int8(EXISTING) |
Elapsed time: 181.69 seconds | LINE | df_grouped[item_status] = df_grouped[item_status].astype(int8) |
Elapsed time: 236.51 seconds | LINE | first_sale_date |
Elapsed time: 270.94 seconds | LINE | last_sale_date |
------------------------------------------------------------------------
Size of df:     5.82 GB and end observations:       367.9 million.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Total execution time: 0.08 hours
------------------------------------------------------------------------


- Processing current store number: 1 | Elapsed time: 15.05 seconds
- Processing current store number: 2 | Elapsed time: 2210.48 seconds
- Processing current store number: 3 | Elapsed time: 4455.33 seconds
- Processing current store number: 4 | Elapsed time: 6553.91 seconds
- Processing current store number: 5 | Elapsed time: 8607.63 seconds
- Processing current store number: 6 | Elapsed time: 10650.11 seconds
------------------------------------------------------------------------
- Size of df:     0.95 GB and end observations:       40.9 million.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- Total execution time: 12732.79 seconds
------------------------------------------------------------------------


In [18]:
# very time consuming function! takes sometimes more then 2000 minutes per store

df_sales_1_2 = df_sales[df_sales["store_nbr"].isin([1, 2])]

df_sales_1_2 = merge_item_status(df_sales_1_2)

Elapsed time: 2.89 seconds | LINE | df_grouped = df.sort_values([store_nbr, item_nbr, date]) |
Elapsed time: 2.89 seconds | LINE | df_grouped[item_status = np.int8(EXISTING) |
Elapsed time: 2.91 seconds | LINE | df_grouped[item_status] = df_grouped[item_status].astype(int8) |
Elapsed time: 3.49 seconds | LINE | first_sale_date |
Elapsed time: 4.11 seconds | LINE | last_sale_date |
Processing current store number: 1 | Elapsed time: 4.21 seconds
Store 1 | Item 96995 | Elapsed time: 4.62 seconds | LINE | store_item_data |
Store 1 | Item 99197 | Elapsed time: 4.85 seconds | LINE | store_item_data |
Store 1 | Item 103501 | Elapsed time: 5.08 seconds | LINE | store_item_data |
Store 1 | Item 103520 | Elapsed time: 5.16 seconds | LINE | store_item_data |
Store 1 | Item 103665 | Elapsed time: 5.40 seconds | LINE | store_item_data |
Store 1 | Item 105574 | Elapsed time: 5.64 seconds | LINE | store_item_data |
Store 1 | Item 105575 | Elapsed time: 5.86 seconds | LINE | store_item_data |
Store 1 

In [15]:
# Function to print memory usage of DataFrames
def print_memory_usage(dataframes):
    for name, df in dataframes.items():
        mem_usage = df.memory_usage(deep=True)
        total_mem = mem_usage.sum()
        print(f"DataFrame: {name}")
        print(mem_usage)
        print(f"Total Memory Usage: {total_mem} bytes\n")


# Check for DataFrames
dataframes = {
    name: obj for name, obj in globals().items() if isinstance(obj, pd.DataFrame)
}
print_memory_usage(dataframes)

DataFrame: df_sales
Index                132
store_nbr      367889472
item_nbr      1471557888
date          2943115776
unit_sales    1471557888
dtype: int64
Total Memory Usage: 6254121156 bytes

DataFrame: df_holidays
Index            132
date            2800
type             860
locale           626
locale_name     2284
description    11870
transferred      350
dtype: int64
Total Memory Usage: 18922 bytes

DataFrame: df_items
Index           132
item_nbr      16400
family         7144
class          8200
perishable     4100
dtype: int64
Total Memory Usage: 35976 bytes

DataFrame: df_stores
Index         132
store_nbr      54
city         1845
state        1539
type          476
cluster        54
dtype: int64
Total Memory Usage: 4100 bytes



# 6.X Polars

In [14]:
import polars as pl


def merge_item_status_polars(df_pandas):

    start_time = time.time()  # Record the start time of the function

    # Label variables
    EXISTING = 1
    NEW = 3
    OLD = 7
    NEVER_SOLD = 9

    # Convert the Pandas DataFrame to Polars
    df = pl.from_pandas(df_pandas)

    # Sort by store, item, and date
    df = df.sort(["store_nbr", "item_nbr", "date"])

    print(f"Elapsed time: {time.time() - start_time:.2f} seconds | LINE | df sorted |")

    # Create a new column for item status, initialise to EXISTING
    df = df.with_columns(pl.lit(EXISTING).cast(pl.Int8).alias("item_status"))

    print(
        f"Elapsed time: {time.time() - start_time:.2f} seconds | LINE | item_status added |"
    )

    # Filter for rows with unit_sales > 0 and calculate first/last sale dates
    first_sale_date = (
        df.filter(pl.col("unit_sales") > 0)
        .group_by(["store_nbr", "item_nbr"])
        .agg([pl.col("date").min().alias("first_sale_date")])
    )

    last_sale_date = (
        df.filter(pl.col("unit_sales") > 0)
        .group_by(["store_nbr", "item_nbr"])
        .agg([pl.col("date").max().alias("last_sale_date")])
    )

    print(
        f"Elapsed time: {time.time() - start_time:.2f} seconds | LINE | first and last sale dates |"
    )

    # Join first and last sale dates to the original dataframe
    df = df.join(first_sale_date, on=["store_nbr", "item_nbr"], how="left")
    df = df.join(last_sale_date, on=["store_nbr", "item_nbr"], how="left")

    print(
        f"Elapsed time: {time.time() - start_time:.2f} seconds | LINE | joined sale dates |"
    )

    # Update the item_status column based on first and last sale dates
    df = df.with_columns(
        pl.when(pl.col("date") < pl.col("first_sale_date"))
        .then(pl.lit(NEW))
        .when(pl.col("date") > pl.col("last_sale_date"))
        .then(pl.lit(OLD))
        .otherwise(pl.col("item_status"))
        .alias("item_status")
    )

    # Handle NEVER_SOLD case where first_sale_date is null
    df = df.with_columns(
        pl.when(pl.col("first_sale_date").is_null())
        .then(pl.lit(NEVER_SOLD))
        .otherwise(pl.col("item_status"))
        .alias("item_status")
    )

    print(
        f"Elapsed time: {time.time() - start_time:.2f} seconds | LINE | updated item status |"
    )

    df = df.drop(columns=["first_sale_date", "last_sale_date"])

    # Convert back to Pandas DataFrame if needed
    df_pandas_final = df.to_pandas()

    print("-" * 72)
    print(f"Total execution time: {(time.time() - start_time) / 60:.2f} minutes")
    print("-" * 72)

    return df_pandas_final

In [15]:
df_sales = merge_item_status_polars(df_sales)

Elapsed time: 23.24 seconds | LINE | df sorted |
Elapsed time: 23.33 seconds | LINE | item_status added |
Elapsed time: 28.64 seconds | LINE | first and last sale dates |
Elapsed time: 61.00 seconds | LINE | joined sale dates |
Elapsed time: 66.10 seconds | LINE | updated item status |
------------------------------------------------------------------------
Total execution time: 1.22 minutes
------------------------------------------------------------------------


In [18]:
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170319200 entries, 0 to 170319199
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   store_nbr        uint8         
 1   item_nbr         uint32        
 2   date             datetime64[ns]
 3   unit_sales       float32       
 4   item_status      int8          
 5   first_sale_date  datetime64[ns]
 6   last_sale_date   datetime64[ns]
dtypes: datetime64[ns](3), float32(1), int8(1), uint32(1), uint8(1)
memory usage: 5.4 GB


In [16]:
df_sales.head(20)

Unnamed: 0,store_nbr,item_nbr,date,unit_sales,item_status,first_sale_date,last_sale_date
0,1,96995,2013-01-01,,3,2013-01-10,2017-08-10
1,1,96995,2013-01-02,,3,2013-01-10,2017-08-10
2,1,96995,2013-01-03,,3,2013-01-10,2017-08-10
3,1,96995,2013-01-04,,3,2013-01-10,2017-08-10
4,1,96995,2013-01-05,,3,2013-01-10,2017-08-10
5,1,96995,2013-01-06,,3,2013-01-10,2017-08-10
6,1,96995,2013-01-07,,3,2013-01-10,2017-08-10
7,1,96995,2013-01-08,,3,2013-01-10,2017-08-10
8,1,96995,2013-01-09,,3,2013-01-10,2017-08-10
9,1,96995,2013-01-10,1.0,1,2013-01-10,2017-08-10


# 7. Stockout on store level for 4.8 for perishable and non-perishable items

•      Perishable good: when there are missing values for two consecutive days for a given item per individual store 

•      Nonperishable goods: when there are missing values for 7 consecutive days for a given item and per individual store

•      Action: Impute with algorithm 

In [None]:
#perishable good
if ['perishable'] == 1 and item_missing_count = 1  #-->  inpute with 0

if ['perishable'] == 1 and item_missing_count > 2  #-->  inpute with 0

if ['perishable'] == 1 and item_missing_count <= 2 #-->  inpute with mean? or intrapolate?


    
#non-perishable good  
if ['perishable'] == 0 and and item_missing_count > 7 #-->  inpute with 0?

if ['perishable'] == 0 and item_missing_count <= 7 #-->  inpute with mean? or intrapolate?

First function to impute stockouts, first needed to merge with item for perishable column

In [15]:
df_sales = df_sales.merge(df_items, left_on="item_nbr", right_on="item_nbr", how="left")

df_sales = df_sales.drop(columns=["family", "class"])

In [None]:
df_sales.info()

In [None]:
# Pandas

# --------------------------------------------------------------------------------------------


def impute_stockouts(df):

    # Sort the dataframe by date, store_nbr, and item_nbr

    df = df.sort_values(["store_nbr", "item_nbr", "date"])

    # Group by store_nbr and item_nbr

    df = df.groupby(["store_nbr", "item_nbr"])

    # Nested function inpute_group

    def impute_group(group):

        # Calculate the number of consecutive missing days

        group["missing_count"] = (
            group["unit_sales"]
            .isnull()  # checks for null values in unit_sales
            .astype(int)
            .groupby(
                (group["unit_sales"].notnull().astype(int).diff() != 0).cumsum()
            )  # First occurrence, so each streak of consecutive nulls or non-nulls gets a unique number
            .cumsum()  # Counts consecutive nulls within each group
        )

        # Imputation logic for perishable goods if = 1

        perishable_mask = group["perishable"] == 1

        group.loc[perishable_mask & (group["missing_count"] == 1), "unit_sales"] = 0

        group.loc[perishable_mask & (group["missing_count"] > 2), "unit_sales"] = 0

        group.loc[perishable_mask & (group["missing_count"] == 2), "unit_sales"] = (
            group.loc[perishable_mask, "unit_sales"].interpolate()  # .interpolate()!!!
        )

        # Imputation logic for non-perishable goods if = 0

        non_perishable_mask = group["perishable"] == 0

        group.loc[non_perishable_mask & (group["missing_count"] > 7), "unit_sales"] = 0

        group.loc[non_perishable_mask & (group["missing_count"] <= 7), "unit_sales"] = (
            group.loc[
                non_perishable_mask, "unit_sales"
            ].interpolate()  # .interpolate()!!!
        )

        # Drop temporary missing count column

        group = group.drop("missing_count", axis=1)

        return group

    # Apply the imputation to each group

    imputed_df = df.apply(impute_group).reset_index(drop=True)

    return imputed_df

In [22]:
def impute_group(group):

    # Calculate the number of consecutive missing days
    group["missing_count"] = (
        group["unit_sales"]
        .isnull()  # checks for null values in unit_sales
        .astype(int)
        .groupby(
            (group["unit_sales"].notnull().astype(int).diff() != 0).cumsum()
        )  # First occurrence, so each streak of consecutive nulls or non-nulls gets a unique number
        .cumsum()  # Counts consecutive nulls within each group
    )

    # Imputation logic for perishable goods if 1
    perishable_mask = group["perishable"] == 1
    group.loc[perishable_mask & (group["missing_count"] == 1), "unit_sales"] = 0
    group.loc[perishable_mask & (group["missing_count"] > 2), "unit_sales"] = 0
    group.loc[perishable_mask & (group["missing_count"] == 2), "unit_sales"] = (
        group.loc[perishable_mask, "unit_sales"].interpolate()  # to-do: Interpolate ()
    )

    # Imputation logic for non-perishable goods if 0
    non_perishable_mask = group["perishable"] == 0
    group.loc[non_perishable_mask & (group["missing_count"] > 7), "unit_sales"] = 0
    group.loc[non_perishable_mask & (group["missing_count"] <= 7), "unit_sales"] = (
        group.loc[
            non_perishable_mask, "unit_sales"
        ].interpolate()  # To-do: Interpolate ()
    )

    # Drop temporary missing count column
    group = group.drop("missing_count", axis=1)

    return group

In [23]:
def impute_stockouts(df):
    # Sort the dataframe by date, store_nbr, and item_nbr
    df = df.sort_values(["store_nbr", "item_nbr", "date"])

    # Group by store_nbr and item_nbr
    df_grouped = df.groupby(["store_nbr", "item_nbr"])

    # Apply the imputation to each group
    imputed_df = df_grouped.apply(impute_group).reset_index(drop=True)

    return imputed_df

In [29]:
df_sales_test = df_sales[df_sales["item_nbr"].isin([103665, 1398687, 2011451])]

In [None]:
df_sales_test = impute_stockouts(df_sales_test)

In [None]:
df_sales_test.head(20)

In [None]:
df_sales_test.tail(20)

In [None]:
df_sales_test.sample(60)

In [None]:
df_sales_test.info()

In [None]:
# Interpolate between missing datapoints --> sales

fillna(method="mean")

df["column_name"].interpolate(method="linear", inplace=True)

df["column_name"].interpolate(method="time", inplace=True)

df["column_name"].interpolate(method="polynomial", order=2, inplace=True)

# Interpolate missing values for the 'unit_sales' column
df["unit_sales"] = df.groupby(["store_nbr", "item_nbr"])["unit_sales"].apply(
    lambda group: group.interpolate(method="linear")
)

# 8. Promotion Count

The number of days a item was on promotion 

In [None]:
# df["year"] = df["date"].dt.year
# df["month"] = df["date"].dt.month
# df["day"] = df["date"].dt.day

# Adding week number feature
df_sales["week_number"] = df_sales["date"].dt.isocalendar().week

In [14]:
import matplotlib.pyplot as plt
import seaborn as sns


# Example 3: Line plot of missing values over time
def plot_missing_values_time(df):
    missing_over_time = df.set_index("date")["onpromotion"].isnull().resample("D").sum()
    plt.figure(figsize=(12, 6))
    plt.plot(missing_over_time.index, missing_over_time.values)
    plt.title("Missing Values in onpromotion Column Over Time")
    plt.xlabel("Date")
    plt.ylabel("Number of Missing Values")
    plt.show()

In [None]:
df = df_sales

plot_missing_values_time(df)

In [None]:
df_sales.info()

# X.X df_sales_agg_week code to test

In [None]:
# Group the DataFrame by store number, item number, year, and week number, then aggregate the columns:
# --> "unit_sales","onpromotion", "national_hol","regional_hol","local_hol", "closed_hol",


def df_sales_agg_week(df):

    # Pre-sort the DataFrame, should be faster?

    df = df.sort_values(["store_nbr", "item_nbr", "year", "week_number"])

    df = (
        df.groupby(
            [
                "store_nbr",
                "item_nbr",
                "year",
                "week_number",
            ],
            sort=False,  # We've already sorted, so no need to sort again/double
        )
        .agg(
            {
                "unit_sales": "sum",
                "onpromotion": "sum",
                "national_hol": "sum",
                "regional_hol": "sum",
                "local_hol": "sum",
                "closed_hol": "sum",
            }
        )
        .reset_index()
    )

    return df

# 9. Closed Holiday Column Feature

The number of closed holidays within the week, expressed in percentages

# 10. Weekday feature

In [None]:
def weekday_weeknumber(df):

    # Ensure the date column is sorted
    df = df.sort_values("date")

    # Add column with ISO year
    df["year"] = df["date"].iso_calendar.year.astype("int16")

    # Add column with weekday (1-7, where 1 is Monday)
    df["weekday"] = (df["date"].dt.dayofweek + 1).astype("int8")

    # Add column with ISO week number (1-53)
    df["week_nbr"] = df["date"].dt.isocalendar().week.astype("int8")

    # Calculate the date of the Monday of the first week
    first_date = df["date"].iloc[0]
    days_to_last_monday = (first_date.weekday() - 0 + 7) % 7
    monday_first_week = first_date - pd.Timedelta(days=days_to_last_monday)

    # Calculate cumulative week numbers starting from the first Monday
    df["week_number_cum"] = (
        ((df["date"] - monday_first_week).dt.days // 7) + 1
    ).astype("int16")

    return df

# Check Start size of df_sales:     2.84 GB and start observations:     113.0 million -->  320.2 million rows