In [1]:
import os
from pathlib import Path

import plotly.express as px

from Favorita_TSA.data_loader import parquet_loader
from Favorita_TSA.dataset import Dataset
from Favorita_TSA.viz.ploty_export import save_all
from Favorita_TSA.viz.ploty_theme import set_plotly_theme

In [2]:
# Make the project root the working directory so that the relative data
# paths used by the loaders resolve the same way on every run.
print("Current working directory:", os.getcwd())

# Only step up one level while the kernel is still inside the notebooks
# folder. An unconditional ".." hop is not idempotent: every re-run of this
# cell would climb one directory higher and break the relative paths.
NOTEBOOKS_DIR_NAME = "notebooks"
_launch_dir = Path.cwd().resolve()
PROJECT_ROOT = (
    _launch_dir.parent if _launch_dir.name == NOTEBOOKS_DIR_NAME else _launch_dir
)

# Set the working directory to the main project directory
os.chdir(PROJECT_ROOT)

# Check that the directory was set correctly
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\Kiko\Desktop\Code\tutoring\Group-Work-Favorita-Forecasting\notebooks
Current working directory: C:\Users\Kiko\Desktop\Code\tutoring\Group-Work-Favorita-Forecasting


In [3]:
# Apply the project's Plotly theme before any figure in this notebook is built.
# NOTE(review): presumably this sets the default template for all later px
# figures — confirm in Favorita_TSA.viz.ploty_theme.
set_plotly_theme()

In [4]:
# Load every table through the project's parquet loader. The datasets are
# requested in the same order as before, with the training table last.
(
    df_oil,
    df_items,
    df_holidays,
    df_stores,
    df_transactions,
    df_train,
) = (
    parquet_loader(dataset)
    for dataset in (
        Dataset.OIL,
        Dataset.ITEMS,
        Dataset.HOLIDAYS_EVENTS,
        Dataset.STORES,
        Dataset.TRANSACTIONS,
        Dataset.TRAIN,
    )
)

BASE_DIR data\processed\train


In [5]:
def _sum_unit_sales(frame, keys):
    """Return total ``unit_sales`` per group of ``keys``, keeping the keys as columns."""
    return frame.groupby(keys, as_index=False)["unit_sales"].sum()


# Total unit sales per date over all rows of the training table.
df_daily = _sum_unit_sales(df_train, "date")

# Index of the 20 stores with the highest cumulative unit sales.
sales_per_store = df_train.groupby("store_nbr")["unit_sales"].sum()
top_stores = sales_per_store.nlargest(20).index

# Training rows that belong to those stores, then their per-date totals.
df_top = df_train.loc[df_train["store_nbr"].isin(top_stores)]
df_top_daily = _sum_unit_sales(df_top, ["date", "store_nbr"])

# Weekly totals; each week is keyed by the start timestamp of its period.
df_weekly = df_train.assign(
    week=lambda rows: rows["date"].dt.to_period("W").dt.start_time
).pipe(_sum_unit_sales, "week")

# Per-store daily totals for every store.
df_store_daily = _sum_unit_sales(df_train, ["store_nbr", "date"])

# Daily totals split by the promotion flag.
df_promo = _sum_unit_sales(df_train, ["date", "onpromotion"])

In [None]:
# One line per top store, showing its daily unit sales over time.
fig = px.line(
    data_frame=df_top_daily,
    x="date",
    y="unit_sales",
    color="store_nbr",
    title="Top Stores - Sales Over Time",
)
fig.show()

# Export the figure through the project's save helper.
# NOTE(review): the TypeError recorded under this cell complains about three
# positional arguments, which does not match this keyword call — it looks like
# stale output from an earlier version of the cell; re-run to confirm.
save_all(fig, "eda/Top_Stores_Over_Time", overwrite=True)

TypeError: save_all() takes 2 positional arguments but 3 were given

In [10]:
# Daily unit-sales totals over the whole training table. `df_daily` already
# holds exactly this groupby result (same key, same column, same sum), so it
# is reused here instead of aggregating `df_train` a second time.
df_global = df_daily

fig = px.line(
    df_global,
    x="date",
    y="unit_sales",
    title="Global Unit Sales Over Time",
)

fig.show()

In [15]:
# Total unit sales per week; the x-axis uses the week-start timestamps stored
# in the `week` column of `df_weekly`.
px.line(
    data_frame=df_weekly,
    x="week",
    y="unit_sales",
    title="Weekly Sales Trend",
)

In [18]:
# Heatmap of unit sales binned over date (x) and store number (y).
px.density_heatmap(
    data_frame=df_store_daily,
    title="Store Activity Heatmap",
    x="date",
    y="store_nbr",
    z="unit_sales",
)

In [22]:
# Distribution of the daily unit-sales totals for each promotion flag value,
# drawn on a logarithmic y-axis.
px.box(
    data_frame=df_promo,
    title="Promotion Impact on Daily Sales",
    x="onpromotion",
    y="unit_sales",
    log_y=True,
)

In [None]:
# Same per-store daily series as the earlier "Top Stores" figure; only the
# title differs.
px.line(
    data_frame=df_top_daily,
    title="Top 20 Stores - Sales Over Time",
    x="date",
    y="unit_sales",
    color="store_nbr",
)

In [8]:
# Sum unit sales per (year, day-of-year) pair so that each year can be drawn
# as its own line over a shared day-of-year axis.
df_yoy = (
    df_train.assign(
        year=lambda rows: rows["date"].dt.year,
        doy=lambda rows: rows["date"].dt.dayofyear,
    )
    .groupby(["year", "doy"], as_index=False)["unit_sales"]
    .sum()
)

fig = px.line(
    data_frame=df_yoy,
    title="Year-over-Year Sales Patterns",
    x="doy",
    y="unit_sales",
    color="year",
)
fig.show()

# Export the figure through the project's save helper.
save_all(fig, "eda/year_over_year_sales_patterns")

In [25]:
# Calendar order for the x-axis. groupby() sorts its keys, so the weekday
# names come back alphabetically (Friday, Monday, Saturday, ...) and the bars
# would otherwise be drawn in that order, hiding the weekly pattern.
WEEKDAY_ORDER = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

# Mean of `unit_sales` over the individual training rows of each weekday
# (this is not the mean of the daily totals).
df_weekday = (
    df_train.assign(weekday=lambda rows: rows["date"].dt.day_name())
    .groupby("weekday", as_index=False)["unit_sales"]
    .mean()
)

px.bar(
    df_weekday,
    x="weekday",
    y="unit_sales",
    # Force Monday..Sunday instead of the alphabetical row order of the frame.
    category_orders={"weekday": WEEKDAY_ORDER},
    title="Average Sales by Weekday",
)

In [26]:
# Per-store mean and standard deviation of `unit_sales` over the training rows.
df_store_stats = (
    df_train.groupby("store_nbr")["unit_sales"]
    .agg(["mean", "std"])
    .reset_index()
)

# One point per store; both axes are logarithmic.
px.scatter(
    data_frame=df_store_stats,
    title="Store Volume vs Volatility",
    x="mean",
    y="std",
    log_x=True,
    log_y=True,
)

In [29]:
# Total unit sales per item, ordered from best to worst seller. The item
# numbers are dropped so the index is simply the rank position (0 = top item).
df_item_rank = (
    df_train.groupby("item_nbr")["unit_sales"]
    .sum()
    .sort_values(ascending=False, ignore_index=True)
)

# Ranked totals on a logarithmic y-axis.
px.line(
    title="Item Sales Long-Tail Distribution",
    y=df_item_rank.values,
    log_y=True,
)

In [32]:
# Mean `unit_sales` over the training rows for each promotion flag value.
# Kept under its own name: rebinding `df_promo` here would replace the
# per-date promotion totals that the "Promotion Impact on Daily Sales" box
# plot reads, so re-running that cell afterwards would plot the wrong frame.
df_promo_mean = df_train.groupby("onpromotion")["unit_sales"].mean().reset_index()

px.bar(
    df_promo_mean,
    x="onpromotion",
    y="unit_sales",
    title="Average Sales: Promotion Lift",
)

In [35]:
# Naive seasonal baseline: the forecast for a date is the total sold exactly
# seven calendar days earlier. The lag is aligned on the date itself rather
# than on row position, so a date missing from `df_daily` produces a gap (NaN)
# instead of silently pairing every later day with the wrong weekday.
# When the dates are contiguous the result equals a plain 7-row shift.
LAG_DAYS = 7
sales_by_date = df_daily.set_index("date")["unit_sales"]
# shift(..., freq="D") moves the date labels forward and leaves the values as
# they are; reindexing on the original dates then looks up "date - 7 days".
lagged_sales = sales_by_date.shift(LAG_DAYS, freq="D").reindex(df_daily["date"])

# assign() returns a new frame, so `df_daily` itself is left untouched.
df_naive = df_daily.assign(naive=lagged_sales.to_numpy())

px.line(
    df_naive,
    x="date",
    y=["unit_sales", "naive"],
    title="Naive 7-Day Forecast Baseline",
)

In [36]:
# Daily totals plus the mean of the current and previous 13 rows. The first
# 13 rows have no full window and therefore stay NaN.
df_roll = df_daily.assign(
    rolling_14=lambda daily: daily["unit_sales"].rolling(window=14).mean()
)

px.line(
    data_frame=df_roll,
    title="Actual vs Rolling Mean",
    x="date",
    y=["unit_sales", "rolling_14"],
)