In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from Favorita_TSA.data_loader import parquet_loader
from Favorita_TSA.dataset import Dataset
from Favorita_TSA.viz.color_manager import ColorManager
from Favorita_TSA.viz.ploty_export import save_all
from Favorita_TSA.viz.ploty_theme import set_plotly_theme

In [2]:
import os
from pathlib import Path

print("Current working directory:", os.getcwd())

# Switch the working directory to the parent of the start directory (the
# project root when the notebook is started from its own folder).
# Path("..") is relative to the *current* directory, so resolving it again on
# a re-run of this cell would climb one more level each time. Compute the root
# only once per kernel so the cell is idempotent.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = Path("..").resolve()

os.chdir(PROJECT_ROOT)

# Confirm that the working directory was set correctly.
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\Kiko\Desktop\Code\tutoring\Group-Work-Favorita-Forecasting\notebooks
Current working directory: C:\Users\Kiko\Desktop\Code\tutoring\Group-Work-Favorita-Forecasting


In [3]:
# Theme first, palette second (same order as before).
# NOTE(review): presumably set_plotly_theme() installs the project's default
# Plotly template — confirm in Favorita_TSA.viz.ploty_theme.
set_plotly_theme()

# Colour palette object; later cells read c.forecast, c.anomaly and c.border.
color_manager = ColorManager()
c = color_manager.get_colors()

In [4]:
# Load each table through parquet_loader, in the same order as before, and
# bind the results to the df_* names used throughout the notebook.
(
    df_oil,
    df_items,
    df_holidays,
    df_stores,
    df_transactions,
    df_train,
) = (
    parquet_loader(dataset)
    for dataset in (
        Dataset.OIL,
        Dataset.ITEMS,
        Dataset.HOLIDAYS_EVENTS,
        Dataset.STORES,
        Dataset.TRANSACTIONS,
        Dataset.TRAIN,
    )
)

BASE_DIR data\processed\train


In [5]:
# Print the first rows of every dataset.
# The six tables below are already in memory from the loading cell; reuse them
# instead of reading each file again (df_train alone is ~3.7 GB in memory).
# Any other Dataset member is still loaded on demand, so every member is
# previewed exactly as before.
loaded_frames = {
    Dataset.OIL: df_oil,
    Dataset.ITEMS: df_items,
    Dataset.HOLIDAYS_EVENTS: df_holidays,
    Dataset.STORES: df_stores,
    Dataset.TRANSACTIONS: df_transactions,
    Dataset.TRAIN: df_train,
}

for element in Dataset:
    frame = loaded_frames.get(element)
    if frame is None:
        frame = parquet_loader(element)
    print(element, "\n", frame.head(), "\n")

Dataset.OIL 
          date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20 

Dataset.ITEMS 
    item_nbr        family  class  perishable
0     96995     GROCERY I   1093           0
1     99197     GROCERY I   1067           0
2    103501      CLEANING   3008           0
3    103520     GROCERY I   1028           0
4    103665  BREAD/BAKERY   2712           1 

Dataset.HOLIDAYS_EVENTS 
          date     type    locale locale_name                    description  \
0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobamba   

   transferred  
0        False  
1    

In [7]:
# Summarise df_train: column names and dtypes, row count and memory footprint.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125497040 entries, 0 to 125497039
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int16         
 3   item_nbr     int32         
 4   unit_sales   float64       
 5   onpromotion  boolean       
dtypes: boolean(1), datetime64[ns](1), float64(1), int16(1), int32(1), int64(1)
memory usage: 3.7 GB


In [6]:
# Sum unit_sales for every (date, store_nbr) pair. Without as_index=False the
# result is a Series whose index is built from the two grouping columns.
# NOTE: the following cell rebinds sale_day_store_level to a DataFrame.
per_store_day = df_train.groupby(["date", "store_nbr"])
sale_day_store_level = per_store_day["unit_sales"].sum()

In [7]:
# Total unit_sales per (date, store_nbr), returned as a flat DataFrame with
# the grouping keys kept as ordinary columns (as_index=False).
store_day_keys = ["date", "store_nbr"]
sale_day_store_level = df_train.groupby(store_day_keys, as_index=False)["unit_sales"].sum()

In [10]:
# Collapse the store dimension: one row per date holding the unit_sales summed
# over all stores.
sales_by_date = sale_day_store_level.groupby("date", as_index=False)
df_time = sales_by_date["unit_sales"].sum()

In [15]:
# Plot the per-date total (df_time: one row per date).
# The store-level frame has one row per (date, store_nbr); drawing it as a
# single line makes the trace jump between all stores on every date instead of
# showing the total the title promises.
fig = px.line(
    df_time,
    x="date",
    y="unit_sales",
    markers=True,
    title="Total Unit Sales over Time",
)

fig.update_layout(xaxis_title="Date", yaxis_title="Unit Sales")

fig.show()

In [9]:
# Parse the oil dates as datetimes so they can be merged with the sales dates.
df_oil["date"] = pd.to_datetime(df_oil["date"])

# Daily total sales with the oil price of the same date attached.
# The left join keeps every sales date, even when no oil price exists for it.
sales_oil = pd.merge(
    df_train.groupby("date")["unit_sales"].sum().reset_index(),
    df_oil,
    on="date",
    how="left",
)

fig = go.Figure()

# (column, legend label, y-axis reference): sales on the left axis, oil on the right.
trace_specs = (
    ("unit_sales", "Total Unit Sales", "y1"),
    ("dcoilwtico", "Oil Prices", "y2"),
)
for value_column, trace_name, axis_ref in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=sales_oil["date"],
            y=sales_oil[value_column],
            name=trace_name,
            mode="lines",
            opacity=1.0,
            yaxis=axis_ref,
        )
    )

# The second y-axis is drawn over the first one and placed on the right-hand side.
fig.update_layout(
    title="Daily Sales vs Oil Prices",
    xaxis=dict(title="Date"),
    yaxis=dict(title="Total Unit Sales", side="left"),
    yaxis2=dict(title="Oil Prices", overlaying="y", side="right"),
    legend=dict(x=0.01, y=0.99),
)

fig.show()
# NOTE(review): the original comment here read "(overwrite = True)" —
# confirm how save_all treats existing files.
save_all(fig, "eda/daily_sales_vs_oil_prices")

In [10]:
# Total unit sales per store, ranked from the best- to the worst-selling store.
store_sales = (
    df_train.groupby("store_nbr")["unit_sales"]
    .sum()
    .reset_index()
    .sort_values(by="unit_sales", ascending=False)
)

# Label the five best and five worst stores; everything else is "Other".
store_sales["category"] = "Other"

top5_idx = store_sales.nlargest(5, "unit_sales").index
bottom5_idx = store_sales.nsmallest(5, "unit_sales").index

store_sales.loc[top5_idx, "category"] = "Top 5"
store_sales.loc[bottom5_idx, "category"] = "Bottom 5"

fig = px.bar(
    store_sales,
    x="store_nbr",
    y="unit_sales",
    color="category",
    title="Total Unit Sales Per Store (Top 5 & Bottom 5 Highlighted)",
    color_discrete_map={
        "Top 5": c.forecast,
        "Bottom 5": c.anomaly,
        "Other": c.border,
    },
)

fig.update_layout(
    xaxis_title="Store Number",
    yaxis_title="Total Sales",
    xaxis_tickangle=-90,
)

# store_nbr is numeric, so on a linear axis the bars are placed by store number
# and the ranking above is invisible. Treat the store numbers as categories
# and order them by bar height so the chart really shows the ranking.
fig.update_xaxes(type="category", categoryorder="total descending")

fig.show()

In [11]:
# Add calendar year and month columns to df_train (derived from "date").
sale_dates = df_train["date"].dt
df_train["year"] = sale_dates.year
df_train["month"] = sale_dates.month

# Total unit_sales for every (year, month) combination.
year_month_groups = df_train.groupby(["year", "month"])
monthly_sales_by_year = year_month_groups["unit_sales"].sum().reset_index()

# One line per year, month number on the x-axis.
fig = px.line(
    monthly_sales_by_year,
    x="month",
    y="unit_sales",
    color="year",
    markers=True,
    title="Monthly Sales Trend Across Years",
)

# Linear ticks starting at 1 with step 1: one tick per month number.
fig.update_layout(
    xaxis=dict(title="Month", tickmode="linear", tick0=1, dtick=1),
    yaxis=dict(title="Total Units Sold"),
    legend_title_text="Year",
)

fig.show()

In [12]:
# Create a sequential month-year column for better visualization
df_train["year_month"] = df_train["date"].dt.to_period("M")  # Format: YYYY-MM

# Aggregate sales by year-month
monthly_sales = df_train.groupby("year_month")["unit_sales"].sum().reset_index()

# Convert year_month to string for plotting
monthly_sales["year_month"] = monthly_sales["year_month"].astype(str)

monthly_sales_plot = monthly_sales.copy()
monthly_sales_plot["year_month"] = pd.to_datetime(monthly_sales_plot["year_month"])

fig = px.line(
    monthly_sales_plot,
    x="year_month",
    y="unit_sales",
    markers=True,
    title="Consecutive Monthly Sales Trend Over Years",
)

fig.update_layout(
    xaxis_title="Year-Month",
    yaxis_title="Total Sales",
)

fig.show()

In [13]:
# Daily totals across all stores, plus a 30-row rolling mean and the month number.
daily_totals = df_train.groupby("date")["unit_sales"].sum()
daily_sales = daily_totals.reset_index()
daily_sales["rolling_avg"] = daily_sales["unit_sales"].rolling(window=30).mean()
daily_sales["month"] = daily_sales["date"].dt.month

# Average daily sales for each calendar month (all years pooled).
month_groups = daily_sales.groupby("month")
monthly_sales = month_groups["unit_sales"].mean().reset_index()

fig = px.line(
    monthly_sales,
    x="month",
    y="unit_sales",
    markers=True,
    title="Monthly Seasonality in Sales",
)

# Show month abbreviations instead of the numbers 1-12 on the x-axis.
month_numbers = list(range(1, 13))
month_labels = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
fig.update_layout(
    xaxis={
        "title": "Month",
        "tickmode": "array",
        "tickvals": month_numbers,
        "ticktext": month_labels,
    },
    yaxis={"title": "Average Sales"},
)

# Plain thousands-separated numbers on the y-axis, no exponent notation.
fig.update_yaxes(tickformat=",.0f", exponentformat="none", showexponent="none")

fig.show()

In [14]:
# Extract day of the week from the date (Monday=0, Sunday=6)
daily_sales["day_of_week"] = daily_sales["date"].dt.dayofweek

# Aggregate sales by day of the week
weekly_sales = daily_sales.groupby("day_of_week")["unit_sales"].mean().reset_index()

fig = px.line(
    weekly_sales,
    x="day_of_week",
    y="unit_sales",
    markers=True,
    title="Weekly Seasonality in Sales",
)

fig.update_layout(
    xaxis_title="Day of Week",
    yaxis_title="Average Sales",
    xaxis={
        "tickmode": "array",
        "tickvals": list(range(7)),
        "ticktext": ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
    },
)

# ✔️ Fix für große Zahlen (lesbare Achse)
fig.update_yaxes(
    tickformat=",.0f",
    exponentformat="none",
    showexponent="none",
)

fig.show()

In [18]:
from plotly.subplots import make_subplots
from statsmodels.tsa.seasonal import seasonal_decompose

# Prepare the time series: date as index, sorted chronologically.
ts = daily_sales.set_index("date")["unit_sales"].sort_index()

# Optional (if the series has gaps): ts = ts.asfreq("D").interpolate()

# Additive decomposition with a yearly period. It is computed once; the
# original cell ran the same decomposition twice under two names.
decomp = seasonal_decompose(ts, model="additive", period=365)
decomposition = decomp  # earlier name kept as an alias for the same result

fig = make_subplots(
    rows=4,
    cols=1,
    shared_xaxes=True,
    subplot_titles=("unit_sales", "Trend", "Seasonal", "Residual"),
)

# One panel per component, stacked top to bottom in this order.
components = (
    ("Observed", decomp.observed),
    ("Trend", decomp.trend),
    ("Seasonal", decomp.seasonal),
    ("Residual", decomp.resid),
)
for panel_row, (component_name, component_values) in enumerate(components, start=1):
    fig.add_trace(
        go.Scatter(x=ts.index, y=component_values, mode="lines", name=component_name),
        row=panel_row,
        col=1,
    )

fig.update_layout(
    height=900,
    title="Seasonal Decomposition (Additive, period=365)",
    showlegend=False,
)

# Make large numbers readable on the observed and trend panels only.
for panel_row in (1, 2):
    fig.update_yaxes(
        tickformat=",.0f",
        exponentformat="none",
        showexponent="none",
        row=panel_row,
        col=1,
    )

fig.show()