# 📊 EDA Overview
Exploratory Data Analysis for the Favorita Grocery Sales dataset.

In [1]:
# ==============================================================
# Setup & Imports
# ==============================================================
# Standard library first; third-party and project imports follow the
# sys.path bootstrap below because `src` is only importable once the
# project root is on sys.path.
import sys
import warnings
from pathlib import Path

# NOTE(review): this hides *every* warning for the whole kernel session,
# including pandas/plotly deprecation notices. Kept as-is and kept ahead of
# the third-party imports so import-time warnings stay suppressed as before.
warnings.filterwarnings("ignore")  # global clean mode

# --------------------------------------------------------------
# 0) Bootstrap project root
# --------------------------------------------------------------
# When the working directory is named "notebooks", its parent is taken as
# the project root; otherwise the working directory itself is used.
ROOT = Path().resolve()
ROOT_CANDIDATE = ROOT  # alias kept so code referring to the old name still works
PROJECT_ROOT = ROOT.parent if ROOT.name == "notebooks" else ROOT

# Guarded so that re-running this cell does not append the same entry again.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

import pandas as pd
import plotly.express as px
import plotly.io as pio
import yaml

from src.data_loader import DataLoader, ensure_dir

# --------------------------------------------------------------
# 1) Plotly defaults shared by every figure in this notebook
# --------------------------------------------------------------
pio.templates.default = "plotly_dark"
pio.defaults.default_width = 900
pio.defaults.default_height = 500
pio.defaults.default_scale = 2

# --------------------------------------------------------------
# 2) Paths & data loader
# --------------------------------------------------------------
DATA_DIR = PROJECT_ROOT / "data"
CONFIG_PATH = PROJECT_ROOT / "configs" / "data" / "active.yaml"

print(f"üìÇ Project root: {PROJECT_ROOT}")
print(f"üìÅ Data dir:    {DATA_DIR}")
print(f"‚öôÔ∏è Config path: {CONFIG_PATH}  (exists={CONFIG_PATH.exists()})")

loader = DataLoader(config_path=CONFIG_PATH)
print("‚úÖ DataLoader initialized.")

üì¶ Project root: /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting
üìÇ Project root: /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting
üìÅ Data dir:    /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting/data
‚öôÔ∏è Config path: /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting/configs/data/active.yaml  (exists=True)
‚úÖ DataLoader initialized.


In [2]:
from src.utils.files import ensure_dirs

# Folder that receives every figure exported by this notebook.
IMG_DIR = PROJECT_ROOT.joinpath("img", "reports", "eda_overview")

# Left as the final expression so the cell echoes whatever ensure_dirs returns.
ensure_dirs(IMG_DIR)

PosixPath('/Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting/img/reports/eda_overview')

## 📥 Load Train

In [3]:
# Read the training table through the project loader and report its size.
print("üì• Loading train dataset‚Ä¶")
train = loader.load_dataset("train")

row_count, col_count = train.shape
print(f"‚úÖ Loaded train: {row_count:,} rows √ó {col_count} columns")

# Preview of the first rows.
train.head()

üì• Loading train dataset‚Ä¶
üì¶ Found 5 parts for train. Merging‚Ä¶
‚úÖ Loaded 103,857,647 rows √ó 6 cols.
‚úÖ Loaded train: 103,857,647 rows √ó 6 columns


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25.0,103665.0,7.0,
1,1,2013-01-01,25.0,105574.0,1.0,
2,2,2013-01-01,25.0,105575.0,2.0,
3,3,2013-01-01,25.0,108079.0,1.0,
4,4,2013-01-01,25.0,108701.0,1.0,


## 📅 Date Range & Summary

In [4]:
# First and last observation date (skipped when there is no "date" column).
print("üìÖ Date Range:")
if "date" in train.columns:
    dates = train["date"]
    print(dates.min(), "‚Üí", dates.max())

# Summary statistics over every column, numeric or not.
print("\nüî¢ Basic Summary:")
train.describe(include="all")

üìÖ Date Range:
2013-01-01 00:00:00 ‚Üí 2017-01-22 00:00:00

üî¢ Basic Summary:


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
count,103857600.0,103857647,103857600.0,103857600.0,103857600.0,82199995
unique,,,,,,2
top,,,,,,False
freq,,,,,,76874911
mean,51928820.0,2015-05-14 06:02:55.569385728,27.35018,933399.0,8.647428,
min,0.0,2013-01-01 00:00:00,1.0,96995.0,-15372.0,
25%,25964410.0,2014-06-25 00:00:00,12.0,511394.0,2.0,
50%,51928820.0,2015-07-21 00:00:00,28.0,936341.0,4.0,
75%,77893230.0,2016-05-01 00:00:00,43.0,1260242.0,9.0,
max,103857600.0,2017-01-22 00:00:00,54.0,2124052.0,89440.0,


## 🕳️ Missing Values

In [5]:
# Missing-value count per column; columns without gaps are left out and the
# remainder is listed from most to least affected.
na = (
    train.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
na

onpromotion    21657652
store_nbr             1
item_nbr              1
unit_sales            1
dtype: int64

## 🏪 Store & Item Count

In [6]:
# Number of distinct store and item identifiers found in the training data.
stores, items = (
    train[column].nunique() for column in ("store_nbr", "item_nbr")
)

print(f"üè™ Unique Stores: {stores}")
print(f"üì¶ Unique Items:  {items}")

üè™ Unique Stores: 53
üì¶ Unique Items:  3900


## 📦 Store Distribution

In [7]:
# Number of train rows per store, ordered by store number.
store_counts = train["store_nbr"].value_counts().sort_index()

fig = px.bar(
    x=store_counts.index,
    y=store_counts.values,
    title="Store Distribution (Aggregated)",
    labels=dict(x="Store Number", y="Count"),
)
fig.show()

# Static copy of the chart for the report folder.
fig.write_image(str(IMG_DIR.joinpath("stores_distribution.png")))

## 📦 Item Distribution

In [18]:
# --- Top-N items by number of train rows, labelled with items metadata ---

top_n = 30


def _item_label(row: pd.Series) -> str:
    """Return the y-axis label for one row of the top-items table.

    Uses "<family> (#<item_nbr>)" when the row has a family, and falls back to
    "Item #<item_nbr>" when the family is missing (no metadata match).
    """
    item_id = int(row["item_nbr"])
    if pd.notna(row["family"]):
        return f"{row['family']} (#{item_id})"
    return f"Item #{item_id}"


# 1) Rank items by the number of rows they have in the train set
item_counts = (
    train["item_nbr"]
    .value_counts()
    .rename_axis("item_nbr")
    .reset_index(name="count")
)

top_items = item_counts.head(top_n).copy()

# 2) Load the item metadata
# NOTE(review): this rebinds `items`, which the "Store & Item Count" cell uses
# for the number of unique items.
items = loader.load_dataset("items")
# De-duplicate on the join key so the left join below can never add rows
# (which would show up as extra bars in the chart).
items_meta = items[["item_nbr", "family", "class", "perishable"]].drop_duplicates(
    subset="item_nbr"
)

# 3) Join: top items + metadata
top_items = top_items.merge(items_meta, on="item_nbr", how="left")

# 4) Label text for the y-axis (family + id)
top_items["label"] = top_items.apply(_item_label, axis=1)

# 5) Ascending sort, used below as the explicit category order of the y-axis
top_items = top_items.sort_values("count", ascending=True)

# Freeze the label order for the axis
label_order = top_items["label"].tolist()

# 6) Plot: labels on the left only, no text labels on the bars
fig = px.bar(
    top_items,
    x="count",
    y="label",
    orientation="h",
    title=f"Top {top_n} Most Frequent Items (with Families)",
    labels={
        "count": "Number of Sales Records",
        "label": "Item (Family + ID)",
    },
    hover_data=["item_nbr", "family", "class", "perishable"],
)

# Y-axis: show all labels in exactly this order
fig.update_yaxes(
    categoryorder="array",
    categoryarray=label_order,
    automargin=True,
)

# Make the figure tall enough that every bar gets its own label:
# 28 px per bar, but never less than 700 px.
bar_height = max(700, top_n * 28)
fig.update_layout(
    margin=dict(l=260, r=40, t=60, b=40),
    height=bar_height,
)

fig.show()

# File name follows top_n so it always matches the chart title.
top_items_png = f"top_{top_n}_items.png"
fig.write_image(str(IMG_DIR / top_items_png))
print(f"‚úÖ Saved {top_items_png} with all labels on the left.")

üì¶ Found 1 parts for items. Merging‚Ä¶
‚úÖ Loaded 4,100 rows √ó 4 cols.


‚úÖ Saved top_30_items.png with all labels on the left.


## 📈 Distribution of Sales Records per Item

In [9]:
# One row per item with the number of train rows recorded for it.
item_counts = (
    train["item_nbr"]
    .value_counts()
    .rename_axis(index="item_nbr")
    .rename("n_rows")
    .reset_index()
)

# Histogram of those per-item row counts.
fig = px.histogram(
    item_counts,
    x="n_rows",
    title="Distribution of Sales Records per Item",
    labels=dict(n_rows="Number of Rows (Sales Records)"),
    nbins=60,
)
fig.update_layout(bargap=0.05)
fig.show()

# Static copy of the chart for the report folder.
fig.write_image(str(IMG_DIR.joinpath("unit_sales_distribution.png")))

## 🗓️ Sales by Day of Week

In [10]:
# Mean unit_sales per weekday, Monday through Sunday.
#
# The weekday is used only as a grouping key and is NOT written back into
# `train`: adding a per-row string column to the shared frame changes what the
# earlier cells (head / describe / missing values) show when they are re-run.
# Grouping on the integer weekday (Monday == 0) also avoids materialising a
# day-name string for every row; names are attached to the 7-row result.
weekday_names = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]

mean_by_weekday = (
    train['unit_sales']
    .groupby(pd.to_datetime(train['date']).dt.dayofweek)
    .mean()
    .reindex(range(len(weekday_names)))  # weekdays absent from the data become NaN
)

dow = pd.DataFrame({
    'day_of_week': weekday_names,
    'unit_sales': mean_by_weekday.to_numpy(),
})

fig = px.bar(dow, x='day_of_week', y='unit_sales', title='Average Sales by Day of Week', template='plotly_dark')
fig.show()
fig.write_image(str(IMG_DIR / 'avg_sales_by_dayofweek.png'))

## 🏪 Store Performance

In [11]:
# Mean unit_sales per store, best-performing stores first.
store_perf = (
    train.groupby('store_nbr')['unit_sales']
    .mean()
    .reset_index()
    .sort_values('unit_sales', ascending=False)
)

fig = px.bar(store_perf.head(30), x='store_nbr', y='unit_sales', title='Top 30 Stores by Average Sales', template='plotly_dark')

# store_nbr is numeric, so by default the bars are placed on a continuous axis
# in store-number order and the ranking computed above is not visible.
# A category axis gives each store its own slot in the (descending) data order.
fig.update_xaxes(type='category')

fig.show()
fig.write_image(str(IMG_DIR / 'top_stores_avg_sales.png'))

## 🔥 Promotions & Sales

In [12]:
# Mean unit_sales per promotion status.
#
# groupby drops NaN keys by default, which would silently exclude every row
# whose promotion flag is missing (the missing-values cell reports them for
# `onpromotion`). dropna=False keeps them as a group of their own so the chart
# shows how that part of the data behaves as well.
promo_df = (
    train.groupby('onpromotion', dropna=False)['unit_sales']
    .mean()
    .reset_index()
)

# Give the missing-flag group a readable label; the other keys are shown as text.
promo_df['onpromotion'] = promo_df['onpromotion'].map(
    lambda flag: 'Unknown' if pd.isna(flag) else str(flag)
)

fig = px.bar(promo_df, x='onpromotion', y='unit_sales', title='Average Sales vs Promotion Status', template='plotly_dark')
fig.show()
fig.write_image(str(IMG_DIR / 'promo_vs_sales.png'))

## 📈 Daily Total Sales Over Time

In [13]:
# unit_sales summed over all rows of each date.
ts = train.groupby("date", as_index=False)["unit_sales"].sum()

fig = px.line(
    ts,
    x="date",
    y="unit_sales",
    title="Daily Total Sales",
)
fig.show()

# Static copy of the chart for the report folder.
fig.write_image(str(IMG_DIR.joinpath("total_sales_over_time.png")))