In [1]:
# ==============================================================
# 📘 01_data_init.ipynb  (PyYAML version)
# Initial data preparation & verification
# - CSV → Parquet (split parts or partitioned)
# - Automatically repairs source.local_dir in the YAML config
# ==============================================================

# 0) Bootstrap project root (works when run from notebooks/ or from the repo root)
import sys
from pathlib import Path

# Resolve the current working directory. When the notebook is started from
# inside notebooks/, the project root is the directory one level up.
ROOT_CANDIDATE = Path().resolve()
if ROOT_CANDIDATE.name == "notebooks":
    PROJECT_ROOT = ROOT_CANDIDATE.parent
else:
    PROJECT_ROOT = ROOT_CANDIDATE

# Register the project root on sys.path once (no duplicate entries on re-run).
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

print(f"üì¶ Bootstrapped project root: {PROJECT_ROOT}")

# 1) Imports & Theme
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio
import yaml  # <-- PyYAML statt ruamel.yaml

from src.data_loader import DataLoader, ensure_dir

# Project locations used by the rest of the notebook.
DATA_DIR = PROJECT_ROOT.joinpath("data")
CONFIG_PATH = PROJECT_ROOT.joinpath("configs", "data", "active.yaml")

# Plotly: dark template plus default width / height / scale on pio.defaults.
pio.templates.default = "plotly_dark"
pio.defaults.default_width, pio.defaults.default_height = 900, 500
pio.defaults.default_scale = 2

# ensure_dir is a project helper; presumably it creates the directory when
# it is missing (TODO confirm in src.data_loader).
ensure_dir(DATA_DIR)
print(f"üìÅ Data directory: {DATA_DIR}")
print(f"‚öôÔ∏è Config file:    {CONFIG_PATH}  (exists={CONFIG_PATH.exists()})")

# 2) Automatically repair source.local_dir in the YAML config (PyYAML)
if not CONFIG_PATH.exists():
    raise FileNotFoundError(f"Config file not found: {CONFIG_PATH}")

with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    # An empty file loads as None -> start from an empty mapping.
    cfg = yaml.safe_load(f) or {}

# Anything other than a mapping at the root cannot carry a `source` section;
# fail with a clear message instead of an AttributeError further down.
if not isinstance(cfg, dict):
    raise TypeError(
        f"Config root must be a mapping, got {type(cfg).__name__}: {CONFIG_PATH}"
    )

# A bare `source:` key loads as None. setdefault() would keep that None and
# the .get() below would crash, so replace it explicitly.
if cfg.get("source") is None:
    cfg["source"] = {}
elif not isinstance(cfg["source"], dict):
    raise TypeError(
        f"'source' must be a mapping, got {type(cfg['source']).__name__}: {CONFIG_PATH}"
    )

correct_path = str(DATA_DIR)

if str(cfg["source"].get("local_dir")) != correct_path:
    cfg["source"]["local_dir"] = correct_path
    # NOTE: a PyYAML load/dump round trip does not preserve comments or
    # custom formatting that the config file may contain.
    with open(CONFIG_PATH, "w", encoding="utf-8") as f:
        # sort_keys=False keeps the key order roughly as it was loaded
        yaml.safe_dump(cfg, f, sort_keys=False, allow_unicode=True)
    print(f"üõ†Ô∏è YAML updated: source.local_dir = {correct_path}")
else:
    print("‚úÖ YAML path already correct.")

# 3) Initialise the DataLoader from the active config file
loader = DataLoader(config_path=CONFIG_PATH)
print("‚úÖ DataLoader initialisiert.")

# Optional: list what is in the data directory before preprocessing runs
print("\nüßæ Files BEFORE preprocessing:")
for f in sorted(DATA_DIR.glob("*")):
    print(f"    {f.name}")

# 4) Run the CSV -> Parquet conversion (split parts or partitions)
print("\nüöÄ Running preprocessing (CSV ‚Üí Parquet SPLIT/partition)‚Ä¶")
loader.preprocess_csv_to_parquet()
print("‚úÖ Conversion complete.")

# 5) Show the generated Parquet artefacts (top-level files and partitions)
print("\nüì¶ Generated Parquet files or folders:")
parquets    = sorted(DATA_DIR.glob("*.parquet"))
# Kept for later use in the notebook; every match is already in `parquets`.
part_files  = sorted(DATA_DIR.glob("train_part*.parquet"))
partitions  = sorted(DATA_DIR.glob("train/**/*.parquet"))

# "*.parquet" is a superset of "train_part*.parquet", so a separate branch for
# the parts can never trigger. Top-level files and partition files are
# independent of each other, so list both groups rather than only the first
# non-empty one.
if parquets or partitions:
    for f in parquets + partitions:
        size_mb = f.stat().st_size / 1e6
        # Path objects reject width/alignment format specs (TypeError), so
        # format the relative path as a plain string. For top-level files the
        # relative path is just the file name.
        print(f"   {str(f.relative_to(DATA_DIR)):<40} {size_mb:>8.1f} MB")
else:
    print("   ‚ö†Ô∏è No Parquet files found. Check preprocessing configuration.")

# Hint in case a monolithic train.parquet is still lying around
mono_train = DATA_DIR / "train.parquet"
if mono_train.exists():
    print("\n‚ö†Ô∏è NOTE: Found monolithic 'train.parquet'.")
    print("   Deine Config zielt auf SPLIT/Partition. L√∂sche 'train.parquet' einmalig und preprocess erneut,")
    print("   wenn du nur train_part*.parquet bzw. partitionierte Ordner willst.")
# 6) Train laden (Parts oder Partition), nur zur Sichtpr√ºfung
def _collect_train_paths(data_dir: "Path | None" = None) -> list[Path]:
    """Return the train parquet files to load, in natural (numeric) order.

    Lookup order: ``train_part*.parquet`` split parts, then any parquet file
    below ``train/`` (partitioned layout), then a single ``train.parquet``.

    Args:
        data_dir: Directory to search. When omitted, the module-level
            ``DATA_DIR`` is used, so existing zero-argument calls keep working.

    Returns:
        The matching paths, or an empty list when nothing is found.
    """
    import re  # local import keeps this block self-contained

    base = DATA_DIR if data_dir is None else data_dir

    def _natural_key(path: Path) -> list:
        # re.split with a capturing group alternates text / digit runs, so odd
        # positions are always the numbers. Comparing them as ints puts
        # "part2" before "part10"; plain string sorting yields
        # part1, part10, ..., part2, which scrambles row order on concat.
        tokens = re.split(r"(\d+)", str(path))
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)]

    paths = sorted(base.glob("train_part*.parquet"), key=_natural_key)
    if not paths:
        paths = sorted(base.glob("train/**/*.parquet"), key=_natural_key)
    if not paths and (base / "train.parquet").exists():
        paths = [base / "train.parquet"]
    return paths

train_paths = _collect_train_paths()

if not train_paths:
    print("\n‚ö†Ô∏è No train parts found. Pr√ºfe YAML (preprocess.partition_by / part_size_rows).")
else:
    # Name the layout that was found, for the log line only.
    if any(p.name.startswith("train_part") for p in train_paths):
        label = "parts"
    else:
        label = "partitions/mono"
    print(f"\nüîó Found {len(train_paths)} train {label} ‚Üí concatenating preview ‚Ä¶")

    # Read every file and stack the frames under a fresh 0..n-1 index.
    df_train = pd.concat(list(map(pd.read_parquet, train_paths)), ignore_index=True)
    print(f"‚úÖ Combined train: {df_train.shape[0]:,} rows √ó {df_train.shape[1]} columns")

    display(df_train.head(10))

    if "date" in df_train:
        print(f"üìÖ Date range: {df_train['date'].min()} ‚Üí {df_train['date'].max()}")

    # Per-column NA counts; only columns with at least one NA are printed.
    na_counts = df_train.isna().sum()
    if na_counts.gt(0).any():
        print("\nüï≥Ô∏è Missing values per column (non-zero):")
        print(na_counts.loc[na_counts.gt(0)].sort_values(ascending=False))

    if "unit_sales" in df_train:
        fig = px.histogram(
            df_train,
            x="unit_sales",
            nbins=60,
            title="Unit Sales Distribution (combined train)",
        )
        fig.show()

    # Duplicate check on whichever of the key columns are present
    key_cols = [col for col in ("id", "date", "store_nbr") if col in df_train]
    if key_cols:
        dup_cnt = df_train.duplicated(subset=key_cols).sum()
        print(f"\nüßπ Dedup Check on {key_cols}: duplicates = {dup_cnt:,}")
        if dup_cnt != 0:
            print("‚ùóDuplicates detected ‚Äî pr√ºfe preprocess.deduplicate/_global.")
        else:
            print("‚úÖ No duplicate (id, date, store_nbr) combinations.")

# 7) Quick check of the meta datasets
for name in ["stores", "items", "oil", "holidays_events", "transactions", "test", "sample_submission"]:
    f = DATA_DIR / f"{name}.parquet"
    if f.exists():
        df_meta = pd.read_parquet(f)
        meta_label = f"{name}.parquet"
    else:
        # The data directory may hold `<name>_partN.parquet` split parts
        # instead of one file (this notebook's recorded listing shows e.g.
        # stores_part1.parquet), so fall back to those. Sorting by
        # (length, name) orders part2 before part10.
        meta_parts = sorted(
            DATA_DIR.glob(f"{name}_part*.parquet"),
            key=lambda p: (len(p.name), p.name),
        )
        if not meta_parts:
            # Say so explicitly rather than skipping the dataset silently.
            print(f"\n   (no parquet found for '{name}' - skipped)")
            continue
        df_meta = pd.concat([pd.read_parquet(p) for p in meta_parts], ignore_index=True)
        meta_label = f"{name}_part*.parquet"
    print(f"\nüìÑ {meta_label} ‚Üí {len(df_meta):,} rows √ó {len(df_meta.columns)} cols")
    display(df_meta.head())

# 8) Summary
print("\nüéØ Data initialization complete!")
print("Next: open `02_eda_overview.ipynb` oder nutze `DataLoader().load_train_data()` im Code.")

üì¶ Bootstrapped project root: /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting




üìÅ Data directory: /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting/data
‚öôÔ∏è Config file:    /Users/kiko/Desktop/github/Corporacion-Favorita-Grocery-Sales-Forecasting/configs/data/active.yaml  (exists=True)
‚úÖ YAML path already correct.
‚úÖ DataLoader initialisiert.

üßæ Files BEFORE preprocessing:
    holidays_events.csv
    holidays_events_part1.parquet
    items.csv
    items_part1.parquet
    oil.csv
    oil_part1.parquet
    sample_submission.csv
    sample_submission_part1.parquet
    stores.csv
    stores_part1.parquet
    test.csv
    test_part1.parquet
    train.csv
    train_part1.parquet
    train_part10.parquet
    train_part11.parquet
    train_part12.parquet
    train_part13.parquet
    train_part14.parquet
    train_part15.parquet
    train_part16.parquet
    train_part17.parquet
    train_part18.parquet
    train_part19.parquet
    train_part2.parquet
    train_part20.parquet
    train_part21.parquet
    train_part3.parquet
    train_part

KeyboardInterrupt: 