# Stage 06 — Data Preprocessing (homework06)

This notebook demonstrates a data-cleaning pipeline whose raw and processed data directories are configured as absolute paths in a `.env` file.

In [None]:
import os, sys
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

# Load variables from the .env file in the current working directory.
load_dotenv(Path('.env'))


def _env_path(var_name: str) -> Path:
    """Return the path stored in environment variable *var_name*.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check,
            ``Path(None)`` fails with an opaque ``TypeError`` that does not
            say which variable is missing.
    """
    value = os.getenv(var_name)
    if not value:
        raise RuntimeError(
            f"Environment variable {var_name} is not set; define it in .env "
            "or export it before running this notebook."
        )
    return Path(value)


RAW = _env_path("DATA_DIR_RAW")
PROC = _env_path("DATA_DIR_PROCESSED")

# Create both directories (and any missing parents) so later cells can
# read from RAW and write to PROC without further checks.
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

print("RAW =", RAW)
print("PROC =", PROC)

## Load dataset (demo if none exists)

In [None]:
# Collect the CSV files in the raw-data directory, in sorted order.
raw_csvs = sorted(RAW.glob("*.csv"))
if len(raw_csvs) == 0:
    # Nothing to load yet: write a small demo dataset (with gaps in the
    # "price" column) and use it as the only raw file.
    demo_path = RAW / "demo_raw.csv"
    demo = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "category": ["A", "B", "A", "B", "A"],
        "price": [10, None, 30, 40, None],
        "date": pd.date_range("2025-01-01", periods=5),
    })
    demo.to_csv(demo_path, index=False)
    raw_csvs = [demo_path]

# Use the first file in sorted order.
RAW_FILE = raw_csvs[0]

# Read only the header row first, so 'date' is parsed as a datetime column
# only when the file actually has such a column.
header = pd.read_csv(RAW_FILE, nrows=0).columns
date_columns = ['date'] if 'date' in header else None
df = pd.read_csv(RAW_FILE, parse_dates=date_columns)

print("Loaded:", RAW_FILE)
df.head()

## Import cleaning utilities

In [None]:
# Make the resolved current working directory importable before importing
# the cleaning helpers from the `src` package.
project_root = str(Path('.').resolve())
if project_root not in sys.path:
    sys.path.append(project_root)
from src.cleaning import fill_missing_median, drop_missing, normalize_data

## Apply cleaning pipeline

In [None]:
# Run the three cleaning steps in sequence, keeping each intermediate frame
# so the shapes can be compared afterwards.
df_filled = fill_missing_median(df)
df_dropped = drop_missing(df_filled)
df_clean = normalize_data(df_dropped, method="zscore")

# Report the (rows, columns) shape after every stage, original frame first.
stages = (df, df_filled, df_dropped, df_clean)
print("Shapes:", *(frame.shape for frame in stages))
df_clean.head()

## Save cleaned dataset

In [None]:
from datetime import datetime

# Put a YYYYmmdd-HHMMSS timestamp in the filename so each run writes a new
# file instead of overwriting the previous output.
ts = format(datetime.now(), "%Y%m%d-%H%M%S")
out = PROC / ("cleaned_" + ts + ".csv")
df_clean.to_csv(out, index=False)
print("Saved ->", out)
out