In [2]:
#!/usr/bin/env python3
"""
Stage 05 — Data Storage Homework
Implements:
1. Save dataframe in CSV + Parquet formats using env-driven paths.
2. Reload + validate shapes & dtypes.
3. Utility functions write_df/read_df (routing by suffix).
4. Ensures dirs exist, handles missing engines, prints clear validation.
"""

import os
from datetime import datetime
import pandas as pd
from dotenv import load_dotenv

# -------------------------
# Setup: Load env + dirs
# -------------------------
# Let python-dotenv populate the environment before the lookups below.
load_dotenv()

# Storage locations come from the environment, with relative fallbacks.
DATA_DIR_RAW = os.environ.get("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = os.environ.get("DATA_DIR_PROCESSED", "data/processed")

# Create both target directories up front; existing ones are left alone.
for _data_dir in (DATA_DIR_RAW, DATA_DIR_PROCESSED):
    os.makedirs(_data_dir, exist_ok=True)

# -------------------------
# Sample DataFrame
# -------------------------
# Five-row frame with one integer, string, float and boolean column each.
df = pd.DataFrame(
    {
        "id": range(1, 6),
        "name": ["Alice", "Bob", "Charlie", "Diana", "Evan"],
        "score": [88.5, 92.0, 79.5, 85.0, 90.5],
        "passed": [True, True, False, True, True],
    }
)

# Minute-resolution stamp shared by both output file names.
timestamp = f"{datetime.now():%Y%m%d-%H%M}"
csv_path = os.path.join(DATA_DIR_RAW, "sample_" + timestamp + ".csv")
parquet_path = os.path.join(DATA_DIR_PROCESSED, "sample_" + timestamp + ".parquet")

# -------------------------
# Utility Functions
# -------------------------
def write_df(df: pd.DataFrame, path: str) -> None:
    """Write a DataFrame to CSV or Parquet, routed by the file extension.

    Args:
        df: Frame to persist; the index is not written.
        path: Destination file. Its suffix (compared case-insensitively)
            must be ``.csv`` or ``.parquet``. Missing parent directories
            are created; a bare filename is written to the current
            working directory.

    Raises:
        ValueError: If the suffix is neither ``.csv`` nor ``.parquet``.

    Note:
        A missing Parquet engine is reported with an ``[ERROR]`` line on
        stdout instead of raising, so in that case no file is produced.
    """
    suffix = os.path.splitext(path)[1].lower()

    # Reject unsupported targets before touching the filesystem.
    if suffix not in (".csv", ".parquet"):
        raise ValueError(f"Unsupported file extension: {suffix}")

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a parent when there is one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    if suffix == ".csv":
        df.to_csv(path, index=False)
        print(f"[OK] DataFrame written to CSV: {path}")
        return

    try:
        df.to_parquet(path, index=False, engine="pyarrow")
    except ImportError:
        # Best-effort: report the missing engine and carry on.
        print("[ERROR] Parquet engine missing. Install 'pyarrow' or 'fastparquet'.")
    else:
        print(f"[OK] DataFrame written to Parquet: {path}")

def read_df(path: str) -> pd.DataFrame:
    """Load a DataFrame from a CSV or Parquet file, chosen by extension.

    Args:
        path: File to read. Its suffix is compared case-insensitively.

    Returns:
        The frame loaded from ``path``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the suffix is neither ``.csv`` nor ``.parquet``.
        ImportError: If the Parquet engine cannot be imported.
    """
    _, ext = os.path.splitext(path)
    ext = ext.lower()

    # The existence check comes first, so a missing file wins over a bad suffix.
    if not os.path.exists(path):
        raise FileNotFoundError(f"[ERROR] File not found: {path}")

    if ext == ".csv":
        return pd.read_csv(path)

    if ext != ".parquet":
        raise ValueError(f"Unsupported file extension: {ext}")

    try:
        frame = pd.read_parquet(path, engine="pyarrow")
    except ImportError:
        raise ImportError("[ERROR] Parquet engine missing. Install 'pyarrow' or 'fastparquet'.")
    return frame

# -------------------------
# Save Data
# -------------------------
# Persist the same frame once per format.
write_df(df=df, path=csv_path)
write_df(df=df, path=parquet_path)

# -------------------------
# Reload + Validate
# -------------------------
# Read each file back so the two round trips can be compared.
df_csv = read_df(path=csv_path)
df_parquet = read_df(path=parquet_path)

def validate_frames(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    """Print a shape, column and dtype comparison of two DataFrames.

    The report labels ``df1`` as "CSV" and ``df2`` as "Parquet". Results are
    only printed; nothing is returned or raised for a mismatch.

    Args:
        df1: First frame (reported as "CSV").
        df2: Second frame (reported as "Parquet").
    """
    print("\n[Validation Results]")
    if df1.shape == df2.shape:
        print(f"✔ Shapes match: {df1.shape}")
    else:
        print(f"✘ Shape mismatch: CSV {df1.shape}, Parquet {df2.shape}")

    # Report columns present on one side only, rather than letting the
    # dtype lookup below fail with KeyError on them.
    only_in_df1 = [col for col in df1.columns if col not in df2.columns]
    only_in_df2 = [col for col in df2.columns if col not in df1.columns]
    columns_differ = bool(only_in_df1 or only_in_df2)
    if columns_differ:
        print("✘ Column mismatch:")
        if only_in_df1:
            print(f"  - only in CSV: {only_in_df1}")
        if only_in_df2:
            print(f"  - only in Parquet: {only_in_df2}")

    # Dtypes can only be compared for columns both frames have.
    shared = [col for col in df1.columns if col in df2.columns]
    mismatch = {
        col: (df1[col].dtype, df2[col].dtype)
        for col in shared
        if df1[col].dtype != df2[col].dtype
    }

    if not mismatch:
        scope = "all shared columns" if columns_differ else "all columns"
        print(f"✔ Dtypes match for {scope}")
    else:
        print("✘ Dtype mismatches:")
        for col, (d1, d2) in mismatch.items():
            print(f"  - {col}: CSV={d1}, Parquet={d2}")

# Compare the frame reloaded from CSV with the one reloaded from Parquet and print the report.
validate_frames(df_csv, df_parquet)


[OK] DataFrame written to CSV: data/raw\sample_20250826-1004.csv
[OK] DataFrame written to Parquet: data/processed\sample_20250826-1004.parquet

[Validation Results]
✔ Shapes match: (5, 4)
✔ Dtypes match for all columns
