# Homework Starter — Stage 05: Data Storage
Name: 
Date: 

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [1]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
RAW = pathlib.Path(os.getenv('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.getenv('DATA_DIR_PROCESSED', 'data/processed'))
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)
print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

RAW -> /mnt/data/data/raw
PROC -> /mnt/data/data/processed


## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [2]:
import numpy as np
dates = pd.date_range('2024-01-01', periods=20, freq='D')
df = pd.DataFrame({'date': dates, 'ticker': ['AAPL']*20, 'price': 150 + np.random.randn(20).cumsum()})
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,149.496691
1,2024-01-02,AAPL,148.872103
2,2024-01-03,AAPL,147.779536
3,2024-01-04,AAPL,145.958061
4,2024-01-05,AAPL,146.360122


## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [3]:

def ts(): return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

# Save CSV
csv_path = RAW / f"sample_{ts()}.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV ->", csv_path.resolve())

# Save Parquet (robust: engine may be missing)
pq_path = None
try:
    pq_path = PROC / f"sample_{ts()}.parquet"
    df.to_parquet(pq_path)  # requires pyarrow or fastparquet
    print("Saved Parquet ->", pq_path.resolve())
except Exception as e:
    print("Parquet save skipped (missing engine?). Tip: pip install pyarrow or fastparquet")
    print("Reason:", e)

csv_path, pq_path


Saved CSV -> /mnt/data/data/raw/sample_20250822-161136.csv
Parquet save skipped (missing engine?). Tip: pip install pyarrow or fastparquet
Reason: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.


(PosixPath('data/raw/sample_20250822-161136.csv'),
 PosixPath('data/processed/sample_20250822-161136.parquet'))

## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [4]:

def validate_loaded(original, reloaded):
    checks = {
        'shape_equal': original.shape == reloaded.shape,
        'date_is_datetime': pd.api.types.is_datetime64_any_dtype(reloaded['date']) if 'date' in reloaded.columns else False,
        'price_is_numeric': pd.api.types.is_numeric_dtype(reloaded['price']) if 'price' in reloaded.columns else False,
    }
    return checks

# Reload CSV
df_csv = pd.read_csv(csv_path, parse_dates=['date'] if 'date' in df.columns else None)
csv_checks = validate_loaded(df, df_csv)
print("CSV validation:", csv_checks)

# Reload Parquet (if saved)
pq_checks = None
from pathlib import Path
if pq_path and Path(pq_path).exists():
    try:
        df_pq = pd.read_parquet(pq_path)
        pq_checks = validate_loaded(df, df_pq)
        print("Parquet validation:", pq_checks)
    except Exception as e:
        print("Parquet read skipped (missing engine?):", e)

csv_checks, pq_checks


CSV validation: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


({'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True},
 None)

In [5]:
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
        validate_loaded(df, df_pq)
    except Exception as e:
        print('Parquet read failed:', e)

Parquet read failed: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.


## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Use suffix to route; create parent dirs if needed; friendly errors for Parquet.

In [6]:

import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]) -> str:
    s = str(path).lower()
    if s.endswith('.csv'):
        return 'csv'
    if s.endswith('.parquet') or s.endswith('.pq') or s.endswith('.parq'):
        return 'parquet'
    raise ValueError('Unsupported format: ' + s)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]) -> pathlib.Path:
    p = pathlib.Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    kind = detect_format(p)
    if kind == 'csv':
        df.to_csv(p, index=False)
        return p
    else:
        try:
            df.to_parquet(p)
            return p
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Run: pip install pyarrow  (or fastparquet)') from e

def read_df(path: t.Union[str, pathlib.Path]) -> pd.DataFrame:
    p = pathlib.Path(path)
    kind = detect_format(p)
    if kind == 'csv':
        # Try to infer date column if present
        try:
            return pd.read_csv(p, parse_dates=['date'])
        except Exception:
            return pd.read_csv(p)
    else:
        try:
            return pd.read_parquet(p)
        except Exception as e:
            raise RuntimeError('Parquet engine not available. Run: pip install pyarrow  (or fastparquet)') from e

# Demo usage
p_csv = RAW / f"util_{ts()}.csv"
p_pq  = PROC / f"util_{ts()}.parquet"

print("write_df ->", write_df(df, p_csv))
print("read_df head (csv) ->")
display(read_df(p_csv).head())

try:
    print("write_df ->", write_df(df, p_pq))
    print("read_df head (parquet) ->")
    display(read_df(p_pq).head())
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)


write_df -> data/raw/util_20250822-161136.csv
read_df head (csv) ->


Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,149.496691
1,2024-01-02,AAPL,148.872103
2,2024-01-03,AAPL,147.779536
3,2024-01-04,AAPL,145.958061
4,2024-01-05,AAPL,146.360122


Skipping Parquet util demo: Parquet engine not available. Run: pip install pyarrow  (or fastparquet)


## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.