# Homework Starter — Stage 05: Data Storage
Name: Mingjia Jin
Date: 

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [11]:
import os # interacts with your operating system (e.g., get environment variables).
import pathlib # a modern and cleaner way to handle file paths.
from pathlib import Path
import datetime 
import pandas as pd
from dotenv import load_dotenv # loads environment variables from a .env file into Python’s environment.
import numpy as np

In [17]:
# Resolve to an absolute path so every derived path is independent of the
# notebook's current working directory.
PROJECT_ROOT = Path("..").resolve()
# NOTE(review): ENV_PATH is not passed to load_dotenv below, which locates the
# .env file on its own -- confirm that is intended.
ENV_PATH = PROJECT_ROOT / ".env"

# override=True lets values from the .env file replace variables that are
# already set in the process environment (e.g. left over from an earlier run).
load_dotenv(override=True)
RAW = (PROJECT_ROOT / os.getenv("DATA_DIR_RAW", "data/raw")).resolve()
PROC = (PROJECT_ROOT / os.getenv("DATA_DIR_PROCESSED", "data/processed")).resolve()

# Refuse data directories that escape the project tree. The comparison is done
# on path components rather than string prefixes, so a sibling directory such
# as "<root>_backup" is not mistaken for a location inside the project.
for _label, _data_dir in (("RAW", RAW), ("PROC", PROC)):
    if _data_dir != PROJECT_ROOT and PROJECT_ROOT not in _data_dir.parents:
        raise ValueError(f"{_label} path is outside project root! → {_data_dir}")

RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)
# RAW and PROC were resolved above, so they already print as absolute paths.
print('RAW ->', RAW)
print('PROC ->', PROC)

RAW -> /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/raw
PROC -> /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/processed


## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [25]:
# Reuse the AAPL price file that was saved during the previous stage.
# RAW is already a Path, so joining with "/" yields the full file path.
file_path = RAW / "api_yfinance_AAPL_20250824-2226.csv"
df = pd.read_csv(file_path, parse_dates=["Date"])
df.head()

# dates = pd.date_range('2024-01-01', periods=20, freq='D')
# df = pd.DataFrame({'date': dates, 'ticker': ['AAPL']*20, 'price': 150 + np.random.randn(20).cumsum()})
# df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-02,27.8475,27.860001,26.8375,27.3325,24.261044,212818400
1,2015-01-05,27.0725,27.1625,26.352501,26.5625,23.577574,257142000
2,2015-01-06,26.635,26.8575,26.157499,26.565001,23.579796,263188400
3,2015-01-07,26.799999,27.049999,26.674999,26.9375,23.910435,160423600
4,2015-01-08,27.307501,28.0375,27.174999,27.9725,24.829128,237458000


## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [28]:
def ts():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = datetime.datetime.now()
    return now.strftime('%Y%m%d-%H%M%S')

# Use one timestamp so the CSV and Parquet files from this run pair up by name.
timestamp = ts()

# Save CSV
csv_path = RAW / f"sample_{timestamp}.csv"
df.to_csv(csv_path, index=False)
print(f"CSV saved to: {csv_path}")

# Save Parquet. pq_path is reset to None on any failure so that later cells
# can skip the Parquet steps.
pq_path = PROC / f"sample_{timestamp}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
    print("Parquet engine not available. Install pyarrow or fastparquet to complete this step.")
    pq_path = None
except Exception as e:
    # Any other failure is not an engine problem: report the real cause
    # instead of sending the reader off to install a package.
    print(f"Parquet save failed: {e}")
    pq_path = None
else:
    print(f"Parquet saved to: {pq_path}")

CSV saved to: /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/raw/sample_20250824-223858.csv
Parquet saved to: /Users/fd/gitlocal/bootcamp_mingjia_jin/project/data/processed/sample_20250824-223858.parquet


## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [31]:
def validate_loaded(original, reloaded, date_col='Date', price_col='Close'):
    """Compare a reloaded DataFrame against the one that was saved.

    Args:
        original: The DataFrame that was written to disk.
        reloaded: The DataFrame read back from disk.
        date_col: Name of the column expected to hold datetimes.
        price_col: Name of the column expected to hold numeric prices.

    Returns:
        A dict with three boolean entries:
        ``shape_equal`` (both frames have the same shape),
        ``date_is_datetime`` (``date_col`` exists in ``reloaded`` and has a
        datetime64 dtype) and ``price_is_numeric`` (``price_col`` exists in
        ``reloaded`` and has a numeric dtype). A missing column yields False.
    """
    has_date = date_col in reloaded.columns
    has_price = price_col in reloaded.columns
    checks = {
        'shape_equal': original.shape == reloaded.shape,
        # Short-circuit on the presence test so a missing column reports False
        # instead of raising KeyError.
        'date_is_datetime': bool(has_date and pd.api.types.is_datetime64_any_dtype(reloaded[date_col])),
        'price_is_numeric': bool(has_price and pd.api.types.is_numeric_dtype(reloaded[price_col])),
    }
    return checks
    
# CSV files store plain text only, with no dtype information, so it is pandas
# that converts the 'Date' column back to datetimes here via parse_dates.
df_csv = pd.read_csv(csv_path, parse_dates=["Date"])
csv_checks = validate_loaded(df, df_csv)
csv_checks

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}

In [35]:
# Only attempt the reload when the Parquet file was actually written
# (pq_path is set to None when the save step fails).
if pq_path is not None:
    try:
        df_pq = pd.read_parquet(pq_path)
        # pandas.read_parquet() has no parse_dates parameter, so the 'Date'
        # column is coerced to datetime explicitly when it is present.
        date_present = 'Date' in df_pq.columns
        if date_present:
            df_pq['Date'] = pd.to_datetime(df_pq['Date'])
        result = validate_loaded(df, df_pq)
        print("Parquet validation results:", result)
    except Exception as err:
        print('Parquet read failed:', err)

Parquet validation results: {'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}


## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Use suffix to route; create parent dirs if needed; friendly errors for Parquet.

In [41]:
import typing as t # Python standard Module that provides type hinting tools — used to declare what type of values your functions expect and return.

# Classify a file path (string or Path object) as 'csv' or 'parquet' from its ending.
def detect_format(path: t.Union[str, pathlib.Path]) -> str:
    """Return the storage format implied by the ending of ``path``.

    The path is converted to text, stripped of surrounding whitespace and
    lower-cased before matching, so the check is case-insensitive.

    Args:
        path: File location given as either a ``str`` or a ``pathlib.Path``.

    Returns:
        ``'csv'`` for ``.csv``; ``'parquet'`` for ``.parquet``, ``.parq`` or ``.pq``.

    Raises:
        ValueError: If the path ends with none of the supported suffixes.
    """
    normalized = str(path).strip().lower()
    # Each entry pairs a format name with the suffixes that select it;
    # str.endswith accepts a tuple, so one call covers every alias.
    suffix_table = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.parq', '.pq')),
    )
    for fmt_name, suffixes in suffix_table:
        if normalized.endswith(suffixes):
            return fmt_name
    raise ValueError(f"Unsupported file format: {normalized}")

In [42]:
# write_df() is a unified save function: it writes a DataFrame to either a CSV
# or a Parquet file, chosen from the extension of the path you pass in.
def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]) -> pathlib.Path:
    """Write ``df`` to ``path`` as CSV or Parquet, based on the file suffix.

    Missing parent directories are created. CSV files are written without the
    index.

    Args:
        df: The DataFrame to save.
        path: Destination file, as a ``str`` or ``pathlib.Path``.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: If the suffix is not a supported format (raised by
            ``detect_format``).
        RuntimeError: If the Parquet write fails. The message distinguishes a
            missing engine from any other failure; the original exception is
            chained as the cause.
    """
    p = pathlib.Path(path)
    # Validate the format first so an unsupported path leaves no stray
    # directories behind.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)
    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p
    # detect_format only returns 'csv' or 'parquet', so this is the Parquet path.
    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still a RuntimeError so existing callers keep working, but with a
        # message that reflects the real failure.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

In [43]:
# Reads either a CSV or a Parquet file into a pandas DataFrame.
def read_df(path: t.Union[str, pathlib.Path]) -> pd.DataFrame:
    """Load a DataFrame from a CSV or Parquet file, chosen by file suffix.

    When the file has a ``'Date'`` column it is returned as datetimes: via
    ``parse_dates`` for CSV, and via ``pd.to_datetime`` for Parquet.

    Args:
        path: Source file, as a ``str`` or ``pathlib.Path``.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the suffix is not a supported format (raised by
            ``detect_format``).
        RuntimeError: If reading the Parquet file fails. The message
            distinguishes a missing engine from any other failure; the
            original exception is chained as the cause.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)
    if fmt == 'csv':
        # Read only the header row first to learn whether 'Date' exists,
        # because parse_dates fails on a column that is not in the file.
        header = pd.read_csv(p, nrows=0)
        if 'Date' in header.columns:
            return pd.read_csv(p, parse_dates=['Date'])
        return pd.read_csv(p)
    try:
        df_ = pd.read_parquet(p)
        if 'Date' in df_.columns:  # pandas.read_parquet() has no parse_dates parameter
            df_['Date'] = pd.to_datetime(df_['Date'])
    except ImportError as e:
        # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still a RuntimeError so existing callers keep working, but with a
        # message that reflects the real failure (e.g. a missing file).
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e
    return df_

In [44]:
# Demo: round-trip the DataFrame through both formats using the utilities.
# A single timestamp is taken up front so the CSV and Parquet file names
# always match, even if the clock ticks over between the two assignments.
demo_stamp = ts()
p_csv = RAW / f"util_{demo_stamp}.csv"
p_pq = PROC / f"util_{demo_stamp}.parquet"

write_df(df, p_csv)
# Print explicitly: a bare expression in the middle of a cell is not displayed.
print(read_df(p_csv).head())
try:
    write_df(df, p_pq)
    print(read_df(p_pq).head())
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)

## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.

Done — see README.md.