# Environment Setup

In [2]:
# Environment Setup
import os
import pathlib
import datetime as dt
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading any settings.
load_dotenv()

# Storage locations: taken from DATA_DIR_RAW / DATA_DIR_PROCESSED when set,
# otherwise the relative defaults 'data/raw' and 'data/processed'.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Make sure both directories (and any missing parents) exist; no error if they already do.
RAW.mkdir(exist_ok=True, parents=True)
PROC.mkdir(exist_ok=True, parents=True)

# Show the absolute locations that will be used for the rest of the notebook.
print(f"RAW -> {RAW.resolve()}")
print(f"PROC -> {PROC.resolve()}")

RAW -> C:\Users\Aislay\bootcamp_Ziyi_Yang\homework\stage05_data-storage\data\raw
PROC -> C:\Users\Aislay\bootcamp_Ziyi_Yang\homework\stage05_data-storage\data\processed


# Create Sample Data

In [3]:
# Create sample stock data
# The price series is a random walk; seed the generator so that
# Restart Kernel -> Run All reproduces exactly the same synthetic prices.
SEED = 42
rng = np.random.default_rng(SEED)

dates = pd.date_range('2025-01-01', periods=20, freq='D')
df = pd.DataFrame({
    'date': dates,
    # One ticker label per generated date.
    'ticker': ['AAPL'] * len(dates),
    # Start at 150 and accumulate standard-normal daily moves.
    'price': 150 + rng.standard_normal(len(dates)).cumsum()
})

df.head()

Unnamed: 0,date,ticker,price
0,2025-01-01,AAPL,150.295484
1,2025-01-02,AAPL,149.596111
2,2025-01-03,AAPL,148.875938
3,2025-01-04,AAPL,150.402858
4,2025-01-05,AAPL,152.245824


# Save in Both Formats

In [4]:
def get_timestamp():
    """Return the current local time formatted as 'YYYYMMDD-HHMMSS' for use in filenames."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

# Save CSV to raw
csv_path = RAW / f"stock_{get_timestamp()}.csv"
df.to_csv(csv_path, index=False)
print(f"Saved CSV to: {csv_path}")

# Save Parquet to processed with engine fallback.
# parquet_path is reset to None when neither engine can write the file, so
# later cells can test it to decide whether there is anything to read back.
parquet_path = PROC / f"stock_{get_timestamp()}.parquet"
try:
    # Try pyarrow first
    df.to_parquet(parquet_path, engine='pyarrow')
    print(f"Saved Parquet to: {parquet_path}")
except Exception as pyarrow_err:
    try:
        # Fallback to fastparquet
        df.to_parquet(parquet_path, engine='fastparquet')
        print(f"Saved Parquet (fastparquet) to: {parquet_path}")
    except Exception as fastparquet_err:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate, and report both underlying errors instead of hiding them.
        print("Parquet save failed. Install pyarrow or fastparquet with:")
        print("pip install pyarrow")
        print(f"pyarrow error: {pyarrow_err}")
        print(f"fastparquet error: {fastparquet_err}")
        parquet_path = None

Saved CSV to: data\raw\stock_20250822-210448.csv
Saved Parquet to: data\processed\stock_20250822-210448.parquet


# Reload and Validate

In [5]:
def validate_loaded(original_df, loaded_df, source_name):
    """Check that a reloaded DataFrame matches the original and print a report.

    Args:
        original_df: The DataFrame that was written to disk.
        loaded_df: The DataFrame read back from disk.
        source_name: Label used in the printed report header (e.g. "CSV").

    Returns:
        dict mapping check name to bool. 'Shape matches' and
        'All columns present' are always included. 'Date is datetime' and
        'Price is numeric' are included only when the column sets match and
        the corresponding column ('date' / 'price') exists in loaded_df.
    """
    results = {
        'Shape matches': original_df.shape == loaded_df.shape,
        'All columns present': set(original_df.columns) == set(loaded_df.columns)
    }

    # Only check dtypes if columns match
    if results['All columns present']:
        # Guard each dtype check on the column actually existing, so frames
        # without a 'date' or 'price' column do not raise KeyError.
        if 'date' in loaded_df.columns:
            results['Date is datetime'] = pd.api.types.is_datetime64_any_dtype(loaded_df['date'])
        if 'price' in loaded_df.columns:
            results['Price is numeric'] = pd.api.types.is_numeric_dtype(loaded_df['price'])

    print(f"\n{source_name} Validation:")
    for check, result in results.items():
        print(f"- {check}: {result}")

    return results

# Round-trip check for the CSV copy: reload it with 'date' parsed as a
# datetime column and compare it against the in-memory frame.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
csv_validation = validate_loaded(original_df=df, loaded_df=df_csv, source_name="CSV")

# Round-trip check for the Parquet copy. parquet_path is None when the save
# step failed, in which case there is nothing to reload.
if parquet_path:
    try:
        df_parquet = pd.read_parquet(parquet_path)
        parquet_validation = validate_loaded(
            original_df=df,
            loaded_df=df_parquet,
            source_name="Parquet",
        )
    except Exception as read_err:
        print(f"Parquet read failed: {read_err}")


CSV Validation:
- Shape matches: True
- All columns present: True
- Date is datetime: True
- Price is numeric: True

Parquet Validation:
- Shape matches: True
- All columns present: True
- Date is datetime: True
- Price is numeric: True


# Storage Utilities

In [6]:
def detect_file_format(filepath):
    """Map a path's extension to a storage format name.

    The comparison is case-insensitive. Returns 'csv' for '.csv' and
    'parquet' for '.parquet' or '.pq'.

    Raises:
        ValueError: If the path ends with none of the supported extensions.
    """
    lowered = str(filepath).lower()
    suffixes_by_format = {
        'csv': ('.csv',),
        'parquet': ('.parquet', '.pq'),
    }
    for fmt, suffixes in suffixes_by_format.items():
        # str.endswith accepts a tuple and matches any of its entries.
        if lowered.endswith(suffixes):
            return fmt
    raise ValueError(f"Unsupported file format: {lowered}")

def write_df(df, filepath):
    """Save a DataFrame as CSV or Parquet, chosen by the file extension.

    The parent directory is created if it does not exist. CSV files are
    written without the index. Parquet files are written with pyarrow, and
    with fastparquet if the pyarrow attempt fails.

    Args:
        df: DataFrame to write.
        filepath: Destination path (str or path-like); the extension selects
            the format via detect_file_format.

    Returns:
        pathlib.Path of the written file.

    Raises:
        RuntimeError: If the format is unsupported or the write fails; the
            original exception is attached as __cause__.
    """
    filepath = pathlib.Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)

    try:
        fmt = detect_file_format(filepath)
        if fmt == 'csv':
            df.to_csv(filepath, index=False)
        else:
            # Try pyarrow first, then fastparquet
            try:
                df.to_parquet(filepath, engine='pyarrow')
            except Exception:
                # Not a bare except: KeyboardInterrupt / SystemExit must not
                # trigger a second write attempt.
                df.to_parquet(filepath, engine='fastparquet')
        print(f"Saved via utility: {filepath}")
        return filepath
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure.
        raise RuntimeError(f"Failed to save {filepath}: {e}") from e

def read_df(filepath):
    """Load a DataFrame from CSV or Parquet, chosen by the file extension.

    For CSV, every column whose name contains 'date' (case-insensitive) is
    parsed as a date column. Parquet files are read with pyarrow, and with
    fastparquet if the pyarrow attempt fails.

    Args:
        filepath: Source path (str or path-like); the extension selects the
            format via detect_file_format.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        RuntimeError: If the format is unsupported or the read fails; the
            original exception is attached as __cause__.
    """
    filepath = pathlib.Path(filepath)

    try:
        fmt = detect_file_format(filepath)
        if fmt == 'csv':
            # Auto-detect date columns. nrows=0 reads only the header, which
            # is all that is needed to get the column names.
            header = pd.read_csv(filepath, nrows=0)
            date_cols = [col for col in header.columns if 'date' in col.lower()]
            if date_cols:
                return pd.read_csv(filepath, parse_dates=date_cols)
            return pd.read_csv(filepath)
        else:
            # Try pyarrow first, then fastparquet
            try:
                return pd.read_parquet(filepath, engine='pyarrow')
            except Exception:
                # Not a bare except: KeyboardInterrupt / SystemExit must not
                # trigger a second read attempt.
                return pd.read_parquet(filepath, engine='fastparquet')
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure.
        raise RuntimeError(f"Failed to read {filepath}: {e}") from e

# Exercise the storage helpers: write one CSV under RAW and one Parquet
# file under PROC, each with its own timestamped name.
util_csv = write_df(df, RAW.joinpath(f"stock_util_{get_timestamp()}.csv"))
util_parquet = write_df(df, PROC.joinpath(f"stock_util_{get_timestamp()}.parquet"))

# Read the CSV back through read_df and compare it with the in-memory frame.
# The validation dict is the cell's last expression, so it is also displayed.
df_util = read_df(filepath=util_csv)
validate_loaded(original_df=df, loaded_df=df_util, source_name="Utility")

Saved via utility: data\raw\stock_util_20250822-210450.csv
Saved via utility: data\processed\stock_util_20250822-210450.parquet

Utility Validation:
- Shape matches: True
- All columns present: True
- Date is datetime: True
- Price is numeric: True


{'Shape matches': True,
 'All columns present': True,
 'Date is datetime': True,
 'Price is numeric': True}