# Stage 05: Data Storage and Management
**Project:** Turtle Trading Strategy Research  
**Author:** Panwei Hu  
**Date:** 2025-08-17

## Objectives
- Implement production-ready data storage for Turtle Trading research
- Support both CSV (raw) and Parquet (processed) formats
- Create robust I/O utilities for the research pipeline
- Establish a data versioning and validation framework
- Optimize storage for time series analysis and backtesting

## Storage Architecture
- **Raw Data**: CSV format in `data/raw/` for auditability and human inspection
- **Processed Data**: Parquet format in `data/processed/` for performance and compression
- **Versioning**: Timestamped filenames with embedded metadata
- **Validation**: Comprehensive data integrity checks at every stage

In [1]:
import os, pathlib, datetime as dt
import pandas as pd
import numpy as np
import typing as t
import warnings
from dotenv import load_dotenv
import sys
sys.path.append('../src')

# Resolve the project layout relative to the notebook's working directory
PROJECT_ROOT = pathlib.Path('..').resolve()
DATA_DIR = PROJECT_ROOT.joinpath('data')
RAW_DIR = DATA_DIR.joinpath('raw')
PROCESSED_DIR = DATA_DIR.joinpath('processed')

# Make sure both storage tiers exist before anything reads or writes them
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Load environment variables from the project-level .env file
load_dotenv(PROJECT_ROOT.joinpath('.env'))

print(
    '🐢 Turtle Trading Data Storage Setup',
    f'Project Root: {PROJECT_ROOT}',
    f'Raw Data: {RAW_DIR}',
    f'Processed Data: {PROCESSED_DIR}',
    sep='\n',
)

# Inventory of what is already on disk (raw CSVs and processed Parquet files)
raw_files = [*RAW_DIR.glob('*.csv')]
processed_files = [*PROCESSED_DIR.glob('*.parquet')]
print(
    '\n📁 Existing files:',
    f'   Raw: {len(raw_files)} CSV files',
    f'   Processed: {len(processed_files)} Parquet files',
    sep='\n',
)


🐢 Turtle Trading Data Storage Setup
Project Root: /Users/panweihu/Desktop/Desktop_m1/NYU_mfe/bootcamp/camp4/bootcamp_bill_panwei_hu/turtle_project
Raw Data: /Users/panweihu/Desktop/Desktop_m1/NYU_mfe/bootcamp/camp4/bootcamp_bill_panwei_hu/turtle_project/data/raw
Processed Data: /Users/panweihu/Desktop/Desktop_m1/NYU_mfe/bootcamp/camp4/bootcamp_bill_panwei_hu/turtle_project/data/processed

📁 Existing files:
   Raw: 18 CSV files
   Processed: 5 Parquet files


In [2]:
class TurtleDataStorage:
    """Data storage utilities for Turtle Trading research.

    All methods are static; the class is used purely as a namespace.
    ``write_data`` resolves its target directory from the module-level
    ``RAW_DIR`` / ``PROCESSED_DIR`` paths, so those must be defined before
    it is called.
    """

    # Parquet engines to try, in order of preference.
    _PARQUET_ENGINES = ('pyarrow', 'fastparquet')

    @staticmethod
    def detect_format(path: t.Union[str, pathlib.Path]) -> str:
        """Return ``'csv'``, ``'parquet'`` or ``'json'`` based on the file extension.

        The comparison is case-insensitive.

        Raises:
            ValueError: if the extension is not one of the supported formats.
        """
        s = str(path).lower()
        if s.endswith('.csv'):
            return 'csv'
        if s.endswith(('.parquet', '.pq', '.parq')):
            return 'parquet'
        if s.endswith('.json'):
            return 'json'
        raise ValueError(f'Unsupported format: {s}. Supported: .csv, .parquet, .json')

    @staticmethod
    def detect_date_columns(df: pd.DataFrame) -> list:
        """Return the labels of columns that look like dates.

        A column qualifies when its label contains ``date``, ``time`` or
        ``timestamp`` (case-insensitive), or when it has ``object`` dtype and
        its first non-null value can be parsed by ``pd.to_datetime``.
        """
        date_cols = []
        for col in df.columns:
            # Column labels are not guaranteed to be strings (e.g. integers).
            label = str(col).lower()
            if any(pattern in label for pattern in ('date', 'time', 'timestamp')):
                date_cols.append(col)
            elif df[col].dtype == 'object':
                sample = df[col].dropna().head()
                if len(sample) > 0:
                    try:
                        pd.to_datetime(sample.iloc[0])
                    except Exception:
                        # Best-effort probe: an unparseable value simply means
                        # "not a date column". Unlike a bare ``except`` this
                        # lets KeyboardInterrupt / SystemExit propagate.
                        continue
                    date_cols.append(col)
        return date_cols

    @staticmethod
    def _with_parquet_engine(operation: t.Callable[[str], t.Any], unavailable_msg: str) -> t.Any:
        """Run ``operation(engine)`` with the first importable Parquet engine.

        Only ``ImportError`` (engine not installed) triggers the fallback to
        the next engine; any other exception is a genuine I/O or data error
        and propagates to the caller unchanged.

        Raises:
            RuntimeError: if none of the engines can be imported.
        """
        last_error = None
        for engine in TurtleDataStorage._PARQUET_ENGINES:
            try:
                return operation(engine)
            except ImportError as e:
                last_error = e
        raise RuntimeError(f'{unavailable_msg} Last error: {last_error}')

    @staticmethod
    def write_data(df: pd.DataFrame, filename: str, data_type: str = 'raw', **kwargs) -> pathlib.Path:
        """Write ``df`` into the raw or processed data directory.

        Args:
            df: frame to persist.
            filename: target file name (not a full path).
            data_type: ``'raw'`` writes to ``RAW_DIR`` and always produces a
                ``.csv`` file; any other value is treated as processed data,
                written to ``PROCESSED_DIR``, defaulting to ``.parquet`` when
                ``filename`` has no ``.parquet`` / ``.csv`` / ``.json`` suffix.
            **kwargs: forwarded to the pandas writer, overriding the defaults
                chosen here.

        Returns:
            The path of the written file.

        Raises:
            ValueError: if the resulting file extension is unsupported.
            RuntimeError: if writing fails for any reason (the original
                exception is chained as ``__cause__``).
        """
        # Determine directory based on data type
        if data_type == 'raw':
            base_dir = RAW_DIR
            # Force CSV for raw data
            if not filename.endswith('.csv'):
                # Strip only *trailing* foreign extensions; str.replace would
                # also mangle names that merely contain ".parquet" / ".json".
                while filename.endswith(('.parquet', '.json')):
                    filename = filename.rsplit('.', 1)[0]
                filename += '.csv'
        else:  # processed
            base_dir = PROCESSED_DIR
            # Prefer Parquet for processed data
            if not filename.endswith(('.parquet', '.csv', '.json')):
                filename += '.parquet'

        path = base_dir / filename
        fmt = TurtleDataStorage.detect_format(path)

        try:
            if fmt == 'csv':
                defaults = {'index': False, 'date_format': '%Y-%m-%d'}
                df.to_csv(path, **{**defaults, **kwargs})

            elif fmt == 'parquet':
                TurtleDataStorage._with_parquet_engine(
                    lambda engine: df.to_parquet(
                        path, **{'engine': engine, 'compression': 'snappy', **kwargs}
                    ),
                    'No Parquet engine available. Install pyarrow or fastparquet.',
                )

            elif fmt == 'json':
                defaults = {'orient': 'records', 'date_format': 'iso'}
                df.to_json(path, **{**defaults, **kwargs})

            print(f"💾 Saved {fmt.upper()}: {path.name} ({path.stat().st_size/1024:.1f} KB)")
            return path

        except Exception as e:
            raise RuntimeError(f'Failed to write {fmt.upper()} file {path}: {e}') from e

    @staticmethod
    def read_data(path: t.Union[str, pathlib.Path], **kwargs) -> pd.DataFrame:
        """Read a CSV, Parquet or JSON file into a DataFrame.

        For CSV files the header is inspected first and columns whose names
        look like dates (see ``detect_date_columns``) are passed to
        ``parse_dates``; warnings emitted while reading are suppressed.

        Args:
            path: file to read.
            **kwargs: forwarded to the pandas reader, overriding the defaults
                chosen here.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
            ValueError: if the file extension is unsupported.
            RuntimeError: if reading fails for any reason (the original
                exception is chained as ``__cause__``).
        """
        p = pathlib.Path(path)

        if not p.exists():
            raise FileNotFoundError(f'File not found: {p}')

        fmt = TurtleDataStorage.detect_format(p)

        try:
            if fmt == 'csv':
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    # Header-only read: cheap way to learn the column names
                    header_df = pd.read_csv(p, nrows=0)
                    date_cols = TurtleDataStorage.detect_date_columns(header_df)

                    defaults = {'parse_dates': date_cols} if date_cols else {}
                    df = pd.read_csv(p, **{**defaults, **kwargs})

            elif fmt == 'parquet':
                df = TurtleDataStorage._with_parquet_engine(
                    lambda engine: pd.read_parquet(p, **{'engine': engine, **kwargs}),
                    'No Parquet engine available.',
                )

            elif fmt == 'json':
                df = pd.read_json(p, **kwargs)

            print(f"📖 Loaded {fmt.upper()}: {p.name} → {df.shape}")
            return df

        except Exception as e:
            raise RuntimeError(f'Failed to read {fmt.upper()} file {p}: {e}') from e

    @staticmethod
    def validate_financial_data(df: pd.DataFrame, required_cols: t.Optional[list] = None) -> dict:
        """Summarize basic quality metrics for a price DataFrame.

        Args:
            df: frame to inspect.
            required_cols: columns that must be present; defaults to
                ``['date', 'symbol', 'adj_close']``.

        Returns:
            A dict with ``shape``, ``missing_cols``, ``total_nulls``,
            ``duplicate_rows``, ``quality_score`` and ``status``, plus, when
            the corresponding column exists:

            * ``date_range`` / ``date_gaps`` (``date``): min/max date and the
              number of calendar days in that span with no row. The span is
              counted at daily frequency, so weekends and holidays count as
              gaps. An empty or all-null column yields ``(None, None)`` / ``0``.
            * ``unique_symbols`` / ``symbols`` (``symbol``).
            * ``price_stats`` (``adj_close``): min, max, mean and
              ``negative_prices``, the count of prices ``<= 0``.

            ``quality_score`` starts at 100 and loses 10 points for each of:
            missing columns, any nulls, any duplicate rows, any non-positive
            prices. ``status`` is ``'GOOD'`` at 80 or above, otherwise
            ``'NEEDS_REVIEW'``.
        """
        if required_cols is None:
            required_cols = ['date', 'symbol', 'adj_close']

        validation = {
            'shape': df.shape,
            'missing_cols': [c for c in required_cols if c not in df.columns],
            # Plain ints keep the report printable/serializable without numpy types
            'total_nulls': int(df.isnull().sum().sum()),
            'duplicate_rows': int(df.duplicated().sum()),
        }

        # Financial data specific checks
        if 'date' in df.columns:
            dates = df['date'].dropna()
            if dates.empty:
                # pd.date_range cannot be built from NaT/NaN endpoints
                validation['date_range'] = (None, None)
                validation['date_gaps'] = 0
            else:
                first, last = dates.min(), dates.max()
                validation['date_range'] = (first, last)
                validation['date_gaps'] = len(pd.date_range(first, last, freq='D')) - dates.nunique()

        if 'symbol' in df.columns:
            validation['unique_symbols'] = df['symbol'].nunique()
            validation['symbols'] = sorted(df['symbol'].unique().tolist())

        if 'adj_close' in df.columns:
            validation['price_stats'] = {
                'min': df['adj_close'].min(),
                'max': df['adj_close'].max(),
                'mean': df['adj_close'].mean(),
                'negative_prices': int((df['adj_close'] <= 0).sum())
            }

        # Overall data quality score: one issue per non-empty / non-zero check
        checks = [
            validation['missing_cols'],
            validation['total_nulls'],
            validation['duplicate_rows'],
            validation.get('price_stats', {}).get('negative_prices', 0),
        ]
        issues = sum(1 for check in checks if check)

        validation['quality_score'] = max(0, 100 - issues * 10)
        validation['status'] = 'GOOD' if validation['quality_score'] >= 80 else 'NEEDS_REVIEW'

        return validation

print("✅ TurtleDataStorage class loaded")


✅ TurtleDataStorage class loaded


In [3]:
# Promote the newest raw turtle CSV to Parquet; if none exists, smoke-test
# the storage layer with a small synthetic dataset instead.
print("🔍 Looking for existing turtle data...")

turtle_files = [candidate for candidate in raw_files if 'turtle' in candidate.name.lower()]

if not turtle_files:
    print("📝 No turtle data found. Run 04_data_acquisition.ipynb first to collect data.")

    # Synthetic universe: three symbols, 30 daily rows each, prices near 100
    print("\n🧪 Creating sample data for testing...")

    symbols = ['SPY', 'QQQ', 'GLD']
    dates = pd.date_range('2024-01-01', periods=30, freq='D')
    sample_data = []

    for symbol in symbols:
        for date in dates:
            sample_data.append(dict(
                date=date,
                symbol=symbol,
                adj_close=100 + np.random.randn() * 5,
                asset_category='commodities' if symbol not in ['SPY', 'QQQ'] else 'equity_us',
            ))

    df_sample = pd.DataFrame(sample_data)

    # Write -> read -> validate exercises the whole storage path
    sample_path = TurtleDataStorage.write_data(df_sample, 'sample_turtle_data.csv', data_type='raw')
    df_loaded = TurtleDataStorage.read_data(sample_path)

    validation_sample = TurtleDataStorage.validate_financial_data(df_loaded)
    print(f"Sample data validation: {validation_sample['status']} ({validation_sample['quality_score']}/100)")

else:
    # Newest file by modification time wins
    latest_file = max(turtle_files, key=lambda candidate: candidate.stat().st_mtime)
    print(f"📊 Loading: {latest_file.name}")

    try:
        df_turtle_raw = TurtleDataStorage.read_data(latest_file)

        validation = TurtleDataStorage.validate_financial_data(df_turtle_raw)
        print(
            "\n📋 Data Validation Results:",
            f"   Status: {validation['status']}",
            f"   Quality Score: {validation['quality_score']}/100",
            f"   Shape: {validation['shape']}",
            f"   Date Range: {validation.get('date_range', 'N/A')}",
            f"   Unique Symbols: {validation.get('unique_symbols', 'N/A')}",
            sep='\n',
        )

        if validation['status'] != 'GOOD':
            # Do not promote questionable data; report what was found instead
            print("\n⚠️  Data quality issues detected. Review required.")
            print(f"   Missing columns: {validation['missing_cols']}")
            print(f"   Total nulls: {validation['total_nulls']}")
            print(f"   Duplicate rows: {validation['duplicate_rows']}")

        else:
            # Timestamped name so every promotion is kept as its own version
            timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
            processed_filename = f"turtle_universe_processed_{timestamp}.parquet"

            processed_path = TurtleDataStorage.write_data(
                df_turtle_raw,
                processed_filename,
                data_type='processed',
            )

            # Read the file back and compare shapes as a round-trip check
            df_processed = TurtleDataStorage.read_data(processed_path)
            shapes_match = df_processed.shape == df_turtle_raw.shape
            print(f"\n🔄 Round-trip Test: {'✅ PASSED' if shapes_match else '❌ FAILED'}")

            # On-disk size of the raw CSV versus the Parquet copy, in KB
            raw_size = latest_file.stat().st_size / 1024
            processed_size = processed_path.stat().st_size / 1024
            if processed_size > 0:
                compression_ratio = raw_size / processed_size
            else:
                compression_ratio = 0

            print(
                "\n📊 Storage Efficiency:",
                f"   Raw CSV: {raw_size:.1f} KB",
                f"   Processed Parquet: {processed_size:.1f} KB",
                f"   Compression Ratio: {compression_ratio:.1f}x",
                sep='\n',
            )

    except Exception as e:
        print(f"❌ Error processing turtle data: {e}")

print("\n✅ Data storage system ready for Turtle Trading research!")


🔍 Looking for existing turtle data...
📊 Loading: turtle_universe_source-yfinance_assets-multi_count-18_20250820-102058.csv
📖 Loaded CSV: turtle_universe_source-yfinance_assets-multi_count-18_20250820-102058.csv → (9036, 4)

📋 Data Validation Results:
   Status: GOOD
   Quality Score: 100/100
   Shape: (9036, 4)
   Date Range: (Timestamp('2023-08-21 00:00:00'), Timestamp('2025-08-20 00:00:00'))
   Unique Symbols: 18
💾 Saved PARQUET: turtle_universe_processed_20250820_102603.parquet (70.0 KB)
📖 Loaded PARQUET: turtle_universe_processed_20250820_102603.parquet → (9036, 4)

🔄 Round-trip Test: ✅ PASSED

📊 Storage Efficiency:
   Raw CSV: 394.5 KB
   Processed Parquet: 70.0 KB
   Compression Ratio: 5.6x

✅ Data storage system ready for Turtle Trading research!
