In [1]:
# Add parent directory to Python path
import sys
from pathlib import Path
sys.path.append(str(Path('../').resolve()))

# Core libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import time
import warnings
warnings.filterwarnings('ignore')

# Progress monitoring
from tqdm.auto import tqdm
import logging

# Financial APIs
import yfinance as yf

# Database and utilities
import sqlalchemy
from dotenv import load_dotenv
from src.database import get_database_connection, get_api_key

# Load environment variables via python-dotenv before anything reads them
load_dotenv()

# Configure root logging once for the whole notebook: INFO level, timestamped lines.
# `logger` is the module-level logger used by every function defined in later cells.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ Environment loaded and libraries imported successfully!")
print("🔄 Phase 1C: Historical Data Backfill Ready!")

# Configuration shared by the cells below
# TARGET_SYMBOLS: default symbol list for run_historical_backfill()
TARGET_SYMBOLS = ['INTC', 'AMD', 'NVDA']
# BATCH_SIZE: default rows per database transaction in store_historical_data_batch()
BATCH_SIZE = 50  # Process in batches to manage memory
# RATE_LIMIT_DELAY: slept before every yfinance request in fetch_historical_data()
RATE_LIMIT_DELAY = 0.1  # Delay between API calls (seconds)

# Echo the effective configuration so the run is self-describing
print(f"📊 Target symbols: {TARGET_SYMBOLS}")
print(f"⚙️  Batch size: {BATCH_SIZE}")
print(f"⏱️  Rate limit delay: {RATE_LIMIT_DELAY}s")


✅ Environment loaded and libraries imported successfully!
🔄 Phase 1C: Historical Data Backfill Ready!
📊 Target symbols: ['INTC', 'AMD', 'NVDA']
⚙️  Batch size: 50
⏱️  Rate limit delay: 0.1s


In [2]:
def fetch_historical_data(symbol, start_date, end_date, retries=3):
    """
    Download price history for one ticker from yfinance, retrying on errors.

    Every attempt first sleeps RATE_LIMIT_DELAY seconds and then requests the
    history between the two dates. An empty response is final: it is logged
    and None is returned without retrying. An exception is logged and retried
    after a 2 ** attempt second back-off (1s, 2s, ...) until the attempts are
    exhausted.

    Args:
        symbol (str): Stock symbol (e.g., 'INTC')
        start_date (str): Start date in 'YYYY-MM-DD' format
        end_date (str): End date in 'YYYY-MM-DD' format
        retries (int): Maximum number of attempts

    Returns:
        pandas.DataFrame: The yfinance history without the 'Dividends' and
        'Stock Splits' columns, indexed by plain date objects; None when no
        data came back or every attempt raised.
    """
    for attempt_number in range(1, retries + 1):
        try:
            # Throttle before touching the API
            time.sleep(RATE_LIMIT_DELAY)

            history = yf.Ticker(symbol).history(start=start_date, end=end_date)

            if history.empty:
                logger.warning(f"No data retrieved for {symbol} ({start_date} to {end_date})")
                return None

            # Keep only price/volume columns and strip the time part of the index
            cleaned = history.drop(columns=['Dividends', 'Stock Splits'], errors='ignore')
            cleaned.index = cleaned.index.date

            logger.info(f"✅ Retrieved {len(cleaned)} days of data for {symbol}")
            return cleaned

        except Exception as e:
            logger.warning(f"Attempt {attempt_number} failed for {symbol}: {e}")
            if attempt_number >= retries:
                # Out of attempts: give up
                logger.error(f"❌ Failed to fetch data for {symbol} after {retries} attempts")
                return None
            # Exponential backoff: 1s after the first failure, 2s after the second, ...
            time.sleep(2 ** (attempt_number - 1))

def calculate_technical_indicators_batch(data):
    """
    Add simple-moving-average and rate-of-change columns derived from 'Close'.

    For every period from 1 to 8 the result gains:
      * sma_<n>: rolling mean of 'Close' over n rows (min_periods=1, so the
        first rows are averaged over however many rows exist so far)
      * roc_<n>: percentage change of 'Close' versus n rows earlier, times 100
        (NaN for the first n rows)

    The input frame is never modified; the indicators are added to a copy.

    Args:
        data (pandas.DataFrame): Price data containing a 'Close' column

    Returns:
        pandas.DataFrame: Copy of `data` with the 16 indicator columns
        appended, or None when `data` is None/empty or the calculation fails.
        Failure returns None (not the raw input) so callers that test
        `is None` never pass a frame without indicator columns downstream.
    """
    if data is None or data.empty:
        return None

    try:
        result = data.copy()
        close_prices = result['Close']

        # Simple Moving Averages (1-8 rows)
        for period in range(1, 9):
            result[f'sma_{period}'] = close_prices.rolling(window=period, min_periods=1).mean()

        # Rate of Change in percent (1-8 rows)
        for period in range(1, 9):
            result[f'roc_{period}'] = close_prices.pct_change(periods=period) * 100

        return result

    except Exception as e:
        logger.error(f"❌ Error calculating technical indicators: {e}")
        return None

def get_optimal_date_range(lookback_days=400, end_date=None):
    """
    Build the (start, end) date window used for historical data collection.

    Args:
        lookback_days (int): Calendar days to go back from `end_date`. The
            default of 400 (~1.1 years) leaves a buffer for weekends and
            holidays so that a full year of trading days is covered.
        end_date (datetime.date | None): Last day of the window; defaults to
            today's date when omitted.

    Returns:
        tuple: (start_date, end_date) as 'YYYY-MM-DD' strings

    Raises:
        ValueError: If `lookback_days` is negative.
    """
    if lookback_days < 0:
        raise ValueError(f"lookback_days must be non-negative, got {lookback_days}")

    if end_date is None:
        end_date = date.today()

    start_date = end_date - timedelta(days=lookback_days)

    return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

# Smoke test: fetch one symbol, compute indicators, preview the result
print("🔍 Testing historical data collection functions...")

# Window shared by this test (and reported to the reader)
start_date, end_date = get_optimal_date_range()
print(f"📅 Date range: {start_date} to {end_date}")

# A single symbol is enough to exercise the fetch + indicator path
print("\n🧪 Testing with INTC sample...")
test_sample = fetch_historical_data("INTC", start_date, end_date)

if test_sample is None:
    print("⚠️  Sample test failed - will continue with error handling")
else:
    print(f"✅ Sample data collected: {len(test_sample)} trading days")
    print(f"📅 Date range: {test_sample.index[0]} to {test_sample.index[-1]}")

    # Indicator calculation on the fetched sample
    test_with_indicators = calculate_technical_indicators_batch(test_sample)
    if test_with_indicators is not None:
        print("📊 Technical indicators calculated successfully")
        print(f"🔢 Columns: {len(test_with_indicators.columns)} (OHLCV + 16 indicators)")

        # Preview the most recent rows for a few representative columns
        print("\n📈 Sample data (latest 3 days):")
        sample_cols = ['Close', 'sma_1', 'sma_5', 'roc_1', 'roc_5']
        latest_rows = test_with_indicators[sample_cols].tail(3)
        display(latest_rows.round(4))


🔍 Testing historical data collection functions...
📅 Date range: 2024-05-24 to 2025-06-28

🧪 Testing with INTC sample...


2025-06-28 19:35:38,406 - INFO - ✅ Retrieved 273 days of data for INTC


✅ Sample data collected: 273 trading days
📅 Date range: 2024-05-24 to 2025-06-27
📊 Technical indicators calculated successfully
🔢 Columns: 21 (OHLCV + 16 indicators)

📈 Sample data (latest 3 days):


Unnamed: 0,Close,sma_1,sma_5,roc_1,roc_5
2025-06-25,22.2,22.2,21.702,-1.5521,6.7308
2025-06-26,22.5,22.5,21.904,1.3513,4.6999
2025-06-27,22.69,22.69,22.226,0.8444,7.6376


In [3]:
def store_historical_data_batch(symbol, data_with_indicators, batch_size=BATCH_SIZE):
    """
    Store historical data and technical indicators in batches.

    The symbol's database id is looked up once. The frame is then written in
    slices of `batch_size` rows, each slice inside its own transaction
    (`engine.begin()`), so a failing slice is rolled back and reported without
    aborting the remaining slices.

    Args:
        symbol (str): Stock symbol
        data_with_indicators (pandas.DataFrame): Historical data with technical indicators
        batch_size (int): Number of records to process per batch

    Returns:
        dict: Always contains
            'success' (bool): True when at least one record was committed
            'records_processed' (int): rows in committed batches
            'market_data_inserted' (int): market rows in committed batches
            'indicators_inserted' (int): indicator rows in committed batches
            'errors' (list[str]): one message per failed batch / fatal error
        A partially failed run therefore reports success=True together with
        a non-empty 'errors' list.
    """
    results = {
        'success': False,
        'records_processed': 0,
        'market_data_inserted': 0,
        'indicators_inserted': 0,
        'errors': []
    }

    if data_with_indicators is None or data_with_indicators.empty:
        logger.warning("No data to store")
        results['errors'].append('No data')
        return results

    try:
        engine = get_database_connection()

        # Get symbol_id once
        with engine.connect() as conn:
            symbol_query = "SELECT id FROM symbols WHERE symbol = :symbol"
            symbol_row = conn.execute(sqlalchemy.text(symbol_query), {'symbol': symbol}).fetchone()

        if not symbol_row:
            error_msg = f"Symbol {symbol} not found in database"
            logger.error(error_msg)
            results['errors'].append(error_msg)
            return results

        symbol_id = symbol_row[0]
        total_records = len(data_with_indicators)
        logger.info(f"Processing {total_records} records for {symbol} (ID: {symbol_id})")

        # Process data in batches, one transaction per batch
        with tqdm(total=total_records, desc=f"Storing {symbol} data") as pbar:
            for start_idx in range(0, total_records, batch_size):
                end_idx = min(start_idx + batch_size, total_records)
                batch_data = data_with_indicators.iloc[start_idx:end_idx]

                try:
                    with engine.begin() as trans:
                        batch_market, batch_indicators = process_data_batch(
                            trans, symbol_id, batch_data
                        )
                except Exception as batch_error:
                    error_msg = f"Batch error ({start_idx}-{end_idx}): {batch_error}"
                    logger.error(error_msg)
                    results['errors'].append(error_msg)
                else:
                    # Only count rows once the transaction block has exited
                    # cleanly, i.e. after the commit; a failed commit must not
                    # be reported as inserted rows.
                    results['market_data_inserted'] += batch_market
                    results['indicators_inserted'] += batch_indicators
                    results['records_processed'] += len(batch_data)

                    # Brief pause between batches
                    time.sleep(0.01)
                finally:
                    pbar.update(len(batch_data))

        # The input was non-empty, so zero committed rows means every batch failed.
        results['success'] = results['records_processed'] > 0

        logger.info(f"✅ Completed {symbol}: {results['records_processed']} records processed")
        return results

    except Exception as e:
        error_msg = f"Critical error storing data for {symbol}: {e}"
        logger.error(error_msg)
        results['success'] = False
        results['errors'].append(error_msg)
        return results

def process_data_batch(transaction, symbol_id, batch_data):
    """
    Upsert one batch of rows into market_data and technical_indicators.

    For every row of `batch_data` (index value = trade date) two statements
    are executed on `transaction`: an upsert of the Open/High/Low/Close/Volume
    values and an upsert of the sma_1..sma_8 / roc_1..roc_8 indicators. Both
    use ON CONFLICT (symbol_id, trade_date) DO UPDATE, so re-running a batch
    overwrites existing rows instead of failing. Missing (NaN) indicator
    values are sent as NULL.

    Committing or rolling back is left to the caller.

    Args:
        transaction: Object exposing execute(statement, params); callers in
            this notebook pass the connection yielded by engine.begin()
        symbol_id (int): Symbol ID from database
        batch_data (pandas.DataFrame): Batch of data to process; must contain
            the price/volume columns and all 16 indicator columns

    Returns:
        tuple: (market_records_inserted, indicator_records_inserted)
    """
    # Statements are built once per batch rather than once per row.
    market_stmt = sqlalchemy.text(
        "INSERT INTO market_data (symbol_id, trade_date, open_price, high_price, low_price, close_price, volume) "
        "VALUES (:symbol_id, :trade_date, :open_price, :high_price, :low_price, :close_price, :volume) "
        "ON CONFLICT (symbol_id, trade_date) DO UPDATE SET "
        "open_price = EXCLUDED.open_price, high_price = EXCLUDED.high_price, "
        "low_price = EXCLUDED.low_price, close_price = EXCLUDED.close_price, "
        "volume = EXCLUDED.volume, updated_at = CURRENT_TIMESTAMP"
    )

    indicators_stmt = sqlalchemy.text(
        "INSERT INTO technical_indicators "
        "(symbol_id, trade_date, sma_1, sma_2, sma_3, sma_4, sma_5, sma_6, sma_7, sma_8, "
        "roc_1, roc_2, roc_3, roc_4, roc_5, roc_6, roc_7, roc_8) "
        "VALUES (:symbol_id, :trade_date, :sma_1, :sma_2, :sma_3, :sma_4, :sma_5, :sma_6, :sma_7, :sma_8, "
        ":roc_1, :roc_2, :roc_3, :roc_4, :roc_5, :roc_6, :roc_7, :roc_8) "
        "ON CONFLICT (symbol_id, trade_date) DO UPDATE SET "
        "sma_1 = EXCLUDED.sma_1, sma_2 = EXCLUDED.sma_2, sma_3 = EXCLUDED.sma_3, sma_4 = EXCLUDED.sma_4, "
        "sma_5 = EXCLUDED.sma_5, sma_6 = EXCLUDED.sma_6, sma_7 = EXCLUDED.sma_7, sma_8 = EXCLUDED.sma_8, "
        "roc_1 = EXCLUDED.roc_1, roc_2 = EXCLUDED.roc_2, roc_3 = EXCLUDED.roc_3, roc_4 = EXCLUDED.roc_4, "
        "roc_5 = EXCLUDED.roc_5, roc_6 = EXCLUDED.roc_6, roc_7 = EXCLUDED.roc_7, roc_8 = EXCLUDED.roc_8, "
        "updated_at = CURRENT_TIMESTAMP"
    )

    # Column names double as bind-parameter names in indicators_stmt
    indicator_columns = [f'{kind}_{i}' for i in range(1, 9) for kind in ('sma', 'roc')]

    market_records = 0
    indicator_records = 0

    for trade_date, row in batch_data.iterrows():
        # Upsert market data; values are cast to plain Python numbers
        transaction.execute(market_stmt, {
            'symbol_id': symbol_id,
            'trade_date': trade_date,
            'open_price': float(row['Open']),
            'high_price': float(row['High']),
            'low_price': float(row['Low']),
            'close_price': float(row['Close']),
            'volume': int(row['Volume'])
        })
        market_records += 1

        # Upsert technical indicators; NaN (e.g. early roc_<n> rows) becomes NULL
        indicators_data = {'symbol_id': symbol_id, 'trade_date': trade_date}
        for column in indicator_columns:
            value = row[column]
            indicators_data[column] = None if pd.isna(value) else float(value)

        transaction.execute(indicators_stmt, indicators_data)
        indicator_records += 1

    return market_records, indicator_records

# Smoke test for the storage path, using the frame produced by the earlier test cell
if globals().get('test_with_indicators') is None:
    # Earlier cell did not run or its fetch/indicator step failed
    print("⚠️  No test data available for batch storage test")
else:
    print("\n🔍 Testing batch storage with sample data...")

    # Only the five most recent rows are written
    test_batch = test_with_indicators.tail(5)
    storage_results = store_historical_data_batch("INTC", test_batch)

    if not storage_results['success']:
        print(f"❌ Batch storage test failed: {storage_results['errors']}")
    else:
        print("✅ Batch storage test successful!")
        for label, key in (
            ("📊 Records processed", 'records_processed'),
            ("💾 Market data", 'market_data_inserted'),
            ("📈 Indicators", 'indicators_inserted'),
        ):
            print(f"{label}: {storage_results[key]}")


2025-06-28 19:35:51,300 - INFO - Processing 5 records for INTC (ID: 1)



🔍 Testing batch storage with sample data...


Storing INTC data:   0%|          | 0/5 [00:00<?, ?it/s]

2025-06-28 19:35:51,544 - INFO - ✅ Completed INTC: 5 records processed


✅ Batch storage test successful!
📊 Records processed: 5
💾 Market data: 5
📈 Indicators: 5


In [4]:
def run_historical_backfill(symbols=TARGET_SYMBOLS):
    """
    Master orchestrator for historical data backfill across multiple symbols.

    For each symbol the pipeline is: fetch history -> calculate indicators ->
    store in the database. A failure at any step is recorded in the result's
    'errors' list and the loop moves on to the next symbol. A symbol counts
    as processed only when storage reports success and at least one stored
    record; batch errors from a partially successful store are still added
    to 'errors'.

    Args:
        symbols (list): List of stock symbols to process

    Returns:
        dict: Processing results with keys 'start_time', 'end_time',
        'symbols_processed', 'total_trading_days', 'total_market_records',
        'total_indicator_records', 'processing_times' (seconds per processed
        symbol), 'total_processing_time', 'errors' and 'success_rate'
        (percentage of symbols processed; 0.0 when `symbols` is empty).
    """
    # Materialise once so any iterable works and len() is reliable below
    symbols = list(symbols)

    logger.info("🚀 Starting Historical Data Backfill - Phase 1C")
    logger.info(f"📊 Processing symbols: {symbols}")

    start_time = time.time()
    start_date, end_date = get_optimal_date_range()

    # Initialize results tracking
    master_results = {
        'start_time': datetime.now(),
        'symbols_processed': [],
        'total_trading_days': 0,
        'total_market_records': 0,
        'total_indicator_records': 0,
        'processing_times': {},
        'errors': [],
        'success_rate': 0.0
    }

    logger.info(f"📅 Date range: {start_date} to {end_date}")

    # Process each symbol
    with tqdm(total=len(symbols), desc="Processing symbols", position=0) as symbol_pbar:
        for symbol in symbols:
            symbol_start_time = time.time()
            logger.info(f"\n📈 Processing {symbol}...")

            try:
                # Step 1: Fetch historical data
                logger.info(f"📥 Fetching historical data for {symbol}...")
                raw_data = fetch_historical_data(symbol, start_date, end_date)

                if raw_data is None:
                    error_msg = f"Failed to fetch data for {symbol}"
                    logger.error(error_msg)
                    master_results['errors'].append(error_msg)
                    continue

                # Step 2: Calculate technical indicators
                logger.info(f"📊 Calculating technical indicators for {symbol}...")
                data_with_indicators = calculate_technical_indicators_batch(raw_data)

                if data_with_indicators is None:
                    error_msg = f"Failed to calculate indicators for {symbol}"
                    logger.error(error_msg)
                    master_results['errors'].append(error_msg)
                    continue

                # Step 3: Store in database
                logger.info(f"💾 Storing {len(data_with_indicators)} records for {symbol}...")
                storage_results = store_historical_data_batch(symbol, data_with_indicators)

                stored_records = storage_results.get('records_processed', 0)
                storage_errors = list(storage_results.get('errors', []))

                if storage_results['success'] and stored_records > 0:
                    # Update master results
                    symbol_processing_time = time.time() - symbol_start_time
                    master_results['symbols_processed'].append(symbol)
                    master_results['total_trading_days'] += stored_records
                    master_results['total_market_records'] += storage_results.get('market_data_inserted', 0)
                    master_results['total_indicator_records'] += storage_results.get('indicators_inserted', 0)
                    master_results['processing_times'][symbol] = symbol_processing_time

                    logger.info(f"✅ {symbol} completed successfully!")
                    logger.info(f"📊 Records: {stored_records}")
                    logger.info(f"⏱️  Processing time: {symbol_processing_time:.2f}s")
                    if storage_errors:
                        logger.warning(f"⚠️  {symbol}: {len(storage_errors)} batch error(s) during storage")
                else:
                    if not storage_errors:
                        storage_errors = [f"No records stored for {symbol}"]
                    error_msg = f"Failed to store data for {symbol}: {storage_errors}"
                    logger.error(error_msg)

                # Batch-level errors are reported even when the symbol as a
                # whole succeeded, so partial failures are visible in the summary.
                master_results['errors'].extend(storage_errors)

                # Brief pause between symbols
                time.sleep(0.5)

            except Exception as e:
                error_msg = f"Critical error processing {symbol}: {e}"
                logger.error(error_msg)
                master_results['errors'].append(error_msg)

            finally:
                # Runs on every path, including the early `continue`s above
                symbol_pbar.update(1)

    # Calculate final statistics
    total_time = time.time() - start_time
    master_results['end_time'] = datetime.now()
    master_results['total_processing_time'] = total_time
    if symbols:
        master_results['success_rate'] = len(master_results['symbols_processed']) / len(symbols) * 100
    else:
        # Nothing requested: avoid dividing by zero
        master_results['success_rate'] = 0.0

    # Final report
    logger.info("\n" + "="*60)
    logger.info("🎯 HISTORICAL DATA BACKFILL COMPLETED!")
    logger.info("="*60)
    logger.info(f"✅ Symbols processed: {len(master_results['symbols_processed'])}/{len(symbols)}")
    logger.info(f"📊 Success rate: {master_results['success_rate']:.1f}%")
    logger.info(f"📈 Total trading days: {master_results['total_trading_days']}")
    logger.info(f"💾 Market records: {master_results['total_market_records']}")
    logger.info(f"📊 Indicator records: {master_results['total_indicator_records']}")
    logger.info(f"⏱️  Total time: {total_time:.2f}s")

    if master_results['errors']:
        logger.warning(f"⚠️  Errors encountered: {len(master_results['errors'])}")

    return master_results

def validate_historical_data(symbols=None):
    """
    Run coverage and quality queries over the stored historical data.

    Three aggregate queries are executed, each restricted to `symbols`:
      * 'market_coverage': row count, first/last trade date and average
        volume per symbol (LEFT JOIN, so symbols without rows still appear)
      * 'indicators_coverage': indicator row count plus non-NULL counts of
        sma_5 and roc_5 per symbol (LEFT JOIN)
      * 'data_quality': zero-volume days and rows where high < low per symbol
        (inner JOIN, so symbols without market data are absent)

    Args:
        symbols (iterable[str] | None): Symbols to validate; defaults to
            TARGET_SYMBOLS. The list is passed as an expanding bind
            parameter rather than being formatted into the SQL text.

    Returns:
        dict: The three keys above, each a list of per-symbol record dicts;
        or {'error': message} if anything raised.
    """
    logger.info("🔍 Validating historical data...")

    queries = {
        # Check market data coverage
        'market_coverage': (
            "SELECT s.symbol, "
            "COUNT(md.id) as trading_days, "
            "MIN(md.trade_date) as earliest_date, "
            "MAX(md.trade_date) as latest_date, "
            "AVG(md.volume) as avg_volume "
            "FROM symbols s "
            "LEFT JOIN market_data md ON s.id = md.symbol_id "
            "WHERE s.symbol IN :symbols "
            "GROUP BY s.symbol "
            "ORDER BY s.symbol"
        ),
        # Check technical indicators coverage
        'indicators_coverage': (
            "SELECT s.symbol, "
            "COUNT(ti.id) as indicator_records, "
            "COUNT(CASE WHEN ti.sma_5 IS NOT NULL THEN 1 END) as sma_5_count, "
            "COUNT(CASE WHEN ti.roc_5 IS NOT NULL THEN 1 END) as roc_5_count "
            "FROM symbols s "
            "LEFT JOIN technical_indicators ti ON s.id = ti.symbol_id "
            "WHERE s.symbol IN :symbols "
            "GROUP BY s.symbol "
            "ORDER BY s.symbol"
        ),
        # Data quality checks
        'data_quality': (
            "SELECT s.symbol, "
            "COUNT(CASE WHEN md.volume = 0 THEN 1 END) as zero_volume_days, "
            "COUNT(CASE WHEN md.high_price < md.low_price THEN 1 END) as price_anomalies "
            "FROM symbols s "
            "JOIN market_data md ON s.id = md.symbol_id "
            "WHERE s.symbol IN :symbols "
            "GROUP BY s.symbol "
            "ORDER BY s.symbol"
        ),
    }

    try:
        symbol_list = list(TARGET_SYMBOLS if symbols is None else symbols)
        engine = get_database_connection()
        validation_results = {}

        with engine.connect() as conn:
            for result_key, sql in queries.items():
                # `expanding=True` renders `IN :symbols` as a parenthesised
                # list with one placeholder per element at execution time.
                statement = sqlalchemy.text(sql).bindparams(
                    sqlalchemy.bindparam('symbols', expanding=True)
                )
                frame = pd.read_sql(statement, conn, params={'symbols': symbol_list})
                validation_results[result_key] = frame.to_dict('records')

        logger.info("✅ Data validation completed successfully")
        return validation_results

    except Exception as e:
        logger.error(f"❌ Validation failed: {e}")
        return {'error': str(e)}

# Announce the helpers defined in this cell (single write, same text as before)
print("\n".join((
    "🚀 Historical backfill orchestrator ready!",
    "📊 Functions available:",
    "  • run_historical_backfill() - Execute complete backfill",
    "  • validate_historical_data() - Validate stored data",
    "\n💡 Ready to process 1+ year of historical data for all symbols!",
)))


🚀 Historical backfill orchestrator ready!
📊 Functions available:
  • run_historical_backfill() - Execute complete backfill
  • validate_historical_data() - Validate stored data

💡 Ready to process 1+ year of historical data for all symbols!


In [5]:
# 🚀 EXECUTE HISTORICAL DATA BACKFILL
# Header, expectations and the "this may take a while" notice
print("\n".join((
    "=" * 60,
    "🎯 PHASE 1C: HISTORICAL DATA BACKFILL",
    "=" * 60,
    f"📊 Target Symbols: {TARGET_SYMBOLS}",
    "📅 Expected Time Range: ~1 year (252+ trading days)",
    "💾 Expected Records: ~750+ market records + 750+ indicator records",
    "",
    "⚠️  This will process large amounts of data and may take several minutes.",
    "💡 Progress will be shown with detailed logging and progress bars.",
    "",
    "🚀 Starting historical data backfill...",
)))

# Run the full pipeline for the default symbol list
backfill_results = run_historical_backfill()

# Results summary
print("\n" + "=" * 60)
print("📊 BACKFILL RESULTS SUMMARY")
print("=" * 60)

if not (backfill_results['success_rate'] > 0):
    print("❌ Backfill failed completely")
else:
    print(f"✅ Success Rate: {backfill_results['success_rate']:.1f}%")
    print(f"📈 Symbols Processed: {len(backfill_results['symbols_processed'])}")
    print(f"📊 Total Trading Days: {backfill_results['total_trading_days']}")
    print(f"💾 Market Records: {backfill_results['total_market_records']}")
    print(f"📊 Indicator Records: {backfill_results['total_indicator_records']}")
    print(f"⏱️  Total Processing Time: {backfill_results['total_processing_time']:.2f}s")

    # Per-symbol timing; skipped when no symbol recorded a processing time
    symbol_timings = backfill_results['processing_times']
    if symbol_timings:
        avg_time = sum(symbol_timings.values()) / len(symbol_timings)
        print(f"⚡ Average Time per Symbol: {avg_time:.2f}s")

        print("\n📊 Per-Symbol Performance:")
        # Record count shown per symbol is the overall average, hence the "~"
        records_per_symbol = backfill_results['total_trading_days'] // len(backfill_results['symbols_processed'])
        for symbol, proc_time in symbol_timings.items():
            print(f"  • {symbol}: {proc_time:.2f}s (~{records_per_symbol} records)")

# Error digest: at most five messages, then a count of the remainder
backfill_errors = backfill_results['errors']
if backfill_errors:
    print(f"\n⚠️  Errors Encountered: {len(backfill_errors)}")
    for error in backfill_errors[:5]:
        print(f"  • {error}")
    hidden_error_count = len(backfill_errors) - 5
    if hidden_error_count > 0:
        print(f"  ... and {hidden_error_count} more errors")

print("\n🎉 Phase 1C Historical Data Backfill Complete!")
print("Next: Run validation to verify data quality and completeness.")


2025-06-28 19:36:26,808 - INFO - 🚀 Starting Historical Data Backfill - Phase 1C
2025-06-28 19:36:26,809 - INFO - 📊 Processing symbols: ['INTC', 'AMD', 'NVDA']
2025-06-28 19:36:26,809 - INFO - 📅 Date range: 2024-05-24 to 2025-06-28


🎯 PHASE 1C: HISTORICAL DATA BACKFILL
📊 Target Symbols: ['INTC', 'AMD', 'NVDA']
📅 Expected Time Range: ~1 year (252+ trading days)
💾 Expected Records: ~750+ market records + 750+ indicator records

⚠️  This will process large amounts of data and may take several minutes.
💡 Progress will be shown with detailed logging and progress bars.

🚀 Starting historical data backfill...


Processing symbols:   0%|          | 0/3 [00:00<?, ?it/s]

2025-06-28 19:36:26,813 - INFO - 
📈 Processing INTC...
2025-06-28 19:36:26,813 - INFO - 📥 Fetching historical data for INTC...
2025-06-28 19:36:26,925 - INFO - ✅ Retrieved 273 days of data for INTC
2025-06-28 19:36:26,926 - INFO - 📊 Calculating technical indicators for INTC...
2025-06-28 19:36:26,930 - INFO - 💾 Storing 273 records for INTC...
2025-06-28 19:36:26,945 - INFO - Processing 273 records for INTC (ID: 1)


Storing INTC data:   0%|          | 0/273 [00:00<?, ?it/s]

2025-06-28 19:36:27,194 - INFO - ✅ Completed INTC: 273 records processed
2025-06-28 19:36:27,195 - INFO - ✅ INTC completed successfully!
2025-06-28 19:36:27,195 - INFO - 📊 Records: 273
2025-06-28 19:36:27,195 - INFO - ⏱️  Processing time: 0.38s
2025-06-28 19:36:27,700 - INFO - 
📈 Processing AMD...
2025-06-28 19:36:27,701 - INFO - 📥 Fetching historical data for AMD...
2025-06-28 19:36:27,911 - INFO - ✅ Retrieved 273 days of data for AMD
2025-06-28 19:36:27,912 - INFO - 📊 Calculating technical indicators for AMD...
2025-06-28 19:36:27,916 - INFO - 💾 Storing 273 records for AMD...
2025-06-28 19:36:27,932 - INFO - Processing 273 records for AMD (ID: 2)


Storing AMD data:   0%|          | 0/273 [00:00<?, ?it/s]

2025-06-28 19:36:28,185 - INFO - ✅ Completed AMD: 273 records processed
2025-06-28 19:36:28,185 - INFO - ✅ AMD completed successfully!
2025-06-28 19:36:28,186 - INFO - 📊 Records: 273
2025-06-28 19:36:28,186 - INFO - ⏱️  Processing time: 0.48s
2025-06-28 19:36:28,688 - INFO - 
📈 Processing NVDA...
2025-06-28 19:36:28,689 - INFO - 📥 Fetching historical data for NVDA...
2025-06-28 19:36:28,941 - INFO - ✅ Retrieved 273 days of data for NVDA
2025-06-28 19:36:28,941 - INFO - 📊 Calculating technical indicators for NVDA...
2025-06-28 19:36:28,946 - INFO - 💾 Storing 273 records for NVDA...
2025-06-28 19:36:28,958 - INFO - Processing 273 records for NVDA (ID: 3)


Storing NVDA data:   0%|          | 0/273 [00:00<?, ?it/s]

2025-06-28 19:36:29,205 - INFO - ✅ Completed NVDA: 273 records processed
2025-06-28 19:36:29,206 - INFO - ✅ NVDA completed successfully!
2025-06-28 19:36:29,206 - INFO - 📊 Records: 273
2025-06-28 19:36:29,207 - INFO - ⏱️  Processing time: 0.52s
2025-06-28 19:36:29,708 - INFO - 
2025-06-28 19:36:29,710 - INFO - 🎯 HISTORICAL DATA BACKFILL COMPLETED!
2025-06-28 19:36:29,712 - INFO - ✅ Symbols processed: 3/3
2025-06-28 19:36:29,715 - INFO - 📊 Success rate: 100.0%
2025-06-28 19:36:29,716 - INFO - 📈 Total trading days: 819
2025-06-28 19:36:29,717 - INFO - 💾 Market records: 819
2025-06-28 19:36:29,718 - INFO - 📊 Indicator records: 819
2025-06-28 19:36:29,718 - INFO - ⏱️  Total time: 2.90s



📊 BACKFILL RESULTS SUMMARY
✅ Success Rate: 100.0%
📈 Symbols Processed: 3
📊 Total Trading Days: 819
💾 Market Records: 819
📊 Indicator Records: 819
⏱️  Total Processing Time: 2.90s
⚡ Average Time per Symbol: 0.46s

📊 Per-Symbol Performance:
  • INTC: 0.38s (~273 records)
  • AMD: 0.48s (~273 records)
  • NVDA: 0.52s (~273 records)

🎉 Phase 1C Historical Data Backfill Complete!
Next: Run validation to verify data quality and completeness.


In [6]:
# 🔍 VALIDATE HISTORICAL DATA
print("="*60)
print("🔍 DATA VALIDATION & QUALITY ASSURANCE")
print("="*60)

# Success criteria for this phase
EXPECTED_SYMBOL_COUNT = len(TARGET_SYMBOLS)  # every configured symbol must have market data
MIN_MARKET_RECORDS = 750                     # matches the "~750+ market records" expectation of the backfill cell

# Run comprehensive validation
validation_results = validate_historical_data()

if 'error' not in validation_results:
    print("✅ Data validation completed successfully!\n")
    
    # Market Data Coverage Report
    print("📊 MARKET DATA COVERAGE")
    print("-" * 40)
    market_coverage = validation_results['market_coverage']
    
    for symbol_data in market_coverage:
        symbol = symbol_data['symbol']
        days = symbol_data['trading_days']
        earliest = symbol_data['earliest_date']
        latest = symbol_data['latest_date']
        avg_vol = symbol_data['avg_volume']
        
        print(f"📈 {symbol}:")
        print(f"  • Trading Days: {days}")
        print(f"  • Date Range: {earliest} to {latest}")
        # AVG over zero rows is NULL, which can surface as None or NaN;
        # pd.notna covers both (a plain truthiness test treats NaN as present).
        if pd.notna(avg_vol):
            print(f"  • Avg Volume: {avg_vol:,.0f}")
        else:
            print("  • Avg Volume: N/A")
        print()
    
    # Technical Indicators Coverage Report
    print("📊 TECHNICAL INDICATORS COVERAGE")
    print("-" * 40)
    indicators_coverage = validation_results['indicators_coverage']
    
    for symbol_data in indicators_coverage:
        symbol = symbol_data['symbol']
        indicator_records = symbol_data['indicator_records']
        sma_5_count = symbol_data['sma_5_count']
        roc_5_count = symbol_data['roc_5_count']
        
        print(f"📈 {symbol}:")
        print(f"  • Indicator Records: {indicator_records}")
        # Percentages are only meaningful when there is at least one record
        if indicator_records > 0:
            print(f"  • SMA-5 Coverage: {sma_5_count}/{indicator_records} ({sma_5_count/indicator_records*100:.1f}%)")
            print(f"  • ROC-5 Coverage: {roc_5_count}/{indicator_records} ({roc_5_count/indicator_records*100:.1f}%)")
        else:
            print("  • SMA-5 Coverage: N/A")
            print("  • ROC-5 Coverage: N/A")
        print()
    
    # Data Quality Report
    print("🔍 DATA QUALITY CHECKS")
    print("-" * 40)
    quality_data = validation_results['data_quality']
    
    total_anomalies = 0
    for symbol_data in quality_data:
        symbol = symbol_data['symbol']
        zero_volume = symbol_data['zero_volume_days']
        price_anomalies = symbol_data['price_anomalies']
        total_anomalies += zero_volume + price_anomalies
        
        print(f"📈 {symbol}:")
        print(f"  • Zero Volume Days: {zero_volume}")
        print(f"  • Price Anomalies: {price_anomalies}")
        print()
    
    if total_anomalies == 0:
        print("✅ No data quality issues detected!")
    else:
        print(f"⚠️  Total anomalies found: {total_anomalies}")
    
    # Summary Statistics
    print("\n" + "="*60)
    print("📋 PHASE 1C COMPLETION SUMMARY")
    print("="*60)
    
    total_market_records = sum(item['trading_days'] for item in market_coverage)
    total_indicator_records = sum(item['indicator_records'] for item in indicators_coverage)
    symbols_with_data = len([item for item in market_coverage if item['trading_days'] > 0])
    
    # Share of stored records (market + indicator) not flagged as anomalous
    total_records = total_market_records + total_indicator_records
    if total_records > 0:
        quality_score = f"{(total_records - total_anomalies) / total_records * 100:.1f}%"
    else:
        quality_score = "N/A"
    
    print(f"✅ Symbols Successfully Processed: {symbols_with_data}/{EXPECTED_SYMBOL_COUNT}")
    print(f"📊 Total Market Data Records: {total_market_records:,}")
    print(f"📈 Total Technical Indicator Records: {total_indicator_records:,}")
    # The label is always printed, also when there is nothing to score
    print(f"🎯 Data Quality Score: {quality_score}")
    
    # Success criteria check
    if total_market_records >= MIN_MARKET_RECORDS and symbols_with_data == EXPECTED_SYMBOL_COUNT:
        print("\n🎉 PHASE 1C COMPLETE - ALL SUCCESS CRITERIA MET!")
        print("✅ 1+ year of historical data collected")
        print(f"✅ All {EXPECTED_SYMBOL_COUNT} symbols ({', '.join(TARGET_SYMBOLS)}) processed")
        print("✅ Technical indicators calculated and stored")
        print("✅ Data quality validated")
        print("\n🚀 Ready for Phase 1D: Strategy Development & Backtesting!")
    else:
        print("\n⚠️  Some success criteria not fully met:")
        if total_market_records < MIN_MARKET_RECORDS:
            print(f"  • Need more historical data ({total_market_records}/{MIN_MARKET_RECORDS}+ records)")
        if symbols_with_data < EXPECTED_SYMBOL_COUNT:
            print(f"  • Missing data for some symbols ({symbols_with_data}/{EXPECTED_SYMBOL_COUNT} symbols)")

else:
    print(f"❌ Validation failed: {validation_results['error']}")

print("\n" + "="*60)
print("Phase 1C Historical Data Backfill - COMPLETE")
print("="*60)


2025-06-28 19:36:48,349 - INFO - 🔍 Validating historical data...
2025-06-28 19:36:48,377 - INFO - ✅ Data validation completed successfully


🔍 DATA VALIDATION & QUALITY ASSURANCE
✅ Data validation completed successfully!

📊 MARKET DATA COVERAGE
----------------------------------------
📈 AMD:
  • Trading Days: 273
  • Date Range: 2024-05-24 to 2025-06-27
  • Avg Volume: 43,071,575

📈 INTC:
  • Trading Days: 273
  • Date Range: 2024-05-24 to 2025-06-27
  • Avg Volume: 81,708,799

📈 NVDA:
  • Trading Days: 273
  • Date Range: 2024-05-24 to 2025-06-27
  • Avg Volume: 282,966,432

📊 TECHNICAL INDICATORS COVERAGE
----------------------------------------
📈 AMD:
  • Indicator Records: 273
  • SMA-5 Coverage: 273/273 (100.0%)
  • ROC-5 Coverage: 268/273 (98.2%)

📈 INTC:
  • Indicator Records: 273
  • SMA-5 Coverage: 273/273 (100.0%)
  • ROC-5 Coverage: 268/273 (98.2%)

📈 NVDA:
  • Indicator Records: 273
  • SMA-5 Coverage: 273/273 (100.0%)
  • ROC-5 Coverage: 268/273 (98.2%)

🔍 DATA QUALITY CHECKS
----------------------------------------
📈 AMD:
  • Zero Volume Days: 0
  • Price Anomalies: 0

📈 INTC:
  • Zero Volume Days: 0
  • Price