In [1]:
# =============================================================================
# NOTEBOOK: 02_batch_generation_final_validation_v2.ipynb (Corrected Import)
#
# PURPOSE:
# To perform a final, small-scale "dress rehearsal" of the historical
# factor generation process, specifically validating the batching mechanism.
#
# VERSION 2.0 CORRECTIONS:
# - Fixed `AttributeError` by importing `get_universe` directly from the
#   `run_factor_generation` script, not as an engine method.
# - Added corrected batch_insert_factor_scores function with chunking
# =============================================================================

import pandas as pd
from sqlalchemy import create_engine, text
from pathlib import Path
import sys
import yaml
from tqdm.notebook import tqdm
from datetime import datetime # Added for datetime.strptime

# FORCE RELOAD - Clear cached modules
import importlib
# Production modules whose cached copies are evicted from `sys.modules`, so
# that a later `import` of them loads a fresh module object instead of
# reusing one cached earlier in this interpreter session.
modules_to_reload = ['run_factor_generation', 'qvm_engine_v2_enhanced']
for module_name in modules_to_reload:
    # pop() with a default is a no-op when the module was never imported.
    sys.modules.pop(module_name, None)
print("✅ Cleared cached modules")

# --- 1. ENVIRONMENT SETUP ---
# Locates the project root (the nearest ancestor of the working directory that
# contains a `production/` directory), puts the engine and scripts directories
# on `sys.path`, and imports the production engine class and script helpers.
print("="*70)
print("🚀 Batch Generation Final Validation Test (v2 - Corrected Import)")
print("="*70)

try:
    # Search the working directory and each of its ancestors exactly once.
    # `Path.parents` ends at the filesystem root, so the search is bounded;
    # repeatedly taking `.parent` would never terminate at the root because
    # the root is its own parent.
    start_dir = Path.cwd()
    project_root = next(
        (
            candidate
            for candidate in (start_dir, *start_dir.parents)
            if (candidate / 'production').exists()
        ),
        None,
    )
    if project_root is None:
        raise FileNotFoundError(
            f"No 'production' directory found in '{start_dir}' or any of its parent directories."
        )
    print(f"✅ Project root identified at: {project_root}")

    # Both inserts go to position 0, so the scripts directory ends up ahead of
    # the engine directory on the module search path.
    sys.path.insert(0, str(project_root / 'production' / 'engine'))
    sys.path.insert(0, str(project_root / 'production' / 'scripts'))

    from qvm_engine_v2_enhanced import QVMEngineV2Enhanced
    from run_factor_generation import get_trading_dates, clear_existing_factor_scores, get_universe
    print("✅ Successfully imported production engine and script functions.")
except Exception as e:
    # Report the failure in the notebook output, then re-raise so the cell
    # stops instead of continuing with a half-configured environment.
    print(f"❌ CRITICAL ERROR: Could not set up environment. {e}")
    raise

# =============================================================================
# CORRECTED BATCH INSERT FUNCTION
# =============================================================================
def _composite_score(components: dict, key: str) -> float:
    """Return ``components[key]`` as a float rounded to 10 decimals; 0.0 if missing or None."""
    value = components.get(key)
    if value is None:
        # A key that is present but None is treated like a missing key
        # instead of crashing in float(None).
        return 0.0
    return round(float(value), 10)


def batch_insert_factor_scores_corrected(engine, factor_scores: list, strategy_version: str,
                                         chunk_size: int = 50):
    """
    Insert factor scores into ``factor_scores_qvm`` in chunks, atomically.

    Each composite value is rounded to 10 decimal places before insertion
    (defensive rounding for the DECIMAL(20,10) columns this function was
    written for). Records are sent in chunks of ``chunk_size`` rows, all
    inside a single transaction, so a failure part-way through rolls back
    the whole batch instead of leaving it partially inserted.

    Args:
        engine: SQLAlchemy engine; only ``engine.begin()`` is used.
        factor_scores: List of dicts with keys ``'ticker'``, ``'date'`` and
            ``'components'``. ``'components'`` is a mapping that may contain
            ``Quality_Composite``, ``Value_Composite``, ``Momentum_Composite``
            and ``QVM_Composite``; missing or None values are stored as 0.0.
        strategy_version: Value written to the ``strategy_version`` column of
            every inserted row.
        chunk_size: Maximum number of rows per ``execute`` call (default 50).

    Returns:
        None. Returns immediately when ``factor_scores`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        Exception: Any error raised while building or inserting the records
            is printed and re-raised unchanged.
    """
    if not factor_scores:
        return
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Bound before the try block so the except handler can always inspect it,
    # even when the failure happens before any record has been converted.
    db_records = []

    try:
        insert_query = text("""
        INSERT INTO factor_scores_qvm (
            ticker, date, Quality_Composite, Value_Composite, Momentum_Composite, QVM_Composite,
            calculation_timestamp, strategy_version
        )
        VALUES (
            :ticker, :date, :Quality_Composite, :Value_Composite, :Momentum_Composite, :QVM_Composite,
            NOW(), :strategy_version
        )
        """)

        # Convert and round all records. An explicit append loop (rather than
        # a comprehension) keeps the already-converted records available for
        # the diagnostic sample if a later record fails to convert.
        for record in factor_scores:
            components = record['components']
            db_records.append({
                'ticker': record['ticker'],
                'date': record['date'],
                'Quality_Composite': _composite_score(components, 'Quality_Composite'),
                'Value_Composite': _composite_score(components, 'Value_Composite'),
                'Momentum_Composite': _composite_score(components, 'Momentum_Composite'),
                'QVM_Composite': _composite_score(components, 'QVM_Composite'),
                'strategy_version': strategy_version
            })

        total_inserted = 0

        # One transaction for the whole batch: chunking bounds the size of
        # each executemany call, while the single commit keeps the batch
        # all-or-nothing.
        with engine.begin() as conn:
            chunk_starts = range(0, len(db_records), chunk_size)
            for chunk_number, start in enumerate(chunk_starts, start=1):
                chunk = db_records[start:start + chunk_size]
                conn.execute(insert_query, chunk)
                total_inserted += len(chunk)
                print(f"      Inserted chunk {chunk_number}: {len(chunk)} records")

        print(f"    ✅ Successfully inserted {total_inserted} factor score records")

    except Exception as e:
        print(f"    ❌ Failed to insert factor scores: {e}")
        if db_records:
            print(f"    Sample record: {db_records[0]}")
        raise

# --- 2. TEST PARAMETERS ---
# Date window (inclusive strings in YYYY-MM-DD form) handed to the production
# helpers, the throw-away strategy version used to tag the test rows, and the
# number of trading dates processed per batch.
TEST_START_DATE, TEST_END_DATE = '2024-01-01', '2024-01-02'
TEST_VERSION = 'batch_test_v1'
BATCH_SIZE = 1

# Echo the configuration: one call, one line per setting.
print(
    f"\n🎯 Test Period: {TEST_START_DATE} to {TEST_END_DATE}",
    f"🎯 Test Version: {TEST_VERSION}",
    f"🎯 Batch Size: {BATCH_SIZE} dates",
    sep="\n",
)

# --- 3. INITIALIZE ENGINE & DB CONNECTION ---
# Builds the production QVM engine from the project's `config` directory and
# exposes the engine's own database handle (`qvm_engine.engine`) as
# `db_engine` for the script helpers used later in the notebook.
print("\n🔧 Initializing engine and database connection...")
try:
    config_path = project_root.joinpath('config')
    qvm_engine = QVMEngineV2Enhanced(
        log_level='WARNING',
        config_path=str(config_path),
    )
    db_engine = qvm_engine.engine
    print("✅ Engine and DB connection ready.")
except Exception as e:
    # Surface the reason in the cell output, then let the error propagate.
    print(f"❌ Initialization failed: {e}")
    raise

# --- 4. SIMULATE THE GENERATION SCRIPT'S LOGIC ---
# Runs the same steps as the production generation script against the test
# window: clear old rows for the test version, fetch trading dates and the
# ticker universe, compute factor scores date by date in batches, insert each
# batch, then verify the row and date counts in the database. The `finally`
# clause always deletes every row tagged with the test version.
print("\n" + "="*70)
print("🏃‍♂️ Simulating `run_factor_generation.py` Workflow")
print("="*70)

try:
    print(f"Step 1: Clearing any existing data for version '{TEST_VERSION}'...")
    clear_existing_factor_scores(db_engine, TEST_START_DATE, TEST_END_DATE, TEST_VERSION)

    print("\nStep 2: Fetching trading dates for the period...")
    trading_dates = get_trading_dates(db_engine, TEST_START_DATE, TEST_END_DATE)
    print(f"    - Found {len(trading_dates)} trading dates.")

    print("\nStep 3: Fetching full ticker universe...")
    universe = get_universe(db_engine)
    print(f"    - Found {len(universe)} tickers.")

    print(f"\nStep 4: Processing {len(trading_dates)} dates in batches of {BATCH_SIZE}...")
    total_dates = len(trading_dates)
    total_records_generated = 0

    for i in range(0, total_dates, BATCH_SIZE):
        # The slice is never empty because `i` is always < total_dates.
        batch_dates = trading_dates[i:i + BATCH_SIZE]
        batch_num = (i // BATCH_SIZE) + 1
        print(f"\n--- Processing Batch {batch_num} ({len(batch_dates)} dates from {batch_dates[0]} to {batch_dates[-1]}) ---")

        batch_records_to_insert = []
        for date_str in tqdm(batch_dates, desc=f"Batch {batch_num}"):
            analysis_date = pd.Timestamp(date_str)
            factor_scores_dict = qvm_engine.calculate_qvm_composite(analysis_date, universe)

            # A falsy result (None or empty) contributes no rows for this
            # date; the date-count check below will then flag the gap.
            if factor_scores_dict:
                batch_records_to_insert.extend(
                    {
                        'ticker': ticker,
                        'date': analysis_date.date(),
                        'components': components
                    }
                    for ticker, components in factor_scores_dict.items()
                )

        print(f"    - Batch {batch_num}: Calculated {len(batch_records_to_insert)} factor scores.")

        if batch_records_to_insert:
            print(f"    - Batch {batch_num}: Inserting records into database...")
            # USE THE CORRECTED FUNCTION
            batch_insert_factor_scores_corrected(db_engine, batch_records_to_insert, TEST_VERSION)
            total_records_generated += len(batch_records_to_insert)
            print(f"    - Batch {batch_num}: Insertion complete.")

    print("\n✅ All batches processed successfully.")

    # --- 5. VERIFICATION ---
    print("\n" + "="*70)
    print("🔬 STEP 5: Verifying the Results in the Database")
    print("="*70)

    verify_query = text("SELECT date, COUNT(*) as record_count FROM factor_scores_qvm WHERE strategy_version = :version GROUP BY date ORDER BY date")
    with db_engine.connect() as connection:
        verification_df = pd.read_sql(verify_query, connection, params={'version': TEST_VERSION})

    # Compute the observed totals once and compare them explicitly, so the
    # summary lines report expected vs. actual instead of asserting a match
    # that has not been checked yet.
    records_in_db = int(verification_df['record_count'].sum())
    dates_in_db = len(verification_df)
    records_match = records_in_db == total_records_generated
    dates_match = dates_in_db == total_dates

    print(f"Total records inserted: {records_in_db} (expected {total_records_generated})")
    print(f"Total dates inserted:   {dates_in_db} (expected {total_dates})")

    print("\n--- Sample of Inserted Data ---")
    # Using print(verification_df.to_string()) for consistent output in environments where display() might not render
    print(verification_df.to_string())

    if total_records_generated == 0:
        # 0 == 0 would otherwise pass vacuously without exercising any insert.
        print("\n❌ FAILURE: No factor scores were generated, so the batch process was not actually validated.")
    elif records_match and dates_match:
        print("\n🎉 SUCCESS: The batch generation and insertion process is validated.")
    else:
        print(
            "\n❌ FAILURE: The database contents do not match what was generated "
            f"(records: {records_in_db} vs {total_records_generated}, "
            f"dates: {dates_in_db} vs {total_dates})."
        )

finally:
    # --- 6. CLEANUP ---
    # Runs whether or not the steps above succeeded, so test rows never
    # linger in the shared table.
    print("\n" + "="*70)
    print("🧹 STEP 6: Cleaning Up Test Data")
    print("="*70)
    with db_engine.begin() as connection:
        connection.execute(text("DELETE FROM factor_scores_qvm WHERE strategy_version = :version"), {'version': TEST_VERSION})
    print(f"✅ All records for version '{TEST_VERSION}' have been deleted.")

2025-07-25 18:39:22,971 - run_factor_generation - INFO - 🧹 Clearing existing factor scores for VERSION batch_test_v1 from 2024-01-01 to 2024-01-02
2025-07-25 18:39:22,979 - run_factor_generation - INFO - ✅ Cleared 0 records for version batch_test_v1 (other versions preserved)
2025-07-25 18:39:22,980 - run_factor_generation - INFO - 📅 Fetching trading dates from 2024-01-01 to 2024-01-02
2025-07-25 18:39:22,991 - run_factor_generation - INFO - ✅ Found 1 trading dates
2025-07-25 18:39:22,992 - run_factor_generation - INFO - 📊 Date range: 2024-01-02 to 2024-01-02
2025-07-25 18:39:22,992 - run_factor_generation - INFO - 📋 Fetching investment universe...
2025-07-25 18:39:23,000 - run_factor_generation - INFO - ✅ Found 728 tickers in universe


✅ Cleared cached modules
🚀 Batch Generation Final Validation Test (v2 - Corrected Import)
✅ Project root identified at: /Users/ducnguyen/Library/CloudStorage/GoogleDrive-duc.nguyentcb@gmail.com/My Drive/quant-world-invest/factor_investing_project
✅ Successfully imported production engine and script functions.

🎯 Test Period: 2024-01-01 to 2024-01-02
🎯 Test Version: batch_test_v1
🎯 Batch Size: 1 dates

🔧 Initializing engine and database connection...
✅ Engine and DB connection ready.

🏃‍♂️ Simulating `run_factor_generation.py` Workflow
Step 1: Clearing any existing data for version 'batch_test_v1'...

Step 2: Fetching trading dates for the period...
    - Found 1 trading dates.

Step 3: Fetching full ticker universe...
    - Found 728 tickers.

Step 4: Processing 1 dates in batches of 1...

--- Processing Batch 1 (1 dates from 2024-01-02 to 2024-01-02) ---


Batch 1:   0%|          | 0/1 [00:00<?, ?it/s]



    - Batch 1: Calculated 712 factor scores.
    - Batch 1: Inserting records into database...
      Inserted chunk 1: 50 records
      Inserted chunk 2: 50 records
      Inserted chunk 3: 50 records
      Inserted chunk 4: 50 records
      Inserted chunk 5: 50 records
      Inserted chunk 6: 50 records
      Inserted chunk 7: 50 records
      Inserted chunk 8: 50 records
      Inserted chunk 9: 50 records
      Inserted chunk 10: 50 records
      Inserted chunk 11: 50 records
      Inserted chunk 12: 50 records
      Inserted chunk 13: 50 records
      Inserted chunk 14: 50 records
      Inserted chunk 15: 12 records
    ✅ Successfully inserted 712 factor score records
    - Batch 1: Insertion complete.

✅ All batches processed successfully.

🔬 STEP 5: Verifying the Results in the Database
Total records inserted: 712 (matches 712)
Total dates inserted:   1

--- Sample of Inserted Data ---
         date  record_count
0  2024-01-02           712

🎉 SUCCESS: The batch generation and inse