## Mock Lambda and/or S3Loader Streamer Implementation. 

In [25]:
%load_ext autoreload
%autoreload 2

%reload_ext autoreload

# CELL 0: Bootstrap 
from pathlib import Path
import sys

current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    raise RuntimeError("Cannot find ModelPipeline")

if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"✓ Model root: {model_root}")


from finrag_ml_tg1.loaders.ml_config_loader import MLConfig  


# # Suppress noisy logs for clean notebook output
# logging.getLogger().setLevel(logging.WARNING)
# logging.getLogger("finrag_ml_tg1").setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
✓ Model root: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline


In [26]:
## First isolation test.

# Minimal S3StreamingLoader test
import os
os.environ['AWS_LAMBDA_FUNCTION_NAME'] = 'test'

from finrag_ml_tg1.loaders.ml_config_loader import MLConfig
from finrag_ml_tg1.loaders.data_loader_strategy import S3StreamingLoader

config = MLConfig()
loader = S3StreamingLoader(config)

# Test each method
df1 = loader.load_stage2_meta()
df2 = loader.load_dimension_companies()
df3 = loader.load_dimension_sections()
df4 = loader.load_kpi_fact_data()

print(f"✓ Stage 2 Meta: {len(df1):,} rows")
print(f"✓ Companies: {len(df2):,} rows")
print(f"✓ Sections: {len(df3):,} rows")
print(f"✓ KPI Data: {len(df4):,} rows")




INFO     | finrag_ml_tg1.loaders.data_loader_strategy | S3StreamingLoader initialized (cache: \tmp\finrag_cache)
INFO     | finrag_ml_tg1.loaders.data_loader_strategy | Loading Stage 2 Meta from /tmp cache


[DEBUG] ✓ Found ModelPipeline via file path: D:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env


INFO     | finrag_ml_tg1.loaders.data_loader_strategy |   ✓ Loaded 469,252 rows from cache
INFO     | finrag_ml_tg1.loaders.data_loader_strategy | Loading KPI Fact Data from /tmp cache
INFO     | finrag_ml_tg1.loaders.data_loader_strategy |   ✓ Loaded 6,207 rows from cache


✓ Stage 2 Meta: 469,252 rows
✓ Companies: 21 rows
✓ Sections: 21 rows
✓ KPI Data: 6,207 rows


### learn key pattern:
```
# Mock Lambda environment
os.environ['AWS_LAMBDA_FUNCTION_NAME'] = 'anything-non-empty'

# Now MLConfig detects "Lambda"
config = MLConfig()
# config.is_lambda_environment = True
# config.data_loading_mode = 'S3_STREAMING'

# Factory creates S3StreamingLoader
data_loader = create_data_loader(config)
# type(data_loader) = S3StreamingLoader

# All data comes from S3
df = data_loader.load_stage2_meta()
# Downloads from s3://sentence-data-ingestion/ML_EMBED_ASSETS/...
```

In [27]:
# ============================================================================
# CELL 1: Setup - Path Resolution & Load Gold Test Suite
# ============================================================================

from pathlib import Path
import sys
import logging
import json

# Suppress noisy logs for clean notebook output
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("finrag_ml_tg1").setLevel(logging.INFO)

# Find ModelPipeline root and add to sys.path
current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"✓ ModelPipeline root: {model_root}")
print(f"✓ Notebook location: {Path.cwd()}\n")

# Construct absolute path to gold test suite
gold_path = model_root / "finrag_ml_tg1" / "data_cache" / "qa_manual_exports" / "goldp3_analysis" / "p3_gold_test_suite_31q.json"

if not gold_path.exists():
    raise FileNotFoundError(f"Gold test suite not found at: {gold_path}")

print(f"✓ Gold test suite: {gold_path}\n")

# Load all questions
with gold_path.open("r", encoding="utf-8") as f:
    all_questions = json.load(f)

# Selected question IDs for testing
SELECTED_IDS = [
    "P3V3-Q001",  # Walmart Debt Strategy 2018-2020 (cross-year, medium, 4 evidence)
    "P3V2-Q006",  # Microsoft Intelligent Cloud 2017 (local, medium, 1 evidence)
    "P3V3-Q004",  # Cross-Company Cyber 2009 (cross-company, medium, 3 evidence)
    "P3V3-Q007",  # Tesla Adjusted EBITDA 2022 (local, easy, 1 evidence)
    "P3V3-Q002",  # Meta Regulatory Evolution 2019-2024 (cross-year, hard, 4 evidence)


    "P3V2-Q015",  # Walmart Market/Competitive Risks 2021
                  # local, medium, 1 evidence # COVID-related risks - comprehensive but single long paragraph
    "P3V2-Q007",  # Genworth Regulatory Risks 2019
                  # local, medium, 1 evidence # Massive bullet list (14+ risk cues) - overwhelming detail
    "P3V2-Q013",  # Walmart Operational/Supply Chain Risks 2011
                  # local, hard, 1 evidence # Long narrative about natural disasters and disruptions
    
    "P3V2-Q001",  # Exxon Mobil Total Revenue 2008
                  # local, easy, 1 evidence
                  # BAD: Asks for revenue, answer is cross-reference boilerplate
    "P3V2-Q002",  # Eli Lilly Net Income 2006
                  # local, easy, 1 evidence
                  # BAD: Asks for net income, answer discusses valuation allowance
    "P3V2-Q004",  # Johnson & Johnson Cash Flow 2016
                  # local, easy, 1 evidence
                  # BAD: Asks for cash flow, answer is auditor's opinion statement
]

# Extract selected questions into structured dictionary
test_suite = {}
for q in all_questions:
    qid = q["question_id"]
    if qid in SELECTED_IDS:
        test_suite[qid] = {
            "question_text": q["question_text"],
            "gold_answer": q["answer_text"],
            "answer_type": q["answer_type"],
            "companies": q["company_name"],
            "years": q["years"],
            "retrieval_scope": q["retrieval_scope"],
            "difficulty": q["difficulty"],
            "evidence_count": len(q["evidence_sentence_ids"]),
            "evidence_ids": q["evidence_sentence_ids"],
        }

# Display summary
print("="*80)
print(f"LOADED {len(test_suite)} TEST QUESTIONS")
print("="*80)
for qid in SELECTED_IDS:
    q = test_suite[qid]
    print(f"\n{qid}:")
    print(f"  Companies: {', '.join(q['companies'])}")
    print(f"  Years: {q['years']}")
    print(f"  Scope: {q['retrieval_scope']} | Difficulty: {q['difficulty']} | Evidence: {q['evidence_count']} sentences")

✓ ModelPipeline root: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
✓ Notebook location: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\rag_modules_src\03_LambdaRefactor_Tests

✓ Gold test suite: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\data_cache\qa_manual_exports\goldp3_analysis\p3_gold_test_suite_31q.json

LOADED 11 TEST QUESTIONS

P3V3-Q001:
  Companies: Walmart Inc.
  Years: [2018, 2019, 2020]
  Scope: cross_year | Difficulty: medium | Evidence: 4 sentences

P3V2-Q006:
  Companies: MICROSOFT CORP
  Years: [2017]
  Scope: local | Difficulty: medium | Evidence: 1 sentences

P3V3-Q004:
  Companies: RADIAN GROUP INC, NETFLIX INC, Mastercard Inc
  Years: [2009]
  Scope: cross_company | Difficulty: medium | Evidence: 3 sentences

P3V3-Q007:
  Companies: Tesla, Inc.
  Years: [2022]
  Scope: local | Difficulty: easy | Evidence: 1 sentences

P3V3-Q002:

In [28]:
# ============================================================================
# CELL 1: Force S3StreamingLoader Mode (Mock Lambda)
# ============================================================================

import os
import sys
from pathlib import Path

# Find ModelPipeline root
current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    raise RuntimeError("Cannot find 'ModelPipeline' root")

if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"✓ Model root: {model_root}")

# ════════════════════════════════════════════════════════════════════════════
# FORCE S3StreamingLoader by mocking Lambda environment
# ════════════════════════════════════════════════════════════════════════════
os.environ['AWS_LAMBDA_FUNCTION_NAME'] = 'test-local-s3-streaming'
os.environ['LAMBDA_TASK_ROOT'] = str(model_root.parent)  # Mock Lambda root

print("✓ Lambda environment mocked")
print(f"  AWS_LAMBDA_FUNCTION_NAME: {os.environ['AWS_LAMBDA_FUNCTION_NAME']}")
print(f"  LAMBDA_TASK_ROOT: {os.environ['LAMBDA_TASK_ROOT']}")

# ============================================================================
# CELL 2: Configure Logging
# ============================================================================

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)-8s | %(name)-40s | %(message)s',
    force=True
)

# Extra verbose for DataLoader
logging.getLogger('finrag_ml_tg1.loaders').setLevel(logging.DEBUG)

print("✓ Logging configured\n")

# ============================================================================
# CELL 3: Verify S3StreamingLoader Selection
# ============================================================================

from finrag_ml_tg1.loaders.ml_config_loader import MLConfig
from finrag_ml_tg1.loaders.data_loader_factory import create_data_loader

config = MLConfig()
data_loader = create_data_loader(config)

print("="*80)
print("DATALOADER ENVIRONMENT CHECK")
print("="*80)
print(f"Environment detected: {'Lambda' if config.is_lambda_environment else 'Local'}")
print(f"Data loading mode: {config.data_loading_mode}")
print(f"DataLoader type: {type(data_loader).__name__}")
print("="*80)

# Validate
assert config.is_lambda_environment, "Should detect Lambda environment"
assert config.data_loading_mode == 'S3_STREAMING', "Should use S3_STREAMING mode"
assert type(data_loader).__name__ == 'S3StreamingLoader', "Should create S3StreamingLoader"

print("\n✅ S3StreamingLoader selected successfully!")

# ============================================================================
# CELL 4: Test Individual Data Loading
# ============================================================================

print("\n" + "="*80)
print("TESTING S3 DATA LOADING")
print("="*80)

# Test 1: Load Stage 2 Meta
print("\n[1/4] Loading Stage 2 Meta from S3...")
stage2_df = data_loader.load_stage2_meta()
print(f"  ✓ Loaded {len(stage2_df):,} rows")
print(f"  ✓ Memory: {stage2_df.estimated_size('mb'):.1f} MB")
print(f"  ✓ Type: {type(stage2_df)}")

# Test 2: Load Companies Dimension
print("\n[2/4] Loading Companies Dimension from S3...")
companies_df = data_loader.load_dimension_companies()
print(f"  ✓ Loaded {len(companies_df):,} rows")

# Test 3: Load Sections Dimension
print("\n[3/4] Loading Sections Dimension from S3...")
sections_df = data_loader.load_dimension_sections()
print(f"  ✓ Loaded {len(sections_df):,} rows")

# Test 4: Load KPI Fact Data
print("\n[4/4] Loading KPI Fact Data from S3...")
kpi_df = data_loader.load_kpi_fact_data()
print(f"  ✓ Loaded {len(kpi_df):,} rows")

print("\n✅ ALL DATA LOADED FROM S3 SUCCESSFULLY!")

# ============================================================================
# CELL 5: Test /tmp Caching Behavior
# ============================================================================

print("\n" + "="*80)
print("TESTING /tmp CACHE BEHAVIOR")
print("="*80)

# Check cache directory
from pathlib import Path
import tempfile

cache_dir = Path(tempfile.gettempdir()) / 'finrag_cache'
print(f"Cache directory: {cache_dir}")
print(f"Cache exists: {cache_dir.exists()}")

if cache_dir.exists():
    cached_files = list(cache_dir.glob("*.parquet"))
    print(f"\nCached files ({len(cached_files)}):")
    for f in cached_files:
        size_mb = f.stat().st_size / (1024**2)
        print(f"  - {f.name}: {size_mb:.1f} MB")

# Test warm start (second load should be faster)
import time

print("\n[Cold Start Test]")
# Clear cache first
if cache_dir.exists():
    import shutil
    shutil.rmtree(cache_dir)
    cache_dir.mkdir(parents=True)

start = time.time()
df1 = data_loader.load_stage2_meta()
cold_time = time.time() - start
print(f"  Cold start: {cold_time:.2f}s (download from S3)")

print("\n[Warm Start Test]")
start = time.time()
df2 = data_loader.load_stage2_meta()
warm_time = time.time() - start
print(f"  Warm start: {warm_time:.2f}s (from /tmp cache)")

speedup = cold_time/warm_time if warm_time > 0 else 0
print(f"\n  Speedup: {speedup:.1f}x faster on warm start" if speedup > 0 else "\n  Speedup: N/A")

# ============================================================================
# CELL 6: Full Integration Test with S3StreamingLoader
# ============================================================================

print("\n" + "="*80)
print("FULL INTEGRATION TEST - S3StreamingLoader")
print("="*80)

from finrag_ml_tg1.rag_modules_src.synthesis_pipeline.orchestrator import answer_query

query = "What was Microsoft's revenue in 2021?"

print(f"\nQuery: {query}")
print("\nProcessing with S3StreamingLoader (all data from S3)...")
print("-"*80)

result = answer_query(
    query=query,
    model_root=model_root,
    include_kpi=True,
    include_rag=True,
    model_key=None,
    export_context=True,
    export_response=True
)

print("-"*80)

if result.get('error'):
    print(f"\n❌ ERROR: {result['error']}")
    print(f"Stage: {result['stage']}")
else:
    print(f"\n✅ SUCCESS")
    print(f"\nAnswer: {result['answer'][:200]}...")
    
    llm_meta = result['metadata']['llm']
    print(f"\nCost: ${llm_meta['cost']:.4f}")
    print(f"Tokens: {llm_meta['input_tokens']:,} in / {llm_meta['output_tokens']:,} out")

print("\n" + "="*80)
print("✅ S3StreamingLoader FULLY VALIDATED")
print("="*80)

# ============================================================================
# CELL 7: Cleanup
# ============================================================================

# Remove Lambda env vars
if 'AWS_LAMBDA_FUNCTION_NAME' in os.environ:
    del os.environ['AWS_LAMBDA_FUNCTION_NAME']
if 'LAMBDA_TASK_ROOT' in os.environ:
    del os.environ['LAMBDA_TASK_ROOT']

print("\n✓ Environment variables cleaned up")
print("✓ Back to normal local mode")


INFO     | finrag_ml_tg1.loaders.data_loader_factory | Creating data loader: mode=S3_STREAMING
INFO     | finrag_ml_tg1.loaders.data_loader_factory |   → S3StreamingLoader (Lambda or cloud environment)
INFO     | finrag_ml_tg1.loaders.data_loader_strategy | S3StreamingLoader initialized (cache: \tmp\finrag_cache)
INFO     | finrag_ml_tg1.loaders.data_loader_strategy | Loading Stage 2 Meta from /tmp cache


✓ Model root: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
✓ Lambda environment mocked
  AWS_LAMBDA_FUNCTION_NAME: test-local-s3-streaming
  LAMBDA_TASK_ROOT: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights
✓ Logging configured

[DEBUG] ✓ Lambda environment detected: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
DATALOADER ENVIRONMENT CHECK
Environment detected: Lambda
Data loading mode: S3_STREAMING
DataLoader type: S3StreamingLoader

✅ S3StreamingLoader selected successfully!

TESTING S3 DATA LOADING

[1/4] Loading Stage 2 Meta from S3...


INFO     | finrag_ml_tg1.loaders.data_loader_strategy |   ✓ Loaded 469,252 rows from cache


  ✓ Loaded 469,252 rows
  ✓ Memory: 344.5 MB
  ✓ Type: <class 'polars.dataframe.frame.DataFrame'>

[2/4] Loading Companies Dimension from S3...
  ✓ Loaded 21 rows

[3/4] Loading Sections Dimension from S3...


INFO     | finrag_ml_tg1.loaders.data_loader_strategy | Loading KPI Fact Data from /tmp cache
INFO     | finrag_ml_tg1.loaders.data_loader_strategy |   ✓ Loaded 6,207 rows from cache
INFO     | finrag_ml_tg1.rag_modules_src.synthesis_pipeline.orchestrator | answer_query called: 'What was Microsoft's revenue in 2021?...'
INFO     | finrag_ml_tg1.loaders.data_loader_factory | Creating data loader: mode=S3_STREAMING
INFO     | finrag_ml_tg1.loaders.data_loader_factory |   → S3StreamingLoader (Lambda or cloud environment)
INFO     | finrag_ml_tg1.loaders.data_loader_strategy | S3StreamingLoader initialized (cache: \tmp\finrag_cache)
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter | EntityAdapter initializing with DataLoader...


  ✓ Loaded 21 rows

[4/4] Loading KPI Fact Data from S3...
  ✓ Loaded 6,207 rows

✅ ALL DATA LOADED FROM S3 SUCCESSFULLY!

TESTING /tmp CACHE BEHAVIOR
Cache directory: C:\Users\joems\AppData\Local\Temp\finrag_cache
Cache exists: False

[Cold Start Test]
  Cold start: 0.00s (download from S3)

[Warm Start Test]
  Warm start: 0.00s (from /tmp cache)

  Speedup: N/A

FULL INTEGRATION TEST - S3StreamingLoader

Query: What was Microsoft's revenue in 2021?

Processing with S3StreamingLoader (all data from S3)...
--------------------------------------------------------------------------------
[DEBUG] ✓ Lambda environment detected: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
[DEBUG] ✓ Lambda environment detected: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env


INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter |   ✓ Loaded 21 companies
  ✓ Loaded 21 sections
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.company_universe | Building CompanyUniverse from DataFrame: 21 rows
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.company_universe | Building indexes from dim with 21 valid rows
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.company_universe | CompanyUniverse initialized from DataFrame: 21 companies, 21 tickers, 21 alias tokens
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.section_universe | Building SectionUniverse from DataFrame: 21 rows
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.section_universe | SectionUniverse initialized from DataFrame: 21 sections
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.section_extractor | SectionExtractor initialized with 75 keyword rules, 22 item patterns, 7 risk topics
INFO     | finrag_ml_tg1.rag_modules_src.metric_pipeline.src

[DEBUG] ✓ Lambda environment detected: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env


INFO     | finrag_ml_tg1.rag_modules_src.rag_pipeline.s3_retriever | ═══════════════════════════════════════════════════════════════
Starting retrieval for: 'What was Microsoft's revenue in 2021?...'
Config: global=True, variants=True
═══════════════════════════════════════════════════════════════
INFO     | finrag_ml_tg1.rag_modules_src.rag_pipeline.s3_retriever | → Generating variants via VariantPipeline...
INFO     | finrag_ml_tg1.rag_modules_src.rag_pipeline.variant_pipeline | Generating variants for: 'What was Microsoft's revenue in 2021?...'
INFO     | finrag_ml_tg1.rag_modules_src.rag_pipeline.variant_generator | Generated 3 variants for query
INFO     | finrag_ml_tg1.rag_modules_src.rag_pipeline.variant_pipeline | ✓ Generated 3 variant queries
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter | EntityAdapter.extract: starting for query='How much revenue did Microsoft generate in 2021?'
INFO     | finrag_ml_tg1.rag_modules_src.entity_adapter.company_extracto

--------------------------------------------------------------------------------

✅ SUCCESS

Answer: I don't have Microsoft's fiscal year 2021 revenue data available. The narrative context provided covers Microsoft's 10-K filings for fiscal years 2016, 2018, 2019, and 2020, but does not include the F...

Cost: $0.0048
Tokens: 3,842 in / 197 out

✅ S3StreamingLoader FULLY VALIDATED

✓ Environment variables cleaned up
✓ Back to normal local mode
