## Isolation Tests and Skeletons
- Grain Validation, Assembler Tests.
- Steps 1 to 9 finale, excluding the LLM call (assembly check).

In [1]:
"""
Grain Validation: Verify (company, year, doc, section, pos) = Unique Sentence

Tests that our sort key is functionally equivalent to sentenceID.
"""

from pathlib import Path
import sys
import polars as pl

# Setup: walk upward from the current working directory until a directory
# named "ModelPipeline" is found, then put it on sys.path so the
# finrag_ml_tg1 package imports below resolve.
current = Path.cwd()
for parent in [current, *current.parents]:
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    # Without this branch a failed search leaves model_root unbound (or, in a
    # notebook, silently reuses a stale value from an earlier cell run).
    raise FileNotFoundError(
        f"Could not locate a 'ModelPipeline' directory at or above {current}"
    )
if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

# ════════════════════════════════════════════════════════════════════════════
# Load Stage 2 Meta + Extract sentence_pos
# ════════════════════════════════════════════════════════════════════════════
from finrag_ml_tg1.rag_modules_src.utilities.sentence_utils import extract_sentence_position

# Stage 2 sentence metadata, cached as parquet beneath the project root.
meta_path = (
    model_root
    / "finrag_ml_tg1/data_cache/meta_embeds/finrag_fact_sentences_meta_embeds.parquet"
)
meta_df = pl.read_parquet(meta_path)

print(f"✓ Loaded Stage 2 Meta: {meta_df.height:,} rows\n")

# Derive the 'sentence_pos' column from 'sentenceID'.
# NOTE(review): the counts below treat -1 as the helper's "could not parse"
# marker — confirm against extract_sentence_position.
meta_df = extract_sentence_position(meta_df, 'sentenceID')

sentence_pos = pl.col('sentence_pos')
valid_positions = meta_df.filter(sentence_pos != -1).height
failed_extractions = meta_df.filter(sentence_pos == -1).height

print("✓ Extracted sentence_pos from sentenceID")
print(f"  Valid positions: {valid_positions:,}")
print(f"  Failed extractions: {failed_extractions:,}\n")

# ════════════════════════════════════════════════════════════════════════════
# VALIDATION 1: Check Uniqueness of (cik, year, doc, section, pos)
# ════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("VALIDATION 1: Grain Uniqueness Check")
print("=" * 80)
print()

# Candidate functional key: one group per distinct key value.
grain_key = [
    'cik_int',
    'report_year',
    'docID',
    'section_name',
    'sentence_pos',
]
rows_per_grain = pl.len().alias('row_count')
ids_per_grain = pl.col('sentenceID').n_unique().alias('unique_sentence_ids')
grain_check = meta_df.group_by(grain_key).agg([rows_per_grain, ids_per_grain])

# Any group holding more than one row means the key is not unique.
duplicates = grain_check.filter(pl.col('row_count') > 1)

print(f"Total unique grain combinations: {grain_check.height:,}")
print(f"Duplicate grains (row_count > 1): {duplicates.height}")
print()

if duplicates.height > 0:
    print("✗ GRAIN HAS DUPLICATES - Multiple sentences map to same key:")
    print(duplicates.sort('row_count', descending=True).head(10))
else:
    print("✓ GRAIN IS UNIQUE - No duplicates found")
    print("  (cik_int, report_year, docID, section_name, sentence_pos) → Unique sentence")

print()

# ════════════════════════════════════════════════════════════════════════════
# VALIDATION 2: Check sentenceID vs Grain Equivalence
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("VALIDATION 2: sentenceID ↔ Grain Bijection")
print("="*80)
print()

# For each sentenceID, count the distinct values of EVERY grain component.
# All five components (cik, year, doc, section, pos) must be covered here:
# leaving out report_year or section_name would let a sentenceID that spans
# two years or two sections pass as "one grain".
sentence_grain_map = meta_df.group_by('sentenceID').agg([
    pl.len().alias('grain_count'),
    pl.col('cik_int').n_unique().alias('unique_ciks'),
    pl.col('report_year').n_unique().alias('unique_years'),
    pl.col('docID').n_unique().alias('unique_docs'),
    pl.col('section_name').n_unique().alias('unique_sections'),
    pl.col('sentence_pos').n_unique().alias('unique_positions')
])

# A sentenceID maps to multiple grains if any single component varies.
multi_grain = sentence_grain_map.filter(
    (pl.col('unique_ciks') > 1) |
    (pl.col('unique_years') > 1) |
    (pl.col('unique_docs') > 1) |
    (pl.col('unique_sections') > 1) |
    (pl.col('unique_positions') > 1)
)

print(f"Total unique sentenceIDs: {len(sentence_grain_map):,}")
print(f"sentenceIDs with multiple grains: {len(multi_grain)}")
print()

if len(multi_grain) == 0:
    print("✓ BIJECTION CONFIRMED - Each sentenceID maps to exactly one grain")
else:
    print("✗ BIJECTION BROKEN - Some sentenceIDs map to multiple grains:")
    print(multi_grain.head(10))

print()

# ════════════════════════════════════════════════════════════════════════════
# VALIDATION 3: Test Sort Order on Sample Data
# ════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("VALIDATION 3: Sort Order Test (Sample Data)")
print("=" * 80)
print()

# Small slice: 2 companies x 2 years x 2 sections, leading sentences only.
sample_ciks = [1045810, 789019]  # NVIDIA, MSFT
sample_years = [2018, 2019]
sample_sections = ['ITEM_1A', 'ITEM_7']

# NOTE(review): `<= 5` keeps positions up to and including 5; that is six
# sentences per section if positions are 0-based — confirm the intended bound.
in_sample = (
    pl.col('cik_int').is_in(sample_ciks)
    & pl.col('report_year').is_in(sample_years)
    & pl.col('section_name').is_in(sample_sections)
    & (pl.col('sentence_pos') != -1)
    & (pl.col('sentence_pos') <= 5)
)
sample = meta_df.filter(in_sample)

# Proposed ordering: company name, year ascending, section, document, position.
sort_order = ['name', 'report_year', 'section_name', 'docID', 'sentence_pos']
sorted_sample = sample.sort(sort_order)

# Show the sort columns plus the id and a 60-character text preview.
preview_columns = [
    'name',
    'report_year',
    'section_name',
    'docID',
    'sentence_pos',
    'sentenceID',
    pl.col('sentence').str.slice(0, 60).alias('text_preview'),
]
print("Sample sentences sorted by (company, year ASC, section, doc, pos):\n")
print(sorted_sample.select(preview_columns).head(20))

print()

# ════════════════════════════════════════════════════════════════════════════
# VALIDATION 4: Simulated Assembly Output
# ════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("VALIDATION 4: Simulated Assembly Format")
print("=" * 80)
print()

# Stand-in for ContextAssembler: walk the sorted rows and emit a header line
# each time the (company, year, section) triple changes.
context_parts = []
active_group = None

for record in sorted_sample.iter_rows(named=True):
    company = record['name']
    year = record['report_year']
    section = record['section_name']
    record_group = (company, year, section)

    if record_group != active_group:
        # Blank separator before every group except the first.
        if context_parts:
            context_parts.append("")
        context_parts.extend([f"=== {company} | {year} | {section} ===", ""])
        active_group = record_group

    # Sentence followed by an empty entry, giving a blank line after the join.
    context_parts.extend([record['sentence'], ""])

assembled_context = "\n".join(context_parts)

rule = "─" * 80
print("Assembled context preview (first 1000 chars):")
print(rule)
print(assembled_context[:1000])
print(rule)
print()

print(f"Total assembled length: {len(assembled_context):,} characters")
print()

# ════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("GRAIN VALIDATION SUMMARY")
print("=" * 80)
print()

# The grain passes only if Validation 1 and Validation 2 both found nothing.
duplicate_count = len(duplicates)
multi_grain_count = len(multi_grain)
all_valid = duplicate_count == 0 and multi_grain_count == 0

if not all_valid:
    print("✗ GRAIN ISSUES DETECTED:")
    if duplicate_count > 0:
        print(f"  • {duplicate_count} duplicate grain combinations")
    if multi_grain_count > 0:
        print(f"  • {multi_grain_count} sentenceIDs map to multiple grains")
    print("  • Investigate before implementing ContextAssembler")
else:
    print("✓ GRAIN VALIDATED:")
    print("  • (cik_int, report_year, docID, section_name, sentence_pos) is UNIQUE")
    print("  • Functionally equivalent to sentenceID")
    print("  • Sort order produces logical chronological grouping")
    print("  • Ready to implement ContextAssembler")

print()
print("=" * 80)

✓ Loaded Stage 2 Meta: 469,252 rows

✓ Extracted sentence_pos from sentenceID
  Valid positions: 469,252
  Failed extractions: 0

VALIDATION 1: Grain Uniqueness Check

Total unique grain combinations: 469,252
Duplicate grains (row_count > 1): 0

✓ GRAIN IS UNIQUE - No duplicates found
  (cik_int, report_year, docID, section_name, sentence_pos) → Unique sentence

VALIDATION 2: sentenceID ↔ Grain Bijection

Total unique sentenceIDs: 469,252
sentenceIDs with multiple grains: 0

✓ BIJECTION CONFIRMED - Each sentenceID maps to exactly one grain

VALIDATION 3: Sort Order Test (Sample Data)

Sample sentences sorted by (company, year ASC, section, doc, pos):

shape: (20, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ name         ┆ report_year ┆ section_nam ┆ docID       ┆ sentence_po ┆ sentenceID  ┆ text_previe │
│ ---          ┆ ---         ┆ e           ┆ ---         ┆ s           ┆ ---         ┆ w           │
│ str          ┆ i64  

═══════════════════════════════════════════════════════════════════════════════

SUPPLY LINE 4: Complete RAG Retrieval Pipeline (Entity → Assembly)

═══════════════════════════════════════════════════════════════════════════════


In [1]:
"""
═══════════════════════════════════════════════════════════════════════════════
SUPPLY LINE 4: Complete RAG Retrieval Pipeline (Entity → Assembly)
═══════════════════════════════════════════════════════════════════════════════

Flow:
    Query 
      → EntityAdapter 
      → QueryEmbedderV2 
      → MetadataFilterBuilder
      → VariantPipeline (internal to S3Retriever)
      → S3VectorsRetriever
      → SentenceExpander
      → ContextAssembler
      → Formatted LLM context

Output: Saves assembled context as .txt file with metadata header
"""

from pathlib import Path
import sys
from datetime import datetime

# Setup: walk upward from the current working directory until a directory
# named "ModelPipeline" is found, then put it on sys.path so the
# finrag_ml_tg1 package imports below resolve.
current = Path.cwd()
for parent in [current, *current.parents]:
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    # Without this branch a failed search leaves model_root unbound (or, in a
    # notebook, silently reuses a stale value from an earlier cell run).
    raise FileNotFoundError(
        f"Could not locate a 'ModelPipeline' directory at or above {current}"
    )
if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"✓ Model root: {model_root}\n")

# ════════════════════════════════════════════════════════════════════════════
# IMPORTS
# ════════════════════════════════════════════════════════════════════════════
from finrag_ml_tg1.loaders.ml_config_loader import MLConfig
from finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter import EntityAdapter
from finrag_ml_tg1.rag_modules_src.utilities.query_embedder_v2 import (
    EmbeddingRuntimeConfig,
    QueryEmbedderV2
)
from finrag_ml_tg1.rag_modules_src.rag_pipeline.metadata_filters import MetadataFilterBuilder
from finrag_ml_tg1.rag_modules_src.rag_pipeline.variant_pipeline import VariantPipeline
from finrag_ml_tg1.rag_modules_src.rag_pipeline.s3_retriever import S3VectorsRetriever
from finrag_ml_tg1.rag_modules_src.rag_pipeline.sentence_expander import SentenceExpander
from finrag_ml_tg1.rag_modules_src.rag_pipeline.context_assembler import ContextAssembler

# ════════════════════════════════════════════════════════════════════════════
# INITIALIZE COMPONENTS
# ════════════════════════════════════════════════════════════════════════════
print("Initializing pipeline components...\n")

# Shared configuration and the Bedrock client it hands out.
config = MLConfig()
bedrock_client = config.get_bedrock_client()

# Dimension tables consumed by the entity adapter.
dimensions_dir = model_root / "finrag_ml_tg1/data_cache/dimensions"
DIM_COMPANIES = dimensions_dir / "finrag_dim_companies_21.parquet"
DIM_SECTIONS = dimensions_dir / "finrag_dim_sec_sections.parquet"

# Step 1 — entity adapter
adapter = EntityAdapter(
    company_dim_path=DIM_COMPANIES,
    section_dim_path=DIM_SECTIONS,
)

# Step 2 — query embedder, configured from the "embedding" config section
embedding_cfg = config.cfg["embedding"]
runtime_cfg = EmbeddingRuntimeConfig.from_ml_config(embedding_cfg)
embedder = QueryEmbedderV2(runtime_cfg, boto_client=bedrock_client)

# Step 3 — metadata filter builder
filter_builder = MetadataFilterBuilder(config)

# Step 4 — variant pipeline (handed to the retriever below)
variant_pipeline = VariantPipeline(config, adapter, embedder, bedrock_client)

# Step 5 — S3 Vectors retriever
retrieval_cfg = config.get_retrieval_config()
retriever = S3VectorsRetriever(
    retrieval_config=retrieval_cfg,
    aws_access_key_id=config.aws_access_key,
    aws_secret_access_key=config.aws_secret_key,
    region=config.region,
    variant_pipeline=variant_pipeline,
)

# Steps 6-7 — sentence expander
expander = SentenceExpander(config)

# Step 10 — context assembler
assembler = ContextAssembler(config)

print("✓ All components initialized\n")

# ════════════════════════════════════════════════════════════════════════════
# QUERY
# ════════════════════════════════════════════════════════════════════════════
# Single multi-company, multi-year, multi-section test question.
query = (
    "In the MD&A and Risk Factors sections, "
    "how did NVIDIA and Microsoft discuss their AI strategy, "
    "competitive positioning, and supply chain risks "
    "between 2017 and 2020?"
)

# Banner, title, banner, the query, then two blank lines (the trailing "\n"
# argument plus print's own line ending).
banner = "=" * 80
print(banner, "QUERY", banner, query, "\n", sep="\n")

# ════════════════════════════════════════════════════════════════════════════
# EXECUTE PIPELINE
# ════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("EXECUTING RETRIEVAL PIPELINE")
print("=" * 80)
print()

# Step 1: pull companies, years and sections out of the query text.
print("→ Step 1: Entity Extraction")
entities = adapter.extract(query)
print(
    f"  ✓ Companies: {entities.companies.tickers}\n"
    f"  ✓ Years: {entities.years.years}\n"
    f"  ✓ Sections: {entities.sections}\n"
)

# Step 2: embed the query together with its extracted entities.
print("→ Step 2: Query Embedding")
base_embedding = embedder.embed_query(query, entities)
embedding_dim = len(base_embedding)
print(f"  ✓ Generated {embedding_dim}-d embedding\n")

# Step 3: build both filter sets that the retriever accepts.
print("→ Step 3: Metadata Filters")
filtered_filters = filter_builder.build_filters(entities)
global_filters = filter_builder.build_global_filters(entities)
print(f"  ✓ Filtered: {filtered_filters}")
print(f"  ✓ Global: {global_filters}\n")

# Steps 4-5: retrieval; query variants are produced inside the retriever.
print("→ Steps 4-5: S3 Vectors Retrieval (with variants)")
bundle = retriever.retrieve(
    base_embedding=base_embedding,
    base_query=query,
    filtered_filters=filtered_filters,
    global_filters=global_filters,
)
print(f"  ✓ Retrieved: {len(bundle.union_hits)} unique hits")
print(f"  ✓ Variants: {len(bundle.variant_queries)}\n")

# Steps 6-7: expand hits and de-duplicate the resulting sentences.
print("→ Steps 6-7: Window Expansion + Deduplication")
unique_sentences = expander.expand_and_deduplicate(bundle.union_hits)
print(f"  ✓ Expanded to: {len(unique_sentences)} unique sentences\n")

# Step 10: assemble the sentences into one context string.
print("→ Step 10: Context Assembly")
context_str = assembler.assemble(unique_sentences)
print("  ✓ Assembled context ready\n")

# ════════════════════════════════════════════════════════════════════════════
# SAVE OUTPUT
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("SAVING OUTPUT")
print("="*80)
print()

# Create output directory (parents=True so a fresh checkout without the
# intermediate folders does not fail here).
output_dir = model_root / "finrag_ml_tg1/rag_modules_src/test_outputs"
output_dir.mkdir(parents=True, exist_ok=True)

# Take ONE timestamp and reuse it, so the filename and the "Date:" line in the
# header can never disagree.
run_time = datetime.now()
timestamp = run_time.strftime("%Y%m%d_%H%M%S")
run_date = run_time.strftime("%Y-%m-%d %H:%M:%S")
output_file = output_dir / f"assembled_context_{timestamp}.txt"

# Count header LINES rather than '===' occurrences: a "=== ... ===" header
# contains the marker twice, so a raw substring count reports double, and it
# would also count any '===' appearing inside sentence text.
# NOTE(review): assumes ContextAssembler emits headers as lines beginning
# with '===' (the format simulated in the grain-validation cell) — confirm.
header_count = sum(
    1 for line in context_str.splitlines() if line.lstrip().startswith("===")
)

# Prepare output with metadata header
output_content = f"""
═══════════════════════════════════════════════════════════════════════════════
FINRAG ASSEMBLED CONTEXT - RETRIEVAL PIPELINE OUTPUT
═══════════════════════════════════════════════════════════════════════════════

Query: {query}

Pipeline Execution:
  Date: {run_date}
  
Entities Extracted:
  Companies: {', '.join(entities.companies.tickers)}
  Years: {', '.join(map(str, entities.years.years))}
  Sections: {', '.join(entities.sections)}

Retrieval Stats:
  S3 Hits Retrieved: {len(bundle.union_hits)}
  Variants Generated: {len(bundle.variant_queries)}
  Unique Sentences After Expansion: {len(unique_sentences)}
  
Context Stats:
  Characters: {len(context_str):,}
  Estimated Tokens: {len(context_str)//4:,}
  Headers: {header_count}

═══════════════════════════════════════════════════════════════════════════════
ASSEMBLED CONTEXT (Ready for LLM)
═══════════════════════════════════════════════════════════════════════════════

{context_str}

═══════════════════════════════════════════════════════════════════════════════
END OF CONTEXT
═══════════════════════════════════════════════════════════════════════════════
"""

# Save to file (UTF-8: the header uses box-drawing characters)
output_file.write_text(output_content, encoding='utf-8')

print("✓ Saved assembled context to:")
print(f"  {output_file}")
print(f"\nFile size: {output_file.stat().st_size:,} bytes")
print()

# ════════════════════════════════════════════════════════════════════════════
# PREVIEW
# ════════════════════════════════════════════════════════════════════════════
# Show only the head of the assembled context; the full text is in the file.
preview_chars = 1000
context_head = context_str[:preview_chars]

print("=" * 80)
print("CONTEXT PREVIEW (First 1000 chars)")
print("=" * 80)
print()
print(context_head)
print("\n[... truncated ...]")
print()

# ════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ════════════════════════════════════════════════════════════════════════════
# Gather the headline counts once, then report them.
company_count = len(entities.companies.ciks_int)
year_count = len(entities.years.years)
variant_count = len(bundle.variant_queries)
hit_count = len(bundle.union_hits)
sentence_count = len(unique_sentences)
char_count = len(context_str)
approx_tokens = char_count // 4  # same chars/4 estimate used in the saved header

print("=" * 80)
print("PIPELINE EXECUTION COMPLETE")
print("=" * 80)
print()
print("✓ Query processed successfully")
print(f"✓ Entities: {company_count} companies, {year_count} years")
print(f"✓ Embeddings: 1 base + {variant_count} variants")
print(f"✓ S3 Retrieval: {hit_count} hits")
print(f"✓ Expansion: {sentence_count} sentences")
print(f"✓ Assembly: {char_count:,} chars (~{approx_tokens:,} tokens)")
print(f"✓ Output saved: {output_file.name}")
print()
print("=" * 80)
print("✓ SUPPLY LINE 4 COMPLETE - RAG PIPELINE WORKING END-TO-END")
print("=" * 80)



✓ Model root: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline

Initializing pipeline components...

[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ All components initialized

QUERY
In the MD&A and Risk Factors sections, how did NVIDIA and Microsoft discuss their AI strategy, competitive positioning, and supply chain risks between 2017 and 2020?


EXECUTING RETRIEVAL PIPELINE

→ Step 1: Entity Extraction
  ✓ Companies: ['MSFT', 'NVDA']
  ✓ Years: [2017, 2018, 2019, 2020]
  ✓ Sections: ['ITEM_7', 'ITEM_1A']

→ Step 2: Query Embedding
  ✓ Generated 1024-d embedding

→ Step 3: Metadata Filters
  ✓ Filtered: {'$and': [{'cik_int': {'$in': [789019, 1045810]}}, {'report_year': {'$in': [2017, 2018, 2019, 2020]}}, {'$or': [{'section_name': {'$eq': 'ITEM_7'}}, {'section_name': {'$eq': 'ITEM_1A'}}]}]}
  ✓ Global: {'$and': [{'cik_int': {'$in': [789019, 1045810]}}, 