## Isolation Tests and Skeletons
#### Post Supply lines. RAG Pipeline Concepts.
#### Expander dedupe things.


In [1]:
from pathlib import Path
import sys
import logging

logging.getLogger().setLevel(logging.WARNING)

current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"✓ Model root on sys.path: {model_root}")


METRIC_DATA_JSON = model_root / "finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json"
DIM_COMPANIES = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet"
DIM_SECTIONS = model_root / "finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet"
print(f"✓ Metric data JSON path: {METRIC_DATA_JSON}")
print(f"✓ Dimension companies path: {DIM_COMPANIES}")
print(f"✓ Dimension sections path: {DIM_SECTIONS}")

✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
✓ Metric data JSON path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\rag_modules_src\metric_pipeline\data\downloaded_data.json
✓ Dimension companies path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\data_cache\dimensions\finrag_dim_companies_21.parquet
✓ Dimension sections path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\data_cache\dimensions\finrag_dim_sec_sections.parquet


In [2]:
"""
Edge Case Discovery with sentence_pos join from Stage 3
"""

from pathlib import Path
import sys
import polars as pl

# Setup
current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

# ════════════════════════════════════════════════════════════════════════════
# Load Stage 2 Meta + Stage 3 (for sentence_pos)
# ════════════════════════════════════════════════════════════════════════════
meta_path = model_root / "finrag_ml_tg1/data_cache/meta_embeds/finrag_fact_sentences_meta_embeds.parquet"
stage3_path = model_root / "finrag_ml_tg1/data_cache/stage3_s3vectors/cohere_1024d/finrag_embeddings_s3vectors_cohere_1024d.parquet"

print("Loading tables...")
meta_df = pl.read_parquet(meta_path)
stage3_df = pl.read_parquet(stage3_path)

print(f"✓ Stage 2 Meta: {len(meta_df):,} rows")
print(f"✓ Stage 3 S3V:  {len(stage3_df):,} rows\n")

# ════════════════════════════════════════════════════════════════════════════
# Join sentence_pos from Stage 3
# ════════════════════════════════════════════════════════════════════════════
print("Joining sentence_pos from Stage 3...")

# Join on sentenceID only (embedding_id in Stage 3, not in hit context)
meta_with_pos = meta_df.join(
    stage3_df.select(['sentenceID', 'sentence_pos']).unique(subset=['sentenceID'], keep='first'),
    on='sentenceID',
    how='left'
)

# Check join success
missing_pos = meta_with_pos.filter(pl.col('sentence_pos').is_null()).height
print(f"✓ Joined sentence_pos")
print(f"  Sentences with pos: {len(meta_with_pos) - missing_pos:,}")
print(f"  Sentences without pos: {missing_pos:,}\n")

# ════════════════════════════════════════════════════════════════════════════
# Find NVIDIA 2018 ITEM_1A
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("Section Overview (NVIDIA 2018 ITEM_1A)")
print("="*80)

nvda_2018_1a = meta_with_pos.filter(
    (pl.col('cik_int') == 1045810) &
    (pl.col('report_year') == 2018) &
    (pl.col('section_name') == 'ITEM_1A') &
    (pl.col('sentence_pos').is_not_null())  # Must have position
)

if len(nvda_2018_1a) == 0:
    print("✗ No NVIDIA 2018 ITEM_1A, trying MSFT...")
    nvda_2018_1a = meta_with_pos.filter(
        (pl.col('cik_int') == 789019) &
        (pl.col('report_year') == 2018) &
        (pl.col('section_name') == 'ITEM_1A') &
        (pl.col('sentence_pos').is_not_null())
    )

stats = nvda_2018_1a.select([
    pl.col('name').first().alias('company'),
    pl.col('cik_int').first().alias('cik'),
    pl.count().alias('total_sentences'),
    pl.col('sentence_pos').min().alias('min_pos'),
    pl.col('sentence_pos').max().alias('max_pos'),
    pl.col('section_sentence_count').first().alias('section_count')
])

print(stats)
print()

max_pos = stats['max_pos'][0]
section_count = stats['section_count'][0]

print(f"Position range: [0, {max_pos}]")
print(f"Section count field: {section_count}\n")

# ════════════════════════════════════════════════════════════════════════════
# Select edge case sentences
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("Edge Case Sentence Selection")
print("="*80)

pos_first = 0
pos_near_start = min(2, max_pos)  # pos=2 or less if section tiny
pos_middle = max_pos // 2
pos_near_end = max(0, max_pos - 2)
pos_last = max_pos

print(f"\nTarget positions:")
print(f"  1. First:      pos={pos_first}")
print(f"  2. Near-start: pos={pos_near_start}")
print(f"  3. Middle:     pos={pos_middle}")
print(f"  4. Near-end:   pos={pos_near_end}")
print(f"  5. Last:       pos={pos_last}\n")

# Get sentences
edge_cases = nvda_2018_1a.filter(
    pl.col('sentence_pos').is_in([pos_first, pos_near_start, pos_middle, pos_near_end, pos_last])
).select([
    'sentenceID',
    'sentence_pos',
    'section_sentence_count',
    'cik_int',
    'report_year',
    'section_name',
    'name',
    'embedding_id',
    pl.col('sentence').str.slice(0, 100).alias('text_preview')
]).sort('sentence_pos')

print("Edge case sentences found:")
print(edge_cases)
print()

# ════════════════════════════════════════════════════════════════════════════
# Output test data
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("TEST DATA FOR ISOLATION TEST")
print("="*80)

test_ids = edge_cases['sentenceID'].to_list()
test_positions = edge_cases['sentence_pos'].to_list()
test_cik = edge_cases['cik_int'][0]
test_year = edge_cases['report_year'][0]
test_section = edge_cases['section_name'][0]
test_section_count = edge_cases['section_sentence_count'][0]
test_embedding_id = edge_cases['embedding_id'][0]

print("\nCopy this into isolation test:")
print("```python")
print(f"# Test context")
print(f"TEST_CIK = {test_cik}")
print(f"TEST_YEAR = {test_year}")
print(f"TEST_SECTION = '{test_section}'")
print(f"TEST_SECTION_COUNT = {test_section_count}")
print(f"TEST_EMBEDDING_ID = '{test_embedding_id}'")
print()
print(f"# Edge case sentence IDs")
print(f"test_sentence_ids = [")
for sid, pos in zip(test_ids, test_positions):
    # Classify
    if pos == 0:
        case = "First"
    elif pos == max_pos:
        case = "Last"
    elif pos < 3:
        case = "Near-start"
    elif pos > max_pos - 3:
        case = "Near-end"
    else:
        case = "Middle"
    print(f'    "{sid}",  # pos={pos} ({case})')
print("]")
print("```")
print()

print("="*80)
print("✓ Edge case discovery complete")
print(f"✓ Found {len(test_ids)} test sentences covering all edge cases")
print("="*80)

Loading tables...
✓ Stage 2 Meta: 469,252 rows
✓ Stage 3 S3V:  203,076 rows

Joining sentence_pos from Stage 3...
✓ Joined sentence_pos
  Sentences with pos: 203,076
  Sentences without pos: 266,176

Section Overview (NVIDIA 2018 ITEM_1A)
shape: (1, 6)
┌─────────────┬─────────┬─────────────────┬─────────┬─────────┬───────────────┐
│ company     ┆ cik     ┆ total_sentences ┆ min_pos ┆ max_pos ┆ section_count │
│ ---         ┆ ---     ┆ ---             ┆ ---     ┆ ---     ┆ ---           │
│ str         ┆ i32     ┆ u32             ┆ i16     ┆ i16     ┆ u32           │
╞═════════════╪═════════╪═════════════════╪═════════╪═════════╪═══════════════╡
│ NVIDIA CORP ┆ 1045810 ┆ 180             ┆ 1       ┆ 180     ┆ 180           │
└─────────────┴─────────┴─────────────────┴─────────┴─────────┴───────────────┘

Position range: [0, 180]
Section count field: 180

Edge Case Sentence Selection

Target positions:
  1. First:      pos=0
  2. Near-start: pos=2
  3. Middle:     pos=90
  4. Near-end:   

(Deprecated in version 0.20.5)
  pl.count().alias('total_sentences'),


In [4]:
"""
═══════════════════════════════════════════════════════════════════════════════
ISOLATION TEST: SentenceExpander (Visual Table Inspection)
═══════════════════════════════════════════════════════════════════════════════
"""

from pathlib import Path
import sys
import logging

logging.basicConfig(level=logging.WARNING)  # Reduce noise

current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

from finrag_ml_tg1.loaders.ml_config_loader import MLConfig
from finrag_ml_tg1.rag_modules_src.rag_pipeline.sentence_expander import SentenceExpander
from finrag_ml_tg1.rag_modules_src.rag_pipeline.models import S3Hit
import polars as pl

# ════════════════════════════════════════════════════════════════════════════
# TEST DATA
# ════════════════════════════════════════════════════════════════════════════
TEST_CIK = 1045810
TEST_YEAR = 2018
TEST_SECTION = 'ITEM_1A'
TEST_SECTION_COUNT = 180
TEST_EMBEDDING_ID = 'bedrock_cohere_v4_1024d_20251109_1407'

synthetic_hits = [
    S3Hit(
        sentence_id="0001045810_10-K_2018_section_1A_2",
        sentence_id_numsurrogate=100002, embedding_id=TEST_EMBEDDING_ID,
        distance=0.18, cik_int=TEST_CIK, report_year=TEST_YEAR,
        section_name=TEST_SECTION, sic="3674", sentence_pos=2,
        source="filtered", variant_id=0, section_sentence_count=TEST_SECTION_COUNT,
        raw_metadata={}
    ),
    S3Hit(
        sentence_id="0001045810_10-K_2018_section_1A_90",
        sentence_id_numsurrogate=100090, embedding_id=TEST_EMBEDDING_ID,
        distance=0.15, cik_int=TEST_CIK, report_year=TEST_YEAR,
        section_name=TEST_SECTION, sic="3674", sentence_pos=90,
        source="filtered", variant_id=0, section_sentence_count=TEST_SECTION_COUNT,
        raw_metadata={}
    ),
    S3Hit(
        sentence_id="0001045810_10-K_2018_section_1A_178",
        sentence_id_numsurrogate=100178, embedding_id=TEST_EMBEDDING_ID,
        distance=0.22, cik_int=TEST_CIK, report_year=TEST_YEAR,
        section_name=TEST_SECTION, sic="3674", sentence_pos=178,
        source="global", variant_id=0, section_sentence_count=TEST_SECTION_COUNT,
        raw_metadata={}
    ),
    S3Hit(
        sentence_id="0001045810_10-K_2018_section_1A_180",
        sentence_id_numsurrogate=100180, embedding_id=TEST_EMBEDDING_ID,
        distance=0.20, cik_int=TEST_CIK, report_year=TEST_YEAR,
        section_name=TEST_SECTION, sic="3674", sentence_pos=180,
        source="filtered", variant_id=1, section_sentence_count=TEST_SECTION_COUNT,
        raw_metadata={}
    ),
]

for hit in synthetic_hits:
    hit.sources = {hit.source}
    hit.variant_ids = {hit.variant_id}

# ════════════════════════════════════════════════════════════════════════════
# INITIALIZE & RUN
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("SENTENCE EXPANDER ISOLATION TEST")
print("="*80)
print(f"\nTest setup: 4 hits from NVIDIA 2018 ITEM_1A (window_size=±3)\n")

config = MLConfig()
expander = SentenceExpander(config)

# Get records BEFORE dedup (for window inspection)
all_records_before_dedup = expander._expand_windows(synthetic_hits)

# Get final result AFTER dedup
unique_sentences = expander._deduplicate_sentences(all_records_before_dedup)

# ════════════════════════════════════════════════════════════════════════════
# TABLE 1-4: Individual Hit Windows (BEFORE Dedup)
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("WINDOW EXPANSION (Before Deduplication)")
print("="*80)
print()

for i, hit in enumerate(synthetic_hits, start=1):
    print(f"\n{'─'*80}")
    print(f"HIT {i}: pos={hit.sentence_pos}, distance={hit.distance:.2f}, "
          f"source={hit.source}, variant_id={hit.variant_id}")
    print(f"{'─'*80}")
    
    # Filter records from this hit
    hit_records = [r for r in all_records_before_dedup 
                   if r.parent_hit_distance == hit.distance]
    
    # Convert to DataFrame
    hit_df = pl.DataFrame([
        {
            "sentence_id": r.sentence_id[-4:],  # Last 4 chars (just the position part)
            "pos": r.sentence_pos,
            "is_core": r.is_core_hit,
            "distance": r.parent_hit_distance,
            "sources": ",".join(sorted(r.sources)),
            "variant_ids": ",".join(map(str, sorted(r.variant_ids))),
            "text_preview": r.text[:60] + "..." if len(r.text) > 60 else r.text
        }
        for r in hit_records
    ]).sort("pos")
    
    print(hit_df)
    print(f"\nWindow stats: {len(hit_df)} sentences, "
          f"core at pos={hit.sentence_pos}")

print("\n" + "="*80)
print()

# ════════════════════════════════════════════════════════════════════════════
# TABLE 5: Final Output (AFTER Dedup)
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("FINAL OUTPUT (After Deduplication)")
print("="*80)
print()

final_df = pl.DataFrame([
    {
        "sentence_id": s.sentence_id[-4:],  # Last 4 chars
        "pos": s.sentence_pos,
        "is_core": s.is_core_hit,
        "distance": s.parent_hit_distance,
        "sources": ",".join(sorted(s.sources)),
        "variant_ids": ",".join(map(str, sorted(s.variant_ids))),
        "text_preview": s.text[:60] + "..." if len(s.text) > 60 else s.text
    }
    for s in unique_sentences
]).sort("pos")

print(final_df)
print()

# ════════════════════════════════════════════════════════════════════════════
# SIMPLE VALIDATION: No Duplicates
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("VALIDATION: Deduplication Check")
print("="*80)
print()

duplicates = final_df.group_by('sentence_id').agg(pl.len().alias('count')).filter(pl.col('count') > 1)

if len(duplicates) == 0:
    print("✓ NO DUPLICATE SENTENCES - Deduplication working correctly!")
else:
    print(f"✗ FOUND {len(duplicates)} DUPLICATES:")
    print(duplicates)

print()

# ════════════════════════════════════════════════════════════════════════════
# STATS
# ════════════════════════════════════════════════════════════════════════════
print("="*80)
print("SUMMARY")
print("="*80)
print()

core_count = final_df.filter(pl.col('is_core') == True).height
neighbor_count = final_df.filter(pl.col('is_core') == False).height

print(f"Input: {len(synthetic_hits)} S3Hits")
print(f"Output: {len(final_df)} unique sentences")
print(f"  Core hits: {core_count}")
print(f"  Neighbors: {neighbor_count}")
print(f"  Duplicates: {len(duplicates)}")
print()

if len(duplicates) == 0 and core_count == len(synthetic_hits):
    print("✓ TEST PASSED - SentenceExpander working correctly!")
else:
    print("⚠ Review tables above for issues")




INFO - Loading Stage 2 meta table: D:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\data_cache\meta_embeds\finrag_fact_sentences_meta_embeds.parquet


SENTENCE EXPANDER ISOLATION TEST

Test setup: 4 hits from NVIDIA 2018 ITEM_1A (window_size=±3)

[DEBUG] ✓ AWS credentials loaded from aws_credentials.env


INFO - Extracting sentence_pos from sentenceID...
INFO - ✓ Loaded 469,252 sentences with extracted positions
  Valid positions: 469,252
  Failed extraction: 0
INFO - SentenceExpander initialized: window_size=±3 sentences
INFO -   Window expansion stats:
    Total records: 22
    Core hits: 4
    Neighbors: 18
    Avg window size: 5.5 sentences/hit
INFO -   Deduplication complete:
    22 records → 18 unique sentences
    Sentences with multiple versions: 4
    Final composition:
      Core hits: 4
      Neighbors: 14


WINDOW EXPANSION (Before Deduplication)


────────────────────────────────────────────────────────────────────────────────
HIT 1: pos=2, distance=0.18, source=filtered, variant_id=0
────────────────────────────────────────────────────────────────────────────────
shape: (5, 7)
┌─────────────┬─────┬─────────┬──────────┬──────────┬─────────────┬────────────────────────────────┐
│ sentence_id ┆ pos ┆ is_core ┆ distance ┆ sources  ┆ variant_ids ┆ text_preview                   │
│ ---         ┆ --- ┆ ---     ┆ ---      ┆ ---      ┆ ---         ┆ ---                            │
│ str         ┆ i64 ┆ bool    ┆ f64      ┆ str      ┆ str         ┆ str                            │
╞═════════════╪═════╪═════════╪══════════╪══════════╪═════════════╪════════════════════════════════╡
│ 1A_1        ┆ 1   ┆ false   ┆ 0.18     ┆ filtered ┆ 0           ┆ RISK FACTORS In evaluating     │
│             ┆     ┆         ┆          ┆          ┆             ┆ NVI…                           │
│ 1A_2        ┆ 