## Isolation Tests and Skeletons - Part 1.

#### Supply lines and formatting; connecting the lines across Steps 1–4 comes towards the end.

```
1. Supply Line 1 skeleton
2. user_query → EntityAdapter → QueryEmbedderV2 → MetricPipeline → compact KPI text
```

In [1]:
# CELL 1 — Global wiring + import/construct smoke tests

from pathlib import Path
import sys
import logging

# Raise the root logger threshold to WARNING.
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

# -------------------------------------------------------------------
# 1. Put ModelPipeline on sys.path
# -------------------------------------------------------------------
# Walk from the current working directory up through its ancestors and
# stop at the first directory literally named "ModelPipeline".
current = Path.cwd()
for parent in (current, *current.parents):
    if parent.name != "ModelPipeline":
        continue
    model_root = parent
    break
else:
    # The loop finished without a `break`: no directory in the chain matched.
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

# Prepend the root once; re-running the cell must not add a duplicate entry.
model_root_str = str(model_root)
if model_root_str not in sys.path:
    sys.path.insert(0, model_root_str)

print(f"✓ Model root on sys.path: {model_root}")

# Small helper for structured checks
def check(label, fn):
    """Run ``fn`` as a labelled smoke check and report the outcome on stdout.

    Args:
        label: Text shown next to the pass/fail marker.
        fn: Zero-argument callable to execute.

    Returns:
        Whatever ``fn()`` returns.

    Raises:
        Exception: Any exception raised inside the check is reported with
            its type name and message, then re-raised unchanged.
    """
    try:
        result = fn()
        print(f"  ✓ {label}")
    except Exception as exc:
        print(f"  ✗ {label} -> {type(exc).__name__}: {exc}")
        raise
    return result

# -------------------------------------------------------------------
# 2. Core imports via finrag_ml_tg1 package
# -------------------------------------------------------------------
print("\n[IMPORT CHECKS]")

# 2.1 Loaders / config
from finrag_ml_tg1.loaders.ml_config_loader import MLConfig

# 2.2 Entity adapter package - ALL modules
from finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter import EntityAdapter
from finrag_ml_tg1.rag_modules_src.entity_adapter.company_extractor import CompanyExtractor
from finrag_ml_tg1.rag_modules_src.entity_adapter.company_universe import CompanyUniverse
from finrag_ml_tg1.rag_modules_src.entity_adapter.section_extractor import SectionExtractor
from finrag_ml_tg1.rag_modules_src.entity_adapter.section_universe import SectionUniverse
from finrag_ml_tg1.rag_modules_src.entity_adapter.year_extractor import YearExtractor
from finrag_ml_tg1.rag_modules_src.entity_adapter.metric_adapter import MetricAdapter
from finrag_ml_tg1.rag_modules_src.entity_adapter.models import (
    CompanyInfo,
    CompanyMatches,
    YearMatches,
    MetricMatches,
    SectionMatches,
    RiskMatches,
)
from finrag_ml_tg1.rag_modules_src.entity_adapter.string_utils import simple_fuzzy_match

# 2.3 Metric pipeline - ALL modules
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.filter_extractor import FilterExtractor
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.metric_lookup import MetricLookup
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.pipeline import MetricPipeline
from finrag_ml_tg1.rag_modules_src.metric_pipeline.config.metric_mappings import (
    METRIC_MAPPINGS,
    METRIC_KEYWORDS,
    QUANTITATIVE_INDICATORS,
)

# 2.4 Constants (used by entity_adapter and metric_pipeline)
from finrag_ml_tg1.rag_modules_src.constants.metric_mapping_v2 import (
    METRIC_MAPPINGS as METRIC_MAPPINGS_V2,
    SECTION_KEYWORDS,
    SECTION_ITEM_PATTERNS,
    RISK_TOPIC_KEYWORDS,
)

# 2.5 Utilities
from finrag_ml_tg1.rag_modules_src.utilities.query_embedder_v2 import (
    EmbeddingRuntimeConfig,
    QueryEmbedderV2,
    QueryTooLongError,
    QueryTooShortError,
    QueryOutOfScopeError,
)

# 2.6 Synthesis / orchestration
from finrag_ml_tg1.rag_modules_src.synthesis_pipeline.bedrock_client import BedrockClient
from finrag_ml_tg1.rag_modules_src.synthesis_pipeline import orchestrator as synth_orchestrator

print("  ✓ All module imports succeeded")

# -------------------------------------------------------------------
# 3. Compute all data paths from model_root (once, at top level)
# -------------------------------------------------------------------
print("\n[PATH RESOLUTION]")

# Dimension files
DIM_COMPANIES = model_root.joinpath("finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet")
DIM_SECTIONS = model_root.joinpath("finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet")

# Metric pipeline data
METRIC_DATA_JSON = model_root.joinpath("finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json")

# (display label, path) pairs for every file the later cells depend on.
required_files = (
    ("Company dimension", DIM_COMPANIES),
    ("Section dimension", DIM_SECTIONS),
    ("Metric data JSON", METRIC_DATA_JSON),
)

# Verify all files first so nothing is reported as found before a failure.
for required_label, required_path in required_files:
    assert required_path.exists(), f"Missing: {required_path}"

for required_label, required_path in required_files:
    print(f"  ✓ {required_label}: {required_path.name}")

# -------------------------------------------------------------------
# 4. Object construction smoke tests (no external calls)
# -------------------------------------------------------------------
print("\n[CONSTRUCTION CHECKS]")

# 4.1 MLConfig (loads YAML + env)
# The class itself is the zero-argument callable that check() invokes.
config = check("MLConfig()", MLConfig)

# 4.2 EmbeddingRuntimeConfig from MLConfig
def _make_runtime_cfg():
    """Build the embedding runtime config from MLConfig's ``embedding`` section.

    After construction, ``max_query_chars`` is overridden to 1500.
    """
    runtime = EmbeddingRuntimeConfig.from_ml_config(config.cfg["embedding"])
    runtime.max_query_chars = 1500
    return runtime

runtime_cfg = check("EmbeddingRuntimeConfig.from_ml_config()", _make_runtime_cfg)

# 4.3 EntityAdapter (with explicit paths)
def _make_entity_adapter():
    """Construct the EntityAdapter from the company and section dimension files."""
    return EntityAdapter(company_dim_path=DIM_COMPANIES, section_dim_path=DIM_SECTIONS)

adapter = check("EntityAdapter(company_dim_path, section_dim_path)", _make_entity_adapter)

# 4.4 MetricPipeline (with explicit paths)
def _make_metric_pipeline():
    """Construct the MetricPipeline; both paths are handed over as strings."""
    return MetricPipeline(
        data_path=str(METRIC_DATA_JSON),
        company_dim_path=str(DIM_COMPANIES),
    )

metric_pipeline = check("MetricPipeline(data_path, company_dim_path)", _make_metric_pipeline)

# 4.5 Bedrock runtime client from MLConfig
# The bound method is already a zero-argument callable, so it is passed as-is.
bedrock_runtime_client = check("config.get_bedrock_client()", config.get_bedrock_client)

# 4.6 QueryEmbedderV2 (using shared Bedrock client)
def _make_embedder():
    """Construct the query embedder on top of the Bedrock client created above."""
    return QueryEmbedderV2(runtime_cfg, boto_client=bedrock_runtime_client)

embedder = check("QueryEmbedderV2(runtime_cfg, boto_client)", _make_embedder)

# 4.7 BedrockClient (LLM client for synthesis)
def _make_llm_client():
    """Build the synthesis BedrockClient from ``rag_orchestrator.llm`` settings.

    Any setting missing from the config falls back to a default: the region
    comes from ``config.region``; model id, token limit and temperature use
    the literals below.
    """
    orchestrator_cfg = config.cfg.get("rag_orchestrator", {})
    llm_cfg = orchestrator_cfg.get("llm", {})
    client_kwargs = {
        "region": llm_cfg.get("region", config.region),
        "model_id": llm_cfg.get("model_id", "anthropic.claude-3-sonnet-20240229-v1:0"),
        "max_tokens": llm_cfg.get("max_tokens", 4096),
        "temperature": llm_cfg.get("temperature", 0.7),
    }
    return BedrockClient(**client_kwargs)

llm_client = check("BedrockClient (synthesis LLM)", _make_llm_client)

# -------------------------------------------------------------------
# 5. Summary
# -------------------------------------------------------------------
# This point is only reached when every check above passed, because check()
# re-raises on failure.
print("\n[SUMMARY]")
for summary_line in (
    "All imports validated",
    "All dimension files located",
    "All core objects constructed successfully",
    "No AWS/Bedrock calls were made in this cell",
):
    print(f"  ✓ {summary_line}")
print("\nReady for functional tests in subsequent cells.")

✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline

[IMPORT CHECKS]
  ✓ All module imports succeeded

[PATH RESOLUTION]
  ✓ Company dimension: finrag_dim_companies_21.parquet
  ✓ Section dimension: finrag_dim_sec_sections.parquet
  ✓ Metric data JSON: downloaded_data.json

[CONSTRUCTION CHECKS]
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
  ✓ MLConfig()
  ✓ EmbeddingRuntimeConfig.from_ml_config()
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
  ✓ EntityAdapter(company_dim_path, section_dim_path)
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ Loaded 527 metric records
✓ Unique tickers: 2
✓ Year range: 2010-2025
  ✓ MetricPipeline(data_path, company_dim_path)
  ✓ config.get_bedrock_client()
  ✓ QueryEmbedderV2(runtime_cfg, boto_client)
  ✓ BedrockClient (synthesis LLM)

[SUMMARY]
  ✓ All imports validated
  ✓ All dimension files locat

In [4]:
# CELL 2 — Functional smoke tests (no AWS calls)

print("[FUNCTIONAL TESTS]")
print("Testing core business logic of each pipeline component...\n")

# -------------------------------------------------------------------
# Test 1: EntityAdapter - Full extraction pipeline
# -------------------------------------------------------------------
print("1. EntityAdapter.extract()")
test_query = "What was Apple's revenue in 2023?"
result = adapter.extract(test_query)

# Pull out the three extracted fields once; each must be populated.
extracted_tickers = result.companies.tickers
extracted_years = result.years.years
extracted_metrics = result.metrics.metrics

assert len(extracted_tickers) > 0, "Should extract Apple ticker"
assert 2023 in extracted_years, "Should extract year 2023"
assert len(extracted_metrics) > 0, "Should extract revenue metric"
print(f"   ✓ Extracted: {extracted_tickers}, {extracted_years}, {extracted_metrics[:2]}")

# -------------------------------------------------------------------
# Test 2: MetricPipeline - needs_metric_layer decision logic
# -------------------------------------------------------------------
print("\n2. MetricPipeline.needs_metric_layer()")
quantitative_query = "What is NVDA's revenue in 2023?"
narrative_query = "Tell me about AI trends"

# Assert on truthiness rather than comparing with `== True` / `== False`
# (PEP 8: don't compare boolean values to True or False using ==).
assert metric_pipeline.needs_metric_layer(quantitative_query), "Should activate for quantitative query"
assert not metric_pipeline.needs_metric_layer(narrative_query), "Should skip for narrative query"
print("   ✓ Correctly routes quantitative vs narrative queries")

# -------------------------------------------------------------------
# Test 3: MetricPipeline - Full process() flow (with actual data lookup)
# -------------------------------------------------------------------
print("\n3. MetricPipeline.process()")
metric_query = "What was NVDA revenue in 2023?"
metric_result = metric_pipeline.process(metric_query)

# Keys that must be present whether or not the lookup succeeded.
for required_key in ("success", "query"):
    assert required_key in metric_result, f"Should have '{required_key}' key"

if metric_result['success']:
    # 'stats' is read below, so it is checked alongside the other success
    # keys; a missing key then fails with a clear message, not a KeyError.
    for required_key in ("filters", "data", "count", "stats"):
        assert required_key in metric_result, f"Should have '{required_key}' on success"

    result_filters = metric_result['filters']
    result_stats = metric_result['stats']

    print(f"   ✓ Success! Filters: {result_filters['tickers']}, {result_filters['years']}, {result_filters['metrics'][:2]}")
    print(f"   ✓ Data points returned: {metric_result['count']} records")
    print(f"   ✓ Stats: {result_stats['found_with_values']}/{result_stats['total_combinations']} combinations found")
else:
    # A failed lookup is reported, not asserted on: show the reason and any
    # filters that were extracted.
    print(f"   ⚠ Metric layer not activated: {metric_result.get('reason', 'Unknown reason')}")
    if 'extracted_filters' in metric_result:
        print(f"   Extracted: {metric_result['extracted_filters']}")


# -------------------------------------------------------------------
# Test 4: QueryEmbedderV2 - Validation logic (no Bedrock call)
# -------------------------------------------------------------------
print("\n4. QueryEmbedderV2 validation")
from finrag_ml_tg1.rag_modules_src.utilities.query_embedder_v2 import (
    QueryTooLongError,
)

# Only validate_query() is exercised here, once for rejection and once for
# acceptance. Per the original author's note, validate_query() checks length
# only and scope validation lives in validate_scope(), which needs an
# EntityExtractionResult — it is skipped in this smoke test.

# Too-long query must be rejected.
long_query = "a" * 2000  # Exceeds max_query_chars (1500)
try:
    embedder.validate_query(long_query)
except QueryTooLongError:
    print("   ✓ Correctly rejects too-long queries (>1500 chars)")
else:
    # Raised explicitly instead of `assert False`, which `python -O` strips.
    raise AssertionError("Should reject too-long query")

# A normal-length query must pass validation.
valid_query = "What was Apple's revenue in 2023?"
embedder.validate_query(valid_query)  # Should not raise
print("   ✓ Accepts valid query length")



# -------------------------------------------------------------------
# Test 5: CompanyUniverse - Lookup operations
# -------------------------------------------------------------------
print("\n5. CompanyUniverse lookups")

company_universe = adapter.company_universe

# Ticker lookups: each record must exist and carry the expected CIK.
# Company names are printed but deliberately not asserted on.
expected_aapl_cik = 320193
aapl = company_universe.get_by_ticker("AAPL")
assert aapl is not None, "Should find Apple by ticker"
assert aapl.cik_int == expected_aapl_cik, f"Expected CIK {expected_aapl_cik} for AAPL, got {aapl.cik_int}"
print(f"   ✓ AAPL lookup: {aapl.name} (CIK: {aapl.cik_int})")

expected_nvda_cik = 1045810
nvda = company_universe.get_by_ticker("NVDA")
assert nvda is not None, "Should find NVIDIA by ticker"
assert nvda.cik_int == expected_nvda_cik, f"Expected CIK {expected_nvda_cik} for NVDA, got {nvda.cik_int}"
print(f"   ✓ NVDA lookup: {nvda.name} (CIK: {nvda.cik_int})")

# Reverse direction: CIK back to ticker.
apple_by_cik = company_universe.get_by_cik_int(expected_aapl_cik)
assert apple_by_cik is not None, "Should find Apple by CIK"
assert apple_by_cik.ticker == "AAPL", f"Expected ticker AAPL, got {apple_by_cik.ticker}"
print(f"   ✓ CIK {expected_aapl_cik} lookup: {apple_by_cik.ticker}")

# The universe is expected to hold exactly 21 companies.
expected_company_count = 21
total_companies = len(company_universe.ciks_int)
assert total_companies == expected_company_count, f"Expected {expected_company_count} companies, got {total_companies}"
print(f"   ✓ Universe contains {total_companies} companies")



# -------------------------------------------------------------------
# Test 6: SectionUniverse - Canonical section validation
# -------------------------------------------------------------------
print("\n6. SectionUniverse validation")

section_universe = adapter.section_universe

# Core section codes that must be present.
for core_section in ("ITEM_7", "ITEM_1A", "ITEM_1"):
    assert section_universe.has(core_section), f"Should have {core_section}"
print("   ✓ Core sections (ITEM_1, ITEM_1A, ITEM_7) validated")

# An unknown code must not be reported as present.
assert not section_universe.has("INVALID_SECTION"), "Should reject invalid section"
print("   ✓ Rejects invalid section codes")

# filter_existing() must keep the two real codes and drop the two fake ones.
test_sections = ["ITEM_7", "INVALID", "ITEM_1A", "FAKE_ITEM"]
valid_sections = section_universe.filter_existing(test_sections)
for kept_section in ("ITEM_7", "ITEM_1A"):
    assert kept_section in valid_sections, f"Should keep {kept_section}"
for dropped_section in ("INVALID", "FAKE_ITEM"):
    assert dropped_section not in valid_sections, f"Should remove {dropped_section}"
print(f"   ✓ filter_existing() works: {len(valid_sections)}/4 sections kept")

# Informational only: the total is printed, not asserted.
total_sections = len(section_universe.all_canonical)
print(f"   ✓ Section universe loaded: {total_sections} valid sections")



# -------------------------------------------------------------------
# Summary
# -------------------------------------------------------------------
# Closing banner; only reached when none of the assertions above failed.
summary_rule = "=" * 60
print("\n" + summary_rule)
print("[FUNCTIONAL TEST SUMMARY]")
for component_status in (
    "EntityAdapter: Full NL→entity extraction working",
    "MetricPipeline: Query routing + data lookup working",
    "QueryEmbedderV2: Validation logic working",
    "CompanyUniverse: Ticker/CIK lookups working",
    "SectionUniverse: Section validation working",
):
    print(f"  ✓ {component_status}")
print(summary_rule)
print("\nAll functional smoke tests passed!")
print("Safe to proceed with AWS-dependent tests (embeddings, vector search, LLM calls).")

[FUNCTIONAL TESTS]
Testing core business logic of each pipeline component...

1. EntityAdapter.extract()
   ✓ Extracted: ['AAPL'], [2023], ['income_stmt_Revenue']

2. MetricPipeline.needs_metric_layer()
   ✓ Correctly routes quantitative vs narrative queries

3. MetricPipeline.process()
   ✓ Success! Filters: ['NVDA'], [2023], ['income_stmt_Revenue']
   ✓ Data points returned: 1 records
   ✓ Stats: 1/1 combinations found

4. QueryEmbedderV2 validation
   ✓ Correctly rejects too-long queries (>1500 chars)
   ✓ Accepts valid query length
   ✓ Correctly rejects too-long queries
   ✓ Accepts valid query length

5. CompanyUniverse lookups
   ✓ AAPL lookup: Apple Inc. (CIK: 320193)
   ✓ NVDA lookup: NVIDIA CORP (CIK: 1045810)
   ✓ CIK 320193 lookup: AAPL
   ✓ Universe contains 21 companies

6. SectionUniverse validation
   ✓ Core sections (ITEM_1, ITEM_1A, ITEM_7) validated
   ✓ Rejects invalid section codes
   ✓ filter_existing() works: 2/4 sections kept
   ✓ Section universe loaded: 21 val

```
Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
  ↓           ↓                 ↓              ↓           ↓        ↓
adapter    entities       (optional)      pipeline    formatter  notebook
```

#### CELL 3 — Supply Line 1: Query → Entities → KPI Data → Formatted Output
#### CELL 3 SUPPLY LINE 1: v2.

In [5]:
# CELL 3 — Supply Line 1: Query → Entities → KPI Data → Formatted Output
print("[SUPPLY LINE 1 - KPI FACT TABLE PIPELINE]")
print("Testing: user_query → EntityAdapter → MetricPipeline → formatted text\n")

from finrag_ml_tg1.rag_modules_src.utilities.supply_line_formatters import (
    format_analytical_compact,
    format_value_compact,
)
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.pipeline import MetricPipeline

# Data locations, all resolved relative to model_root.
METRIC_DATA_JSON = model_root.joinpath("finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json")
DIM_COMPANIES = model_root.joinpath("finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet")
DIM_SECTIONS = model_root.joinpath("finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet")

# Build a MetricPipeline for this cell; both paths are handed over as strings.
metric_pipeline = MetricPipeline(
    data_path=str(METRIC_DATA_JSON),
    company_dim_path=str(DIM_COMPANIES),
)
print("✓ MetricPipeline initialized\n")

# Test Query
test_query = "What were Microsoft's and NVIDIA's total assets and revenue from 2021 to 2023?"
print(f"Query: {test_query}\n")

# Step 1: run the entity adapter and show what it extracted.
print("[Step 1: Entity Extraction]")
entities = adapter.extract(test_query)
print(f"  Companies: {entities.companies.tickers}")
print(f"  Years: {entities.years.years}")
print(f"  Metrics: {entities.metrics.metrics[:3]}...")  # preview capped at three entries

# Step 2: hand the same raw query string to the metric pipeline.
print("\n[Step 2: KPI Fact Table Lookup]")
pipeline_result = metric_pipeline.process(test_query)

if not pipeline_result['success']:
    print(f"  ✗ Failed: {pipeline_result.get('reason', 'Unknown error')}")
else:
    print(f"  ✓ Success: {pipeline_result['count']} data points")
    lookup_stats = pipeline_result['stats']
    print(f"  Coverage: {lookup_stats['found_with_values']}/{lookup_stats['total_combinations']}")

# Step 3: the formatter is called with the pipeline result whether or not
# the lookup succeeded; a falsy return is reported as "no data".
print("\n[Step 3: Format Output]")
compact_kpi_data = format_analytical_compact(pipeline_result)

if not compact_kpi_data:
    print("(No KPI data available)")
else:
    divider = "-" * 70
    print("Compact KPI Data:")
    print(divider)
    print(compact_kpi_data)
    print(divider)

    # Rough size estimate: character count integer-divided by 4.
    token_estimate = len(compact_kpi_data) // 4
    print(f"\nToken estimate: ~{token_estimate} tokens")

# -------------------------------------------------------------------
# Summary
# -------------------------------------------------------------------
# Report each stage according to what actually happened above, instead of
# printing "working" unconditionally. The lines for a fully successful run
# are unchanged.
print("\n[SUPPLY LINE 1 COMPLETE]")
stage_outcomes = (
    # (ok?, line when ok, line when not ok)
    (
        bool(entities.companies.tickers),
        "✓ Entity extraction working",
        "✗ Entity extraction found no companies",
    ),
    (
        bool(pipeline_result['success']),
        "✓ KPI fact table lookup working",
        "✗ KPI fact table lookup failed",
    ),
    (
        bool(compact_kpi_data),
        "✓ Formatting utilities working",
        "✗ Formatting produced no KPI text",
    ),
)
for stage_ok, ok_line, failed_line in stage_outcomes:
    print(ok_line if stage_ok else failed_line)

[SUPPLY LINE 1 - KPI FACT TABLE PIPELINE]
Testing: user_query → EntityAdapter → MetricPipeline → formatted text

✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ Loaded 527 metric records
✓ Unique tickers: 2
✓ Year range: 2010-2025
✓ MetricPipeline initialized

Query: What were Microsoft's and NVIDIA's total assets and revenue from 2021 to 2023?

[Step 1: Entity Extraction]
  Companies: ['MSFT', 'NVDA']
  Years: [2021, 2022, 2023]
  Metrics: ['balance_sheet_Total Assets', 'income_stmt_Revenue']...

[Step 2: KPI Fact Table Lookup]
  ✓ Success: 8 data points
  Coverage: 8/12

[Step 3: Format Output]
Compact KPI Data:
----------------------------------------------------------------------
MSFT 2021: Total Assets=$333.8B
MSFT 2022: Total Assets=$364.8B
MSFT 2023: Total Assets=$412.0B
NVDA 2021: Total Assets=$28.8B
NVDA 2022: Total Assets=$44.2B, Revenue=$26.9B
NVDA 2023: Total Assets=$41.2B, Revenue=$27.0B
-------------------------------------------

In [None]:
from pathlib import Path
import sys

# 1. Put ModelPipeline on sys.path
# Walk from the current working directory up through its ancestors and
# stop at the first directory literally named "ModelPipeline".
current = Path.cwd()
for parent in (current, *current.parents):
    if parent.name != "ModelPipeline":
        continue
    model_root = parent
    break
else:
    # The loop finished without a `break`: no directory in the chain matched.
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

# Prepend the root once; re-running the cell must not add a duplicate entry.
model_root_str = str(model_root)
if model_root_str not in sys.path:
    sys.path.insert(0, model_root_str)
print(f"✓ Model root on sys.path: {model_root}")


## SUPPLY LINE 1: ENTITY-RESULT CHAINING. DEMO. Query → Extract Entities → Validate/Embed → Get KPI Data → Format → Display
from finrag_ml_tg1.rag_modules_src.utilities.supply_line_formatters import format_analytical_compact
from finrag_ml_tg1.rag_modules_src.metric_pipeline.src.pipeline import MetricPipeline
from finrag_ml_tg1.rag_modules_src.entity_adapter.entity_adapter import EntityAdapter

# Data locations, all resolved relative to model_root.
METRIC_DATA_JSON = model_root.joinpath("finrag_ml_tg1/rag_modules_src/metric_pipeline/data/downloaded_data.json")
DIM_COMPANIES = model_root.joinpath("finrag_ml_tg1/data_cache/dimensions/finrag_dim_companies_21.parquet")
DIM_SECTIONS = model_root.joinpath("finrag_ml_tg1/data_cache/dimensions/finrag_dim_sec_sections.parquet")

adapter = EntityAdapter(
    company_dim_path=DIM_COMPANIES,
    section_dim_path=DIM_SECTIONS,
)

metric_pipeline = MetricPipeline(
    data_path=str(METRIC_DATA_JSON),
    company_dim_path=str(DIM_COMPANIES),
)

# The same raw query string goes to the adapter and to the metric pipeline;
# the pipeline result is then compacted for display.
query = "What were Microsoft's and NVIDIA's total assets and revenue from 2021 to 2023?"
entities = adapter.extract(query)
result = metric_pipeline.process(query)
compact = format_analytical_compact(result)

# Header, rule, body (or a placeholder when the formatter returned nothing), rule.
kpi_rule = "-" * 70
print("KPI Data:", kpi_rule, compact or "(no data)", kpi_rule, sep="\n")

✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ Loaded 527 metric records
✓ Unique tickers: 2
✓ Year range: 2010-2025
KPI Data:
----------------------------------------------------------------------
MSFT 2021: Total Assets=$333.8B
MSFT 2022: Total Assets=$364.8B
MSFT 2023: Total Assets=$412.0B
NVDA 2021: Total Assets=$28.8B
NVDA 2022: Total Assets=$44.2B, Revenue=$26.9B
NVDA 2023: Total Assets=$41.2B, Revenue=$27.0B
----------------------------------------------------------------------


In [1]:

from pathlib import Path
import sys

# Walk from the current working directory up through its ancestors and
# stop at the first directory literally named "ModelPipeline".
current = Path.cwd()
for parent in (current, *current.parents):
    if parent.name != "ModelPipeline":
        continue
    model_root = parent
    break
else:
    # The loop finished without a `break`: no directory in the chain matched.
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

# Prepend the root once; re-running the cell must not add a duplicate entry.
model_root_str = str(model_root)
if model_root_str not in sys.path:
    sys.path.insert(0, model_root_str)
print(f"✓ Model root on sys.path: {model_root}")


# Test the updated extractor
from finrag_ml_tg1.rag_modules_src.entity_adapter.year_extractor import YearExtractor

extractor = YearExtractor()

# Range and enumeration phrasings to run through the year extractor.
test_phrases = [
    "between 2020 and 2023",
    "from 2017 through 2020",
    "2015 till 2019",
    "during 2020-2023",
    "revenue in 2021, 2022, and 2023",
]

# One line per phrase: the phrase left-justified to 40 columns, an arrow,
# then the extracted years. Nothing is asserted; this is a visual check.
for phrase in test_phrases:
    result = extractor.extract(phrase)
    print(phrase.ljust(40), "→", result.years)

✓ Model root on sys.path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
between 2020 and 2023                    → [2020, 2021, 2022, 2023]
from 2017 through 2020                   → [2017, 2018, 2019, 2020]
2015 till 2019                           → [2015, 2016, 2017, 2018, 2019]
during 2020-2023                         → [2020, 2021, 2022, 2023]
revenue in 2021, 2022, and 2023          → [2021, 2022, 2023]
