## LLM Answer Evals: 
#### Eval suite on final GOLDP3 Answers - LLM Synthesis using BERT Score and ROUGE.

In [2]:
# ============================================================================
# CELL 1: Setup - Path Resolution & Imports
# ============================================================================

from pathlib import Path
import sys
import logging

# Suppress noisy logs for clean notebook output
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("finrag_ml_tg1").setLevel(logging.INFO)

# Find ModelPipeline root and add to sys.path
current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"ModelPipeline root: {model_root}")
print(f"Notebook location: {Path.cwd()}")

ModelPipeline root: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
Notebook location: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\rag_modules_src\01_Isolation_Test_NBS


In [3]:
# ============================================================================
# One-Time Setup: Download BLEURT-20 Checkpoint
# ============================================================================
import urllib.request
import zipfile
from pathlib import Path

bleurt_cache = Path.home() / ".cache" / "bleurt"
checkpoint_dir = bleurt_cache / "BLEURT-20"

if not checkpoint_dir.exists():
    print("Downloading BLEURT-20 checkpoint (~1GB, one-time only)...")
    
    # Download
    url = "https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip"
    zip_path = bleurt_cache / "BLEURT-20.zip"
    bleurt_cache.mkdir(parents=True, exist_ok=True)
    
    urllib.request.urlretrieve(url, zip_path)
    print(f"Downloaded to: {zip_path}")
    
    # Extract
    print("Extracting checkpoint...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(bleurt_cache)
    
    # Cleanup
    zip_path.unlink()
    print(f"✓ Checkpoint installed at: {checkpoint_dir}")
else:
    print(f"✓ BLEURT-20 checkpoint already exists at: {checkpoint_dir}")

✓ BLEURT-20 checkpoint already exists at: C:\Users\joems\.cache\bleurt\BLEURT-20


In [4]:
## ===========================================================================
## Cache Inspection - HuggingFace Model Cache Size
## ===========================================================================

import os
from pathlib import Path

# HuggingFace cache location
cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
print(f"Cache directory: {cache_dir}")

# List cached models
for model_dir in cache_dir.glob("models--*"):
    model_name = model_dir.name.replace("models--", "").replace("--", "/")
    size_gb = sum(f.stat().st_size for f in model_dir.rglob("*") if f.is_file()) / 1e9
    print(f"{model_name}: {size_gb:.2f} GB")

Cache directory: C:\Users\joems\.cache\huggingface\hub
bert-base-uncased: 0.00 GB
deepset/roberta-base-squad2: 0.50 GB
distilbert-base-cased-distilled-squad: 0.26 GB
distilbert-base-uncased: 0.00 GB
mrm8488/bert-small-finetuned-squadv2: 0.23 GB
roberta-large: 1.42 GB
sentence-transformers/all-MiniLM-L6-v2: 0.09 GB
sentence-transformers/all-mpnet-base-v2: 0.44 GB


In [5]:
# ============================================================================
# CELL 2: Initialize Evaluation Stack
# ============================================================================
"""
Loads all evaluation models. Uses cached models from previous downloads.

Models:
- DeBERTa-XLarge-MNLI (1.4GB) - BERTScore
- all-MiniLM-L6-v2 (80MB) - Cosine Similarity  
- BLEURT-20 (1GB) - BLEURT scores
"""

print("="*80)
print("INITIALIZING EVALUATION STACK")
print("="*80)

# 1. BERTScore
print("\n1/4 Loading BERTScore (DeBERTa-XLarge)...")
from bert_score import score
P, R, F1 = score(["test"], ["test"], lang='en', verbose=False)
print(f"    ✓ BERTScore ready (F1: {F1.item():.3f})")

# 2. Sentence Transformer
print("\n2/4 Loading Sentence Transformer (MiniLM)...")
from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
test_emb = sentence_model.encode("test")
print(f"    ✓ Sentence Transformer ready (dim: {len(test_emb)})")

# 3. ROUGE
print("\n3/4 Loading ROUGE scorer...")
from rouge_score import rouge_scorer
rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
test_rouge = rouge_scorer_obj.score("test", "test")
print(f"    ✓ ROUGE ready (F1: {test_rouge['rougeL'].fmeasure:.3f})")

# 4. BLEURT
print("\n4/4 Loading BLEURT-20...")
from bleurt import score as bleurt_score
checkpoint_path = str(Path.home() / ".cache" / "bleurt" / "BLEURT-20")
bleurt_scorer = bleurt_score.BleurtScorer(checkpoint_path)  # ← Pass full path, not just 'BLEURT-20'
test_bleurt = bleurt_scorer.score(references=["test"], candidates=["test"])
print(f"    ✓ BLEURT ready (score: {test_bleurt[0]:.3f})")

print("\n" + "="*80)
print("ALL MODELS LOADED - EVALUATION STACK READY")
print("="*80)
print("\nReady to evaluate answers!")

INITIALIZING EVALUATION STACK

1/4 Loading BERTScore (DeBERTa-XLarge)...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


    ✓ BERTScore ready (F1: 1.000)

2/4 Loading Sentence Transformer (MiniLM)...

    ✓ Sentence Transformer ready (dim: 384)

3/4 Loading ROUGE scorer...
    ✓ ROUGE ready (F1: 1.000)

4/4 Loading BLEURT-20...






INFO:tensorflow:Reading checkpoint C:\Users\joems\.cache\bleurt\BLEURT-20.


INFO:tensorflow:Reading checkpoint C:\Users\joems\.cache\bleurt\BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: C:\Users\joems\.cache\bleurt\BLEURT-20\sent_piece.model.


INFO:tensorflow:Will load model: C:\Users\joems\.cache\bleurt\BLEURT-20\sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


    ✓ BLEURT ready (score: 0.598)

ALL MODELS LOADED - EVALUATION STACK READY

Ready to evaluate answers!


In [6]:
## ===========================================================================
## Proper DEMO test for BlEURT loading
"""
BLEURT Score Interpretation Guide:
Excellent synthesis: 0.6 - 0.9
Good synthesis: 0.3 - 0.6
Weak synthesis: 0.0 - 0.3
Poor synthesis: Below 0.0
"""
## ===========================================================================

# Test with actual sentences
good_match = bleurt_scorer.score(
    references=["The company's revenue increased significantly"],
    candidates=["The firm's revenue grew substantially"]
)
print(f"Good paraphrase: {good_match[0]:.3f}")  # Should be ~0.8-0.9

bad_match = bleurt_scorer.score(
    references=["The company's revenue increased significantly"],
    candidates=["I like pizza and ice cream"]
)
print(f"Unrelated text: {bad_match[0]:.3f}")  # Should be negative

Good paraphrase: 0.833
Unrelated text: 0.184


---
## Eval !
---

In [7]:
# ============================================================================
# CELL 1: Setup - Path Resolution & Load Gold Test Suite
# ============================================================================

from pathlib import Path
import sys
import logging
import json

# Suppress noisy logs for clean notebook output
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("finrag_ml_tg1").setLevel(logging.INFO)

# Find ModelPipeline root and add to sys.path
current = Path.cwd()
for parent in [current] + list(current.parents):
    if parent.name == "ModelPipeline":
        model_root = parent
        break
else:
    raise RuntimeError("Cannot find 'ModelPipeline' root in path tree")

if str(model_root) not in sys.path:
    sys.path.insert(0, str(model_root))

print(f"✓ ModelPipeline root: {model_root}")
print(f"✓ Notebook location: {Path.cwd()}\n")

# Construct absolute path to gold test suite
gold_path = model_root / "finrag_ml_tg1" / "data_cache" / "qa_manual_exports" / "goldp3_analysis" / "p3_gold_test_suite_31q.json"

if not gold_path.exists():
    raise FileNotFoundError(f"Gold test suite not found at: {gold_path}")

print(f"✓ Gold test suite: {gold_path}\n")

# Load all questions
with gold_path.open("r", encoding="utf-8") as f:
    all_questions = json.load(f)

# Selected question IDs for testing
SELECTED_IDS = [
    "P3V3-Q001",  # Walmart Debt Strategy 2018-2020 (cross-year, medium, 4 evidence)
    "P3V2-Q006",  # Microsoft Intelligent Cloud 2017 (local, medium, 1 evidence)
    "P3V3-Q004",  # Cross-Company Cyber 2009 (cross-company, medium, 3 evidence)
    "P3V3-Q007",  # Tesla Adjusted EBITDA 2022 (local, easy, 1 evidence)
    "P3V3-Q002",  # Meta Regulatory Evolution 2019-2024 (cross-year, hard, 4 evidence)


    "P3V2-Q015",  # Walmart Market/Competitive Risks 2021
                  # local, medium, 1 evidence # COVID-related risks - comprehensive but single long paragraph
    "P3V2-Q007",  # Genworth Regulatory Risks 2019
                  # local, medium, 1 evidence # Massive bullet list (14+ risk cues) - overwhelming detail
    "P3V2-Q013",  # Walmart Operational/Supply Chain Risks 2011
                  # local, hard, 1 evidence # Long narrative about natural disasters and disruptions
    
    "P3V2-Q001",  # Exxon Mobil Total Revenue 2008
                  # local, easy, 1 evidence
                  # BAD: Asks for revenue, answer is cross-reference boilerplate
    "P3V2-Q002",  # Eli Lilly Net Income 2006
                  # local, easy, 1 evidence
                  # BAD: Asks for net income, answer discusses valuation allowance
    "P3V2-Q004",  # Johnson & Johnson Cash Flow 2016
                  # local, easy, 1 evidence
                  # BAD: Asks for cash flow, answer is auditor's opinion statement
]

# Extract selected questions into structured dictionary
test_suite = {}
for q in all_questions:
    qid = q["question_id"]
    if qid in SELECTED_IDS:
        test_suite[qid] = {
            "question_text": q["question_text"],
            "gold_answer": q["answer_text"],
            "answer_type": q["answer_type"],
            "companies": q["company_name"],
            "years": q["years"],
            "retrieval_scope": q["retrieval_scope"],
            "difficulty": q["difficulty"],
            "evidence_count": len(q["evidence_sentence_ids"]),
            "evidence_ids": q["evidence_sentence_ids"],
        }

# Display summary
print("="*80)
print(f"LOADED {len(test_suite)} TEST QUESTIONS")
print("="*80)
for qid in SELECTED_IDS:
    q = test_suite[qid]
    print(f"\n{qid}:")
    print(f"  Companies: {', '.join(q['companies'])}")
    print(f"  Years: {q['years']}")
    print(f"  Scope: {q['retrieval_scope']} | Difficulty: {q['difficulty']} | Evidence: {q['evidence_count']} sentences")

✓ ModelPipeline root: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline
✓ Notebook location: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\rag_modules_src\01_Isolation_Test_NBS

✓ Gold test suite: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 Project\FinSights\ModelPipeline\finrag_ml_tg1\data_cache\qa_manual_exports\goldp3_analysis\p3_gold_test_suite_31q.json

LOADED 11 TEST QUESTIONS

P3V3-Q001:
  Companies: Walmart Inc.
  Years: [2018, 2019, 2020]
  Scope: cross_year | Difficulty: medium | Evidence: 4 sentences

P3V2-Q006:
  Companies: MICROSOFT CORP
  Years: [2017]
  Scope: local | Difficulty: medium | Evidence: 1 sentences

P3V3-Q004:
  Companies: RADIAN GROUP INC, NETFLIX INC, Mastercard Inc
  Years: [2009]
  Scope: cross_company | Difficulty: medium | Evidence: 3 sentences

P3V3-Q007:
  Companies: Tesla, Inc.
  Years: [2022]
  Scope: local | Difficulty: easy | Evidence: 1 sentences

P3V3-Q002:
 

In [8]:
# ============================================================================
# CELL 3: Batch Evaluation - Run Selected Questions & Compute Metrics
# ============================================================================

# CRITICAL: Suppress verbose logging for clean output
import logging
logging.getLogger("finrag_ml_tg1").setLevel(logging.WARNING)
logging.getLogger("tensorflow").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)

import polars as pl
import time
from finrag_ml_tg1.rag_modules_src.synthesis_pipeline.orchestrator import answer_query
from finrag_ml_tg1.rag_modules_src.utilities.evaluation_metrics import evaluate_answer

# Configure which questions to test
TEST_QUESTION_IDS = [
    # TIER 1: EXCELLENT QUESTIONS (5)
    "P3V3-Q001",  # Walmart Debt Strategy 2018-2020 (cross-year, medium, 4 evidence)
    "P3V2-Q006",  # Microsoft Intelligent Cloud 2017 (local, medium, 1 evidence)
    "P3V3-Q004",  # Cross-Company Cyber 2009 (cross-company, medium, 3 evidence)
    "P3V3-Q007",  # Tesla Adjusted EBITDA 2022 (local, easy, 1 evidence)
    "P3V3-Q002",  # Meta Regulatory Evolution 2019-2024 (cross-year, hard, 4 evidence)
    
    # TIER 3: BAD QUESTION (1 - for comparison)
    "P3V2-Q001",  # Exxon Revenue 2008 (BAD: cross-reference answer, likely missing data)
]

# Rate limit protection
SLEEP_BETWEEN_QUERIES = 70  # 70 seconds = safe for any rate limit

print("="*80)
print(f"BATCH EVALUATION: {len(TEST_QUESTION_IDS)} QUESTIONS")
print("="*80)
print(f"Model: Claude Sonnet 4.5")
print(f"Rate limit protection: {SLEEP_BETWEEN_QUERIES}s delay between queries")
print(f"Estimated total runtime: ~{(len(TEST_QUESTION_IDS) * SLEEP_BETWEEN_QUERIES + len(TEST_QUESTION_IDS) * 10)/60:.1f} min\n")

results = []

for i, qid in enumerate(TEST_QUESTION_IDS, 1):
    print(f"\n[{i}/{len(TEST_QUESTION_IDS)}] Processing {qid}...")
    
    # Get question data
    question_data = test_suite[qid]
    query = question_data["question_text"]
    gold_answer = question_data["gold_answer"]
    
    # Run synthesis
    print(f"  - Running synthesis...")
    result = answer_query(
        query=query,
        model_root=model_root,
        include_kpi=True,
        include_rag=True,
        model_key="development_CL_SONN_4_5",  # Sonnet 4.5
        export_context=True,
        export_response=True
    )
    
    # Extract answer
    if result.get('error'):
        print(f"  - ERROR: {result['error']}")
        continue
    
    llm_answer = result['answer']
    llm_meta = result['metadata']['llm']
    
    # Compute evaluation metrics
    print(f"  - Computing metrics...")
    eval_scores = evaluate_answer(
        gold_answer=gold_answer,
        synthesis_answer=llm_answer,
        include_bleurt=True,
        include_timing=True
    )
    
    # Collect results
    results.append({
        "question_id": qid,
        "company": ", ".join(question_data["companies"]),
        "scope": question_data["retrieval_scope"],
        "difficulty": question_data["difficulty"],
        "rouge_l": eval_scores["rouge_l"],
        "bertscore_f1": eval_scores["bertscore_f1"],
        "cosine_sim": eval_scores["cosine_sim"],
        "bleurt": eval_scores["bleurt"],
        "interpretation": eval_scores["interpretation"],
        "eval_time_ms": eval_scores["timing"]["total_ms"],
        "synthesis_tokens": llm_meta["total_tokens"],
        "synthesis_cost": llm_meta["cost"],
    })
    
    print(f"  ✓ BERTScore: {eval_scores['bertscore_f1']:.3f} | BLEURT: {eval_scores['bleurt']:.3f} | Time: {eval_scores['timing']['total_ms']/1000:.1f}s")
    
    # Rate limit protection (skip after last question)
    if i < len(TEST_QUESTION_IDS):
        print(f"  - Cooling down ({SLEEP_BETWEEN_QUERIES}s)...")
        time.sleep(SLEEP_BETWEEN_QUERIES)

# Create DataFrame
df_results = pl.DataFrame(results)

print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80 + "\n")

# Display results
df_results


BATCH EVALUATION: 6 QUESTIONS
Model: Claude Sonnet 4.5
Rate limit protection: 70s delay between queries
Estimated total runtime: ~8.0 min


[1/6] Processing P3V3-Q001...
  - Running synthesis...
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ KPI-JSON: Loaded 527 metric records
✓ KPI-JSON: Unique tickers: 2
✓ KPI-JSON: Year range: 2010-2025
  - Computing metrics...
  ✓ BERTScore: 0.842 | BLEURT: 0.501 | Time: 7.7s
  - Cooling down (70s)...

[2/6] Processing P3V2-Q006...
  - Running synthesis...
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
[DEBUG] ✓ AWS credentials loaded from aws_credentials.env
✓ FilterExtractor initialized with 21 companies
  Using: finrag_dim_companies_21.parquet
✓ FilterExtractor initialized with 21 companies

question_id,company,scope,difficulty,rouge_l,bertscore_f1,cosine_sim,bleurt,interpretation,eval_time_ms,synthesis_tokens,synthesis_cost
str,str,str,str,f64,f64,f64,f64,str,f64,i64,f64
"""P3V3-Q001""","""Walmart Inc.""","""cross_year""","""medium""",0.103,0.842,0.842,0.501,"""Strong similarity""",7675.3,6287,0.029649
"""P3V2-Q006""","""MICROSOFT CORP""","""local""","""medium""",0.127,0.848,0.733,0.437,"""Strong similarity""",2627.4,7394,0.027006
"""P3V3-Q004""","""RADIAN GROUP INC, NETFLIX INC,…","""cross_company""","""medium""",0.088,0.826,0.573,0.438,"""Strong similarity""",3321.2,5946,0.026034
"""P3V3-Q007""","""Tesla, Inc.""","""local""","""easy""",0.099,0.804,0.684,0.444,"""Strong similarity""",2467.1,6262,0.022398
"""P3V3-Q002""","""Meta Platforms, Inc.""","""cross_year""","""hard""",0.066,0.832,0.737,0.449,"""Strong similarity""",2858.2,11541,0.054807
"""P3V2-Q001""","""EXXON MOBIL CORP""","""local""","""easy""",0.109,0.802,0.483,0.409,"""Strong similarity""",2238.0,5976,0.020724


In [9]:

# Summary statistics

print("\n" + "="*80)
print("SUMMARY STATISTICS")
print("="*80)
print(f"Questions Evaluated: {df_results.height}")
print(f"Average BERTScore F1: {df_results['bertscore_f1'].mean():.3f}")
print(f"Average BLEURT: {df_results['bleurt'].mean():.3f}")
print(f"Average ROUGE-L: {df_results['rouge_l'].mean():.3f}")
print(f"Average Cosine Similarity: {df_results['cosine_sim'].mean():.3f}")
print(f"Average Eval Time: {df_results['eval_time_ms'].mean()/1000:.1f}s")
print(f"Total Synthesis Cost: ${df_results['synthesis_cost'].sum():.4f}")
print(f"Total Runtime: ~{(len(TEST_QUESTION_IDS) * SLEEP_BETWEEN_QUERIES + df_results['eval_time_ms'].sum()/1000)/60:.1f} min")


SUMMARY STATISTICS
Questions Evaluated: 6
Average BERTScore F1: 0.826
Average BLEURT: 0.446
Average ROUGE-L: 0.099
Average Cosine Similarity: 0.675
Average Eval Time: 3.5s
Total Synthesis Cost: $0.1806
Total Runtime: ~7.4 min
