# AI Letter Generation Pipeline

A LangGraph-based pipeline for generating client letters using Google Gemini with human-in-the-loop feedback.

## Features

- **Dual Template System**: Combines general guidance + specific letter type templates
- **Looping Workflow**: Iteratively improve templates based on feedback
- **Human-in-the-Loop**: Review evaluation results and provide feedback
- **Session Tracking**: All iterations and data saved for review
- **Hallucination Detection**: Automatic evaluation of generated letters

## Quick Start

1. Create a `.env` file containing `GOOGLE_API_KEY` (optionally also `MODEL`, which defaults to `gemini-2.5-flash`)
2. Run Cell 1 to initialize
3. Run Cell 8 for interactive letter generation
4. Or run Cell 10 for a quick test


In [None]:
# Cell 1: Setup
import os
import subprocess
import sys

# Install dependencies
def install_packages():
    """Install the notebook's third-party dependencies with pip.

    Each package is installed by a separate quiet (`-q`) pip invocation run
    with the current interpreter. `subprocess.check_call` raises
    `CalledProcessError` if any invocation exits with a non-zero status.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    required = ('google-genai', 'langgraph', 'python-docx', 'pdfplumber', 'python-dotenv')
    for name in required:
        subprocess.check_call([*pip_install, name, '-q'])

# Runs when the cell executes, before the third-party imports that follow.
install_packages()

# Import required libraries
from typing import Dict, List, Optional, TypedDict
import re
from pathlib import Path
from dotenv import load_dotenv
from google import genai
from google.genai import types
from langgraph.graph import StateGraph, END

# Load environment variables (load_dotenv must run before the os.getenv calls below)
load_dotenv()

# Configure Google API
# GOOGLE_API_KEY is None when the variable is not set; the status print below reports that.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
MODEL = os.getenv('MODEL', 'gemini-2.5-flash')  # Falls back to this model name when MODEL is unset
# Sampling temperature used by draft_letter.
TEMPERATURE_DRAFT = 0.25
# Initialize the shared Gemini client used by all LLM calls in this notebook
client = genai.Client(api_key=GOOGLE_API_KEY)

# Report the effective configuration without printing the key itself.
print(f"Model configured: {MODEL}")
print(f"API Key loaded: {'Yes' if GOOGLE_API_KEY else 'No'}")

In [107]:
# Cell 2: Helper Functions
import docx
import pdfplumber
import time
import json
from datetime import datetime
import hashlib
from typing import Any

# Global metrics tracking
# Running totals for every LLM call made through provider_llm.
# Token counts are estimates; api_calls_by_type maps a call_type label to a count.
llm_metrics = dict(
    total_api_calls=0,
    total_tokens_in=0,
    total_tokens_out=0,
    total_latency=0.0,
    api_calls_by_type={},
)

def reset_llm_metrics():
    """Replace the global `llm_metrics` with a fresh, zeroed dictionary.

    The global name is rebound to a new dict (the previous dict object is left
    untouched), so call this at the start of an iteration to count from zero.
    """
    global llm_metrics
    llm_metrics = dict(
        total_api_calls=0,
        total_tokens_in=0,
        total_tokens_out=0,
        total_latency=0.0,
        api_calls_by_type={},
    )

def get_llm_metrics_snapshot():
    """Return a timestamped copy of the current global LLM metrics.

    Latencies are rounded to two decimals. The average latency is 0 when no
    calls have been recorded, and the per-type breakdown is a shallow copy so
    later updates to `llm_metrics` do not alter the snapshot.
    """
    taken_at = datetime.now().isoformat()
    call_count = llm_metrics['total_api_calls']
    latency_sum = llm_metrics['total_latency']

    if call_count > 0:
        mean_latency = round(latency_sum / call_count, 2)
    else:
        mean_latency = 0

    snapshot = {'timestamp': taken_at}
    snapshot['total_api_calls'] = call_count
    snapshot['total_tokens_in'] = llm_metrics['total_tokens_in']
    snapshot['total_tokens_out'] = llm_metrics['total_tokens_out']
    snapshot['total_latency_seconds'] = round(latency_sum, 2)
    snapshot['average_latency_seconds'] = mean_latency
    snapshot['api_calls_breakdown'] = dict(llm_metrics['api_calls_by_type'])
    return snapshot

def read_file_smart(path: str) -> str:
    """Read a file's text content, choosing a reader from the file extension.

    The extension is matched case-insensitively, so `REPORT.PDF` is handled
    the same as `report.pdf`.

    Args:
        path: Path to a `.txt`, `.md`, `.docx` or `.pdf` file.

    Returns:
        The extracted text. For `.docx`, the non-blank paragraphs joined by
        newlines; for `.pdf`, the text of each page that yields any, joined by
        newlines. Any other extension returns the placeholder string
        `"[Unsupported format: <suffix>]"` instead of raising.
    """
    path_obj = Path(path)
    suffix = path_obj.suffix.lower()

    if suffix in ('.txt', '.md'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    if suffix == '.docx':
        doc = docx.Document(path)
        # Skip paragraphs that are empty or whitespace-only.
        return '\n'.join(para.text for para in doc.paragraphs if para.text.strip())

    if suffix == '.pdf':
        content = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # extract_text may yield nothing for a page; skip those.
                if text:
                    content.append(text)
        return '\n'.join(content)

    # Report the suffix exactly as it appears on disk.
    return f"[Unsupported format: {path_obj.suffix}]"


def read_case_folder(case_folder: str) -> str:
    """Concatenate the text of every `.docx` and `.pdf` file in a case folder.

    Files are processed in sorted path order. Each file's text is preceded by
    a `--- File: <name> ---` marker line, and sections are separated by a
    blank line.

    Raises:
        ValueError: If the folder does not exist, or contains no `.docx` /
            `.pdf` files.
    """
    folder_path = Path(case_folder)
    if not folder_path.exists():
        raise ValueError(f"Case folder not found: {case_folder}")

    wanted_suffixes = ('.docx', '.pdf')
    sections = [
        f"\n--- File: {entry.name} ---\n{read_file_smart(str(entry))}"
        for entry in sorted(folder_path.glob("*"))
        if entry.suffix in wanted_suffixes
    ]

    if sections:
        return '\n\n'.join(sections)
    raise ValueError(f"No case files found in: {case_folder}")


def read_general_template() -> str:
    """Load the general guidance template from `data/templates/guidance.md`.

    The path is relative to the current working directory.

    Raises:
        ValueError: If the guidance file does not exist.
    """
    guidance_file = Path("data/templates") / "guidance.md"
    if guidance_file.exists():
        return read_file_smart(str(guidance_file))
    raise ValueError(f"General guidance template not found: {guidance_file}")


def read_specific_template(template_type: str = "annual_review") -> str:
    """Load the letter-type-specific template `<template_type>_guidance.md`.

    The file is looked up in `data/templates` relative to the current working
    directory. A missing file is not an error: a warning is printed and an
    empty string is returned.
    """
    specific_path = Path("data/templates") / f"{template_type}_guidance.md"

    if specific_path.exists():
        return read_file_smart(str(specific_path))

    print(f"⚠️  Specific template not found: {specific_path}")
    return ""


def combine_templates(general_template: str, specific_template: str) -> str:
    """Merge the general and specific templates into one guidance string.

    When the specific template is empty, the general template is returned
    unchanged; otherwise the two are joined by a labelled separator.
    """
    if not specific_template:
        return general_template
    separator = "--- SPECIFIC TEMPLATE GUIDANCE ---"
    return f"{general_template}\n\n{separator}\n\n{specific_template}"


def create_session_folder(client_name: str) -> Path:
    """Create `sessions/<client_name>_<YYYYmmdd_HHMMSS>/` with a `templates/` subfolder.

    The path is relative to the current working directory. Returns the session
    directory path.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    session_dir = Path(f"sessions/{client_name}_{stamp}")

    session_dir.mkdir(parents=True, exist_ok=True)
    # Per-iteration template snapshots are stored here.
    (session_dir / "templates").mkdir(exist_ok=True)

    return session_dir


def get_template_hash(template_content: str) -> str:
    """Return the MD5 hex digest of the template text.

    The digest is used only as a change fingerprint between iterations.
    """
    digest = hashlib.md5()
    digest.update(template_content.encode())
    return digest.hexdigest()


def save_template_if_changed(session_dir: Path, iteration: int, template_type: str, 
                           current_content: str, previous_hash: Optional[str] = None) -> tuple[bool, str, Optional[str]]:
    """Save the template for this iteration if it differs from the previous one.

    Args:
        session_dir: Session directory; the file goes in its `templates/` subfolder.
        iteration: Iteration number used in the file name.
        template_type: Label used in the file name (e.g. "general", "specific").
        current_content: Template text to fingerprint and possibly save.
        previous_hash: Hash from the previous iteration. `None` or an empty
            string means there is no previous version, which counts as changed.

    Returns:
        `(changed, new_hash, file_path)`. `file_path` is relative to
        `session_dir`, or `None` when nothing was written.
    """
    current_hash = get_template_hash(current_content)

    if previous_hash and current_hash == previous_hash:
        return False, current_hash, None

    templates_dir = session_dir / "templates"
    # Create the folder on demand so a session directory that was not made by
    # create_session_folder does not fail with FileNotFoundError.
    templates_dir.mkdir(parents=True, exist_ok=True)

    file_path = templates_dir / f"iteration_{iteration}_{template_type}.md"
    with open(file_path, "w") as f:
        f.write(current_content)
    return True, current_hash, str(file_path.relative_to(session_dir))


def save_iteration_data(session_dir: Path, iteration: int, state: Dict) -> None:
    """Save data from the current iteration, including templates and LLM metrics.

    Writes into `session_dir`:
      - `iteration_<n>.json`: evaluation results, user decision and metrics
      - `letter_iteration_<n>.html`: the generated letter
      - `llm_metrics_iteration_<n>.json`: the metrics snapshot on its own
      - template snapshots (via `save_template_if_changed`) when they changed

    All files are written as UTF-8 so letters containing characters outside
    the platform's default encoding do not fail to save.

    Side effect: `state['general_template_hash']` and
    `state['specific_template_hash']` are updated in place so the next
    iteration can detect template changes.
    """
    # Hashes stored by the previous call; absent on the first iteration.
    prev_general_hash = state.get('general_template_hash')
    prev_specific_hash = state.get('specific_template_hash')

    general_changed, general_hash, general_path = save_template_if_changed(
        session_dir, iteration, "general",
        state.get("general_template", ""), prev_general_hash
    )
    specific_changed, specific_hash, specific_path = save_template_if_changed(
        session_dir, iteration, "specific",
        state.get("specific_template", ""), prev_specific_hash
    )

    state['general_template_hash'] = general_hash
    state['specific_template_hash'] = specific_hash

    llm_metrics_snapshot = get_llm_metrics_snapshot()

    def template_file_label(changed, path):
        # An unchanged template after the first iteration points back at the
        # previous iteration instead of naming a file.
        if not changed and iteration > 1:
            return f"Same as iteration {iteration-1}"
        return path

    letter = state.get("letter", "")
    hallucinations = state.get("hallucinations", [])

    data = {
        "iteration": iteration,
        "timestamp": datetime.now().isoformat(),
        "letter_length": len(letter),
        "hallucinations_count": len(hallucinations),
        "hallucinations": hallucinations[:10],  # Only the first 10 are stored
        "evaluation_notes": state.get("evaluation_notes", ""),
        "user_decision": state.get("user_decision", ""),
        "user_feedback": state.get("user_feedback", ""),
        "template_updated": {
            "general": state.get("general_template_updated", False),
            "specific": state.get("specific_template_updated", False)
        },
        "template_files": {
            "general": template_file_label(general_changed, general_path),
            "specific": template_file_label(specific_changed, specific_path)
        },
        "template_changed_this_iteration": {
            "general": general_changed,
            "specific": specific_changed
        },
        "evaluation": state.get("evaluation", {}),
        "llm_metrics": llm_metrics_snapshot
    }

    with open(session_dir / f"iteration_{iteration}.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    letter_path = session_dir / f"letter_iteration_{iteration}.html"
    with open(letter_path, "w", encoding="utf-8") as f:
        f.write(letter)

    # Metrics are duplicated in their own file for easy analysis.
    metrics_path = session_dir / f"llm_metrics_iteration_{iteration}.json"
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(llm_metrics_snapshot, f, indent=2)

    print(f"💾 Saved iteration {iteration} data to {session_dir.name}")
    changed_templates = []
    if general_changed:
        changed_templates.append("general")
    if specific_changed:
        changed_templates.append("specific")
    if changed_templates:
        print(f"📝 Saved updated templates: {', '.join(changed_templates)}")
    print(f"📊 LLM Metrics: {llm_metrics_snapshot['total_api_calls']} API calls, {llm_metrics_snapshot['total_latency_seconds']}s total latency")


def save_session_summary(session_dir: Path, final_state: Dict) -> None:
    """Write `session_summary.txt`, a plain-text summary of the whole session.

    The summary combines values from `final_state` (client name, template
    type, iteration count, final decision, per-iteration `iterations_data`)
    with the `iteration_<n>.json` and `llm_metrics_iteration_<n>.json` files
    found in `session_dir`. Missing per-iteration files are skipped.

    The summary is written as UTF-8 so client names or feedback containing
    characters outside the platform's default encoding do not fail to save.
    """
    iterations_data = final_state.get("iterations_data", [])
    total_iterations = final_state.get('iteration', 0)

    parts = [f"""Session Summary
===============
Client: {final_state.get('client_name', 'Unknown')}
Template Type: {final_state.get('template_type', 'Unknown')}
Total Iterations: {total_iterations}
Session Folder: {session_dir}

Template Evolution:
"""]

    # Which iterations changed which templates, read back from the saved JSON.
    template_changes = []
    for i in range(1, total_iterations + 1):
        iter_file = session_dir / f"iteration_{i}.json"
        if not iter_file.exists():
            continue
        with open(iter_file, 'r', encoding='utf-8') as f:
            iter_data = json.load(f)
        flags = iter_data.get('template_changed_this_iteration', {})
        changes = [kind for kind in ("general", "specific") if flags.get(kind)]
        if changes:
            template_changes.append(f"  - Iteration {i}: Updated {', '.join(changes)} template(s)")

    if template_changes:
        parts.append("\n".join(template_changes) + "\n")
    else:
        parts.append("  - No template changes during session\n")

    # Per-iteration LLM metrics plus session totals.
    parts.append("\nLLM Performance Metrics:\n")
    total_api_calls = 0
    total_latency = 0.0

    for i in range(1, total_iterations + 1):
        metrics_file = session_dir / f"llm_metrics_iteration_{i}.json"
        if not metrics_file.exists():
            continue
        with open(metrics_file, 'r', encoding='utf-8') as f:
            metrics = json.load(f)
        api_calls = metrics.get('total_api_calls', 0)
        latency = metrics.get('total_latency_seconds', 0)
        total_api_calls += api_calls
        total_latency += latency
        parts.append(f"  - Iteration {i}: {api_calls} API calls, {latency}s latency\n")

    parts.append("\nTotal Session Metrics:\n")
    parts.append(f"  - Total API Calls: {total_api_calls}\n")
    parts.append(f"  - Total Latency: {total_latency:.2f}s\n")
    # The average is omitted when no calls were recorded (avoids division by zero).
    if total_api_calls > 0:
        parts.append(f"  - Average Latency per Call: {(total_latency/total_api_calls):.2f}s\n")

    parts.append("\nIteration Details:\n")

    for i, data in enumerate(iterations_data, 1):
        # user_feedback may be None or empty; show the literal text 'None' then.
        feedback = data.get('user_feedback') or 'None'
        parts.append(f"""
Iteration {i}:
  - Hallucinations: {data.get('hallucinations_count', 0)}
  - User Decision: {data.get('user_decision', 'N/A')}
  - Feedback: {feedback[:100]}...
""")

    final_status = 'Accepted' if final_state.get('user_decision') == 'accept' else 'In Progress'
    final_letter = session_dir / f"letter_iteration_{total_iterations}.html"
    templates_path = session_dir / "templates"
    metrics_glob = session_dir / "llm_metrics_iteration_*.json"

    parts.append(f"""
Final Status: Letter {final_status}
Final Letter: {final_letter}
Final Templates: {templates_path}
LLM Metrics: {metrics_glob}

All iteration files saved in: {session_dir}
""")

    summary_path = session_dir / "session_summary.txt"
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"\n📄 Session summary saved to: {session_dir / 'session_summary.txt'}")


def load_template_from_session(session_dir: Path, iteration: int, template_type: str) -> str:
    """Return the template text that was in effect at the given iteration.

    Looks in `<session_dir>/templates` for `iteration_<n>_<template_type>.md`,
    trying the requested iteration first and then each earlier iteration down
    to 1. Returns an empty string when no version is found.
    """
    templates_root = session_dir / "templates"

    # Requested iteration first, then the most recent earlier ones.
    candidates = [iteration, *range(iteration - 1, 0, -1)]
    for number in candidates:
        candidate_path = templates_root / f"iteration_{number}_{template_type}.md"
        if candidate_path.exists():
            with open(candidate_path, 'r') as f:
                return f.read()

    return ""


def compare_templates(template1: str, template2: str) -> str:
    """Summarise whether two templates differ, by line count only.

    This is not a real diff: it reports the line counts of both versions and
    their difference, or a single "no changes" line when the line lists match.
    """
    before = template1.splitlines()
    after = template2.splitlines()

    if before == after:
        return "No changes detected in template"

    report = (
        "Template differences detected:",
        f"  - Previous version: {len(before)} lines",
        f"  - New version: {len(after)} lines",
        f"  - Line difference: {len(after) - len(before)}",
    )
    return "\n".join(report)


def strip_markdown(text: str) -> str:
    """Remove basic markdown syntax from text.

    Strips ATX headers, `*`/`_` emphasis markers, fenced code blocks (removed
    entirely), inline-code backticks and link targets, then trims surrounding
    whitespace.

    Underscore emphasis is only stripped when the underscores are not glued to
    word characters, so identifiers such as `client_first_name` are preserved.
    """
    # Remove headers
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)
    # Remove bold/italic written with asterisks
    text = re.sub(r'\*{1,2}([^*]+)\*{1,2}', r'\1', text)
    # Remove bold/italic written with underscores. The lookarounds keep
    # intraword underscores (snake_case names) intact.
    text = re.sub(r'(?<!\w)_{1,2}([^_]+)_{1,2}(?!\w)', r'\1', text)
    # Remove fenced code blocks, then unwrap inline code
    text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    # Replace links with their link text
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    return text.strip()


def _extract_candidate_text(candidate: Any) -> str:
    """Return the generated text of a response candidate, trying several shapes."""
    result_text = None

    # Method 1: first entry of content.parts
    content = getattr(candidate, 'content', None)
    if content:
        parts = getattr(content, 'parts', None)
        if parts:
            first_part = parts[0]
            # A part without a .text attribute is stringified instead.
            result_text = first_part.text if hasattr(first_part, 'text') else str(first_part)

    # Method 2: direct text attribute
    if not result_text and hasattr(candidate, 'text'):
        result_text = candidate.text

    # Method 3: last resort, stringify the candidate
    if not result_text:
        result_text = str(candidate)

    if not result_text or result_text == "None":
        raise ValueError("Could not extract text from API response")
    return result_text


def _parse_json_text(result_text: str) -> Any:
    """Parse a JSON response, falling back to the outermost `{...}` span."""
    cleaned = result_text.strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"⚠️  Failed to parse JSON response: {e}")
        print(f"   Raw text: {cleaned[:500]}...")
        # The model may have wrapped the JSON in extra text.
        match = re.search(r'\{.*\}', cleaned, re.DOTALL)
        if match:
            return json.loads(match.group())
        raise ValueError(f"Invalid JSON response: {cleaned[:200]}...") from e


def provider_llm(**kwargs) -> Any:
    """Create a Google Gemini completion with structured output support.

    Keyword Args:
        messages: List of `{"role", "content"}` dicts. Only the first message
            is used, and only when its role is `'user'`; otherwise the whole
            list is stringified and sent as the prompt.
        temperature: Sampling temperature (default 0.4).
        response_format: Optional response schema. When given, JSON output is
            requested and the parsed JSON is returned.
        system_prompt: Optional text prepended to the prompt.
        call_type: Label used to bucket the call in `llm_metrics`
            (default `'general'`).

    Returns:
        The response text, or the parsed JSON value when `response_format`
        is set.

    Raises:
        Exception: `"Gemini API error: ..."` for any failure (API error, empty
            response, unparseable JSON). The original error is chained.

    Every call updates the global `llm_metrics` exactly once: as a success
    under `call_type`, or as a failure under `<call_type>_failed`. Token
    counts are rough estimates (about 4 characters per token).
    """
    messages = kwargs.get('messages', [])
    temperature = kwargs.get('temperature', 0.4)
    response_format = kwargs.get('response_format')
    system_prompt = kwargs.get('system_prompt', '')
    call_type = kwargs.get('call_type', 'general')

    # Convert messages to a single prompt string
    if messages and messages[0]['role'] == 'user':
        prompt = messages[0]['content']
    else:
        prompt = str(messages)

    if system_prompt:
        prompt = f"{system_prompt}\n\n{prompt}"

    # Estimate input tokens (rough approximation: 1 token ≈ 4 chars)
    input_tokens = len(prompt) // 4

    print(f"🤖 Calling Gemini API (temp={temperature}, structured={response_format is not None})...")
    start_time = time.time()

    try:
        # No max token limit is set in either mode.
        config_args = {'temperature': temperature, 'candidate_count': 1}
        if response_format:
            # JSON mode for structured output
            config_args['response_mime_type'] = "application/json"
            config_args['response_schema'] = response_format

        response = client.models.generate_content(
            model=MODEL,
            contents=prompt,
            config=types.GenerateContentConfig(**config_args)
        )

        if not response.candidates:
            raise ValueError("No candidates in API response")

        result_text = _extract_candidate_text(response.candidates[0])

        elapsed = time.time() - start_time
        output_tokens = len(result_text) // 4  # Rough approximation

        # Parse BEFORE recording success: a parse failure then falls through to
        # the except block and is counted once (as failed), not twice.
        result = _parse_json_text(result_text) if response_format else result_text

        llm_metrics['total_api_calls'] += 1
        llm_metrics['total_tokens_in'] += input_tokens
        llm_metrics['total_tokens_out'] += output_tokens
        llm_metrics['total_latency'] += elapsed
        llm_metrics['api_calls_by_type'][call_type] = llm_metrics['api_calls_by_type'].get(call_type, 0) + 1

        print(f"✅ API call completed in {elapsed:.2f}s (in: ~{input_tokens} tokens, out: ~{output_tokens} tokens)")

        return result

    except Exception as e:
        elapsed = time.time() - start_time
        # Failed calls still count toward total calls and latency.
        failed_key = f"{call_type}_failed"
        llm_metrics['total_api_calls'] += 1
        llm_metrics['total_latency'] += elapsed
        llm_metrics['api_calls_by_type'][failed_key] = llm_metrics['api_calls_by_type'].get(failed_key, 0) + 1

        print(f"❌ API call failed after {elapsed:.2f}s: {type(e).__name__}: {str(e)}")

        # Log more details for debugging
        if hasattr(e, 'response'):
            print(f"   Response status: {getattr(e.response, 'status_code', 'N/A')}")

        # Re-raise with the original error chained for debugging.
        raise Exception(f"Gemini API error: {str(e)}") from e


def test_api():
    """Send a trivial prompt through provider_llm to check API connectivity.

    Returns True when a response comes back, False on any exception (the
    error is printed rather than raised).
    """
    print("🧪 Testing Gemini API connection...")
    hello_request = [{"role": "user", "content": "Say 'Hello World' and nothing else."}]
    try:
        reply = provider_llm(messages=hello_request, temperature=0, call_type='test')
        print(f"✅ API test successful! Response: {reply.strip()}")
    except Exception as e:
        print(f"❌ API test failed: {e}")
        return False
    return True

In [108]:
# Cell 3: Chain Functions

from pydantic import BaseModel
from typing import List

# Pydantic models for structured outputs
# Structured-output schema for evaluate_letter.
class EvaluationResult(BaseModel):
    # Claims in the letter that are not supported by the source information.
    hallucinations: List[str]
    # The evaluation prompt asks for one of: general, specific, both, none.
    template_needing_improvement: str
    # Overall quality assessment.
    quality_notes: str
    # Read by improve_template_with_ai as input for template changes.
    improvement_suggestions: str


def draft_letter(general_template: str, specific_template: str, case: str) -> str:
    """Draft a client letter (HTML) from the templates and the case content.

    The two templates are merged with `combine_templates`, embedded in a
    prompt together with the case information, and sent through
    `provider_llm` at `TEMPERATURE_DRAFT`. Any failure is logged and re-raised.
    """
    print("📝 Drafting letter...")

    guidance = combine_templates(general_template, specific_template)

    prompt = f"""You are an expert paraplanner writing flawless client letters.

Template/Guidance:
{guidance}

Case Information:
{case}

Generate a professional letter following the template structure and incorporating the case details.
Output only the letter content in HTML format, no explanations."""

    request = [{"role": "user", "content": prompt}]
    try:
        letter_html = provider_llm(
            messages=request,
            temperature=TEMPERATURE_DRAFT,
            call_type='draft_letter'
        )
        print(f"✅ Letter drafted successfully ({len(letter_html)} chars)")
        return letter_html
    except Exception as e:
        print(f"❌ Failed to draft letter: {e}")
        raise


def evaluate_letter(letter: str, source: str, general_template: str, specific_template: str) -> Dict[str, Any]:
    """Evaluate a letter for hallucinations and quality with structured output.

    The request goes through `provider_llm` (like the other chain functions)
    so the call is timed and counted in `llm_metrics` under
    `'evaluate_letter'`. The JSON reply is validated against
    `EvaluationResult`.

    Args:
        letter: The generated letter to audit.
        source: Source/case information the letter must be supported by.
        general_template: General template; only its first 500 characters are
            included in the prompt.
        specific_template: Specific template (may be empty); also truncated to
            500 characters.

    Returns:
        A dict with keys `hallucinations`, `template_needing_improvement`,
        `quality_notes` and `improvement_suggestions`. This function does not
        raise: on any failure it returns an empty hallucination list,
        `template_needing_improvement="unknown"` and the error text in
        `quality_notes`, so callers should check `quality_notes` before
        treating an empty list as a clean result.
    """
    print("🔍 Evaluating letter...")

    prompt = f"""You are a strict auditor evaluating a client letter.

Letter to audit:
{letter}

Source information:
{source}

General template used:
{general_template[:500]}...

Specific template used:
{specific_template[:500] if specific_template else "None"}...

Provide a comprehensive evaluation including:
1. List any claims in the letter that are NOT supported by the source information (hallucinations)
2. Note which template (general or specific) might need improvement (must be one of: general, specific, both, none)
3. Overall quality assessment"""

    try:
        raw_evaluation = provider_llm(
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            response_format=EvaluationResult,
            call_type='evaluate_letter'
        )

        # Validate the parsed JSON against the schema; a malformed reply is
        # handled by the fallback below.
        evaluation = EvaluationResult(**raw_evaluation)

        print(f"✅ Evaluation complete: {len(evaluation.hallucinations)} hallucinations found")

        # Convert to dict for compatibility
        return {
            "hallucinations": evaluation.hallucinations,
            "template_needing_improvement": evaluation.template_needing_improvement,
            "quality_notes": evaluation.quality_notes,
            "improvement_suggestions": evaluation.improvement_suggestions
        }

    except Exception as e:
        # Deliberate best-effort: report the failure in the result instead of raising.
        print(f"❌ Failed to evaluate letter: {e}")
        return {
            "hallucinations": [],
            "template_needing_improvement": "unknown",
            "quality_notes": f"Evaluation failed: {str(e)}",
            "improvement_suggestions": ""
        }


def improve_template_with_ai(template: str, template_type: str, evaluation: Dict, letter: str = None) -> str:
    """Ask the LLM to improve a template using only abstracted evaluation results.

    At most the first five hallucinations are reduced to a generic category
    label (by keyword match) so no letter content reaches the prompt. The
    `letter` argument is accepted but not used.

    Returns the improved template text; failures are logged and re-raised.
    """
    print(f"🔧 AI improving {template_type} template...")

    hallucinations = evaluation.get("hallucinations", [])
    suggestions = evaluation.get("improvement_suggestions", "")

    # First matching keyword group wins; order matters.
    keyword_groups = (
        (('date', 'time', 'when'), "- Date/time related issue"),
        (('name', 'person', 'company'), "- Name/entity related issue"),
        (('amount', 'fee', 'charge', '%'), "- Financial figure related issue"),
    )

    issue_patterns = []
    for item in hallucinations[:5]:
        if isinstance(item, dict):
            # Structured hallucination: report only its declared type.
            issue_patterns.append(f"- {item.get('type', 'unknown')} issue found")
            continue
        lowered = str(item).lower()
        label = next(
            (text for words, text in keyword_groups if any(word in lowered for word in words)),
            "- Unsupported claim issue",
        )
        issue_patterns.append(label)

    pattern_lines = "\n".join(issue_patterns)

    prompt = f"""You are an LLM prompt engineer improving a template.

Template Type: {template_type}
Current template:
{template}

Issues found (WITHOUT specific client details):
- Total hallucinations: {len(hallucinations)}
- Issue patterns:
{pattern_lines}

Improvement suggestions from evaluation:
{suggestions}

IMPORTANT: You are improving the TEMPLATE ONLY. You have NOT seen any actual client letter or specific details.

Improve the template to prevent these types of issues while maintaining its structure and purpose.
Make the template more specific about:
1. What information to extract from case files
2. What NOT to include or make up
3. How to handle missing information

Output only the improved template."""

    request = [{"role": "user", "content": prompt}]
    try:
        improved = provider_llm(
            messages=request,
            temperature=0,
            call_type='improve_template'
        )
        print(f"✅ {template_type.capitalize()} template improved")
        return improved
    except Exception as e:
        print(f"❌ Failed to improve {template_type} template: {e}")
        raise


def improve_template_with_feedback(template: str, template_type: str, feedback: str) -> str:
    """Ask the LLM to revise a template according to free-text user feedback.

    Returns the improved template text; failures are logged and re-raised.
    """
    print(f"🔧 Manually improving {template_type} template...")

    prompt = f"""You are an LLM prompt engineer improving a template based on user feedback.

Template Type: {template_type}
Current template:
{template}

User feedback:
{feedback}

Improve the template based on this feedback while maintaining its structure and purpose.
Output only the improved template."""

    request = [{"role": "user", "content": prompt}]
    try:
        improved = provider_llm(
            messages=request,
            temperature=0,
            call_type='improve_template_manual'
        )
        print(f"✅ {template_type.capitalize()} template improved with manual feedback")
        return improved
    except Exception as e:
        print(f"❌ Failed to improve {template_type} template: {e}")
        raise


def generate_ai_feedback(letter: str, evaluation: Dict, case_content: str) -> str:
    """Generate template-improvement feedback from aggregated evaluation results.

    Each hallucination is counted into one category (dates, names, amounts or
    unsupported) by substring keyword match; only those counts and the quality
    notes are sent to the LLM. The `letter` and `case_content` arguments are
    accepted but not used.

    Returns the feedback text from `provider_llm`; errors propagate.
    """
    hallucinations = evaluation.get("hallucinations", [])
    quality_notes = evaluation.get("quality_notes", "")

    # Checked in order; the first bucket with a matching keyword wins.
    buckets = (
        ('dates', ('date', 'time', 'when', 'april', 'may', 'june')),
        ('names', ('name', 'person', 'company', 'mr', 'mrs')),
        ('amounts', ('amount', 'fee', 'charge', '%', '£', '$')),
    )
    patterns = {'dates': 0, 'names': 0, 'amounts': 0, 'unsupported': 0}

    for entry in hallucinations:
        lowered = str(entry).lower()
        for bucket, words in buckets:
            if any(word in lowered for word in words):
                patterns[bucket] += 1
                break
        else:
            patterns['unsupported'] += 1

    prompt = f"""You are a senior paraplanner providing feedback to improve letter templates.

Evaluation results:
- Total hallucinations found: {len(hallucinations)}
- Quality notes: {quality_notes}
- Template needing improvement: {evaluation.get('template_needing_improvement', 'unknown')}

Pattern analysis (no specific details):
- Date-related issues: {patterns['dates']}
- Name-related issues: {patterns['names']}
- Amount-related issues: {patterns['amounts']}
- Unsupported claims: {patterns['unsupported']}

Provide specific, actionable feedback to improve the templates. Focus on:
1. How to prevent these TYPES of hallucinations
2. What specific guidance to add to templates
3. Which template needs the most work

Be concise and practical. Do NOT reference any specific client details."""

    request = [{"role": "user", "content": prompt}]
    return provider_llm(
        messages=request,
        temperature=0.3,
        call_type='generate_feedback'
    )

In [109]:
# Cell 3.5: Evaluation Types and Structures

from enum import Enum
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional

class HallucinationType(Enum):
    """Categories of hallucination a verified letter fact can be assigned.

    Each member's value is the lowercase string form of its name.
    """
    FACTUAL_ERROR = "factual_error"  # Contradicts case file
    UNSUPPORTED_CLAIM = "unsupported_claim"  # Not found in case
    FABRICATED_DETAIL = "fabricated_detail"  # Completely made up
    INCONSISTENCY = "inconsistency"  # Contradicts elsewhere in letter

@dataclass
class CaseFact:
    """A fact extracted from case files"""
    content: str  # The actual fact
    fact_type: str  # date/name/amount/reference/claim/quote
    source_file: str  # Which case file it came from
    context: str  # Surrounding text for verification
    
    def to_dict(self) -> dict:
        """Return the fact as a plain dict keyed by field name."""
        return asdict(self)
    
    @classmethod
    def from_dict(cls, data: dict) -> "CaseFact":
        """Build a CaseFact from a dict whose keys match the field names.

        Keys are passed straight to the constructor, so a missing or
        unexpected key raises TypeError.
        """
        return cls(**data)

@dataclass
class LetterFact:
    """A fact extracted from the letter"""
    content: str  # The actual fact/claim
    fact_type: str  # date/name/amount/reference/claim/quote
    location: str  # Where in letter (section/paragraph)
    # Defaults to True: a fact is to be verified unless explicitly marked otherwise.
    requires_verification: bool = True

@dataclass
class FactVerification:
    """Result of verifying a letter fact"""
    letter_fact: LetterFact  # The letter fact that was checked
    verdict: str  # SUPPORTED/CONTRADICTED/UNSUPPORTED
    explanation: str  # Free-text reasoning for the verdict
    supporting_case_facts: List[CaseFact]  # Case facts considered relevant to the verdict
    correct_info: Optional[str] = None  # Optional; presumably the corrected value when one is known (confirm with the verifier code)
    hallucination_type: Optional[HallucinationType] = None  # Optional; None when no category has been assigned

# State for the evaluation workflow
class EvaluationState(TypedDict):
    """State shared by the nodes of the evaluation workflow.

    Every key that a node writes must be declared here so that it is part
    of the graph's state schema; ``metrics_snapshot`` is written by
    ``compile_results_node`` and read back by ``run_enhanced_evaluation``.
    """
    letter: str  # Letter text being evaluated
    case_facts: List[CaseFact]  # Pre-extracted from case files
    letter_facts: List[LetterFact]  # Filled by extract_letter_facts_node
    verifications: List[FactVerification]  # Filled by verify_facts_node
    hallucinations: List[Dict]  # Filled by compile_results_node
    score: float  # Percentage of verified facts that were SUPPORTED
    detailed_report: Dict[str, Any]
    metrics_snapshot: Dict[str, Any]  # Per-evaluation metrics for iteration tracking

In [110]:
# Cell 4.5: Case Fact Extraction (One-time per session)

from pydantic import BaseModel
from typing import List

# Pydantic models for case facts extraction
class CaseFactExtraction(BaseModel):
    # Structured-output shape for a single extracted case fact. The field
    # names mirror the CaseFact dataclass that extract_case_facts_once builds
    # from each returned item.
    # (Documented with comments rather than a docstring so the model's
    # generated schema is left untouched.)
    content: str  # the fact itself
    fact_type: str  # date, name, amount, reference, claim, quote, or other
    source_file: str  # file the fact came from
    context: str  # surrounding text

class CaseFactsResponse(BaseModel):
    # Top-level structured-output shape passed as `response_format` when
    # extracting case facts; callers read the list back under the 'facts' key.
    facts: List[CaseFactExtraction]


def extract_case_facts_once(case_content: str, case_folder: str) -> List[CaseFact]:
    """Ask the LLM for every verifiable fact in the case files.

    Intended to run a single time per session because the call is
    expensive; the returned list is meant to be reused across iterations.

    Args:
        case_content: Combined text of the case files, embedded in the prompt.
        case_folder: Accepted for interface compatibility; not read here.

    Returns:
        The extracted facts as ``CaseFact`` objects, or an empty list when
        the extraction fails for any reason (the error is printed).
    """
    print("📊 Extracting facts from case files (one-time operation)...")

    prompt = f"""You are a meticulous fact extractor for legal/financial documents.
    
Extract ALL verifiable facts from these case files:

{case_content}

For each fact, extract:
1. The exact fact/information
2. Type: date, name, amount, reference, claim, quote, or other
3. Which file it came from (based on file markers in the text)
4. Surrounding context (1-2 sentences around the fact)

Focus on:
- ALL dates (meetings, deadlines, document dates, birth dates, etc.)
- ALL names (people, companies, products, places)
- ALL numbers (amounts, percentages, reference numbers, ages)
- ALL specific claims, decisions, or agreements
- ALL quoted statements
- ALL document references

Be exhaustive - we'll use this as our source of truth."""

    try:
        # Structured output: the response is read as a mapping with a 'facts' list
        llm_payload = provider_llm(
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            response_format=CaseFactsResponse,
            call_type='extract_case_facts'
        )

        # Turn each raw item into a CaseFact, defaulting missing fields to ''
        case_facts = [
            CaseFact(
                content=raw.get('content', ''),
                fact_type=raw.get('fact_type', ''),
                source_file=raw.get('source_file', ''),
                context=raw.get('context', '')
            )
            for raw in llm_payload.get('facts', [])
        ]

        print(f"✅ Extracted {len(case_facts)} facts from case files")

        # Tally facts per type for the printed summary
        counts_by_type = {}
        for extracted in case_facts:
            previous = counts_by_type.get(extracted.fact_type, 0)
            counts_by_type[extracted.fact_type] = previous + 1

        print("📋 Fact breakdown:")
        for type_name, total in sorted(counts_by_type.items()):
            print(f"   - {type_name}: {total}")

        return case_facts

    except Exception as err:
        # Best-effort: report the failure and let the caller continue without facts
        print(f"❌ Failed to extract case facts: {err}")
        return []


def save_case_facts(session_dir: Path, case_facts: List[CaseFact]) -> None:
    """Write the extracted case facts to ``case_facts.json`` in the session folder.

    Args:
        session_dir: Directory of the current session.
        case_facts: Facts to serialise; each is converted with ``to_dict``.
    """
    target = session_dir / "case_facts.json"
    serialisable = [item.to_dict() for item in case_facts]

    with target.open('w', encoding='utf-8') as handle:
        json.dump(serialisable, handle, indent=2, ensure_ascii=False)

    print(f"💾 Saved case facts to {target.name}")


def load_case_facts(session_dir: Path) -> List[CaseFact]:
    """Read back case facts previously stored in the session folder.

    Args:
        session_dir: Directory of the session to load from.

    Returns:
        The stored facts as ``CaseFact`` objects, or an empty list when
        ``case_facts.json`` does not exist.
    """
    source = session_dir / "case_facts.json"

    if source.exists():
        with source.open('r', encoding='utf-8') as handle:
            stored = json.load(handle)
        return [CaseFact.from_dict(entry) for entry in stored]

    return []

In [111]:
# Cell 5.5: Enhanced Evaluation Workflow

from langgraph.graph import StateGraph, END
from pydantic import BaseModel
from typing import List, Optional
import concurrent.futures
import time

# Configuration for batch processing
# VERIFICATION_BATCH_SIZE is the slice length used when verify_facts_node
# splits the letter facts into batches; each batch is one verify_fact_batch call.
VERIFICATION_BATCH_SIZE = 5  # Number of facts to verify in one API call
# MAX_PARALLEL_BATCHES is passed as max_workers to the thread pool that runs
# the batch verifications.
MAX_PARALLEL_BATCHES = 3    # Number of concurrent batch verifications

# Pydantic models for structured outputs
class LetterFactExtraction(BaseModel):
    # Structured-output shape for a single fact extracted from a letter. The
    # field names mirror the LetterFact dataclass built in
    # extract_letter_facts_node.
    # (Documented with comments rather than a docstring so the model's
    # generated schema is left untouched.)
    content: str  # the fact/claim as written
    fact_type: str  # date, name, amount, reference, claim, quote, or other
    location: str  # section/paragraph description
    requires_verification: bool  # no default here, unlike LetterFact

class LetterFactsResponse(BaseModel):
    # Top-level structured-output shape passed as `response_format` by
    # extract_letter_facts_node; the list is read back under the 'facts' key.
    facts: List[LetterFactExtraction]

class FactVerificationResult(BaseModel):
    # Structured-output shape for verifying a single fact.
    # NOTE(review): not referenced in the visible part of this file; the batch
    # path uses BatchVerificationResult instead - confirm whether still needed.
    verdict: str  # expected: SUPPORTED / CONTRADICTED / UNSUPPORTED
    explanation: str
    correct_info: Optional[str] = None
    hallucination_type: Optional[str] = None  # string form, e.g. "factual_error"

class BatchVerificationResult(BaseModel):
    # Structured-output shape for one fact's verdict within a batch response.
    # verify_fact_batch pairs these with the input facts by position.
    fact_content: str  # the fact this verdict refers to
    verdict: str  # expected: SUPPORTED / CONTRADICTED / UNSUPPORTED
    explanation: str
    correct_info: Optional[str] = None  # requested when the verdict is CONTRADICTED
    hallucination_type: Optional[str] = None  # lowercase value such as "factual_error", or null

class BatchVerificationResponse(BaseModel):
    # Top-level structured-output shape passed as `response_format` by
    # verify_fact_batch; results are read back under the 'verifications' key.
    verifications: List[BatchVerificationResult]


def extract_letter_facts_node(state: EvaluationState) -> EvaluationState:
    """Extract the key verifiable facts from the generated letter.

    Sends the letter to the LLM with a structured-output schema and stores
    the resulting ``LetterFact`` objects in ``state['letter_facts']``.

    Args:
        state: Evaluation state; ``state['letter']`` is read.

    Returns:
        The same state with ``letter_facts`` set. On any failure the error
        is printed and ``letter_facts`` is set to an empty list.
    """
    print("  📋 Extracting facts from letter...")

    # The fact limit is stated consistently (previously the prompt asked for
    # "20-30" and "100" facts at the same time).
    prompt = f"""You are a meticulous fact extractor analyzing a financial advisory letter.

Letter to analyze:
{state['letter']}

Extract the MOST IMPORTANT verifiable facts/claims from the letter (maximum 20-30 facts):
1. The exact fact/claim as written
2. Type: date, name, amount, reference, claim, quote, or other
3. Location in letter (section/paragraph description)
4. Whether it requires verification (true for all factual claims)

Focus on KEY facts only:
- Critical dates (meeting dates, deadlines)
- Client and company names
- Specific amounts or percentages
- Important claims or recommendations
- Key document references

IMPORTANT: Only extract the most significant facts - aim for 30 facts maximum."""

    try:
        facts_response = provider_llm(
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            response_format=LetterFactsResponse,
            # Tag the call like the other LLM calls so it shows up in metrics
            call_type='extract_letter_facts'
        )

        # Missing fields fall back to '' / True so one sparse item cannot fail the batch
        letter_facts = [
            LetterFact(
                content=fact.get('content', ''),
                fact_type=fact.get('fact_type', ''),
                location=fact.get('location', ''),
                requires_verification=fact.get('requires_verification', True)
            )
            for fact in facts_response.get('facts', [])
        ]

        state['letter_facts'] = letter_facts
        print(f"  ✅ Extracted {len(letter_facts)} facts from letter")

    except Exception as e:
        # Best-effort: downstream nodes handle an empty fact list
        print(f"  ❌ Failed to extract letter facts: {e}")
        state['letter_facts'] = []

    return state


def verify_fact_batch(batch: List[LetterFact], case_facts: List[CaseFact], batch_num: int) -> List[FactVerification]:
    """Verify a batch of letter facts against the case facts in one LLM call.

    Args:
        batch: Letter facts to verify.
        case_facts: Facts extracted from the case files (source of truth).
        batch_num: 1-based batch number, used only in progress messages.

    Returns:
        Exactly one ``FactVerification`` per fact in ``batch``, in the same
        order. Facts for which the model returned no result, and every fact
        of a batch whose call failed, are reported as ``UNSUPPORTED``.
    """
    print(f"  🔍 Verifying batch {batch_num} ({len(batch)} facts)...")

    # Number facts by position; looking the fact up with list.index() would
    # give duplicate facts the same number.
    facts_to_verify = [
        f"""
Fact {position}:
- Content: {fact.content}
- Type: {fact.fact_type}
- Location: {fact.location}"""
        for position, fact in enumerate(batch, start=1)
    ]

    # Get relevant case facts for all facts in batch: the first three words of
    # each letter fact act as (substring) keywords.
    all_keywords = []
    for fact in batch:
        all_keywords.extend(fact.content.lower().split()[:3])

    case_facts_context = "\n".join([
        f"- {cf.content} (Type: {cf.fact_type}, From: {cf.source_file})"
        for cf in case_facts
        if any(keyword in cf.content.lower() for keyword in all_keywords)
    ][:30])  # Limit context size

    prompt = f"""You are a fact-checker verifying multiple claims in a letter against source documents.

FACTS TO VERIFY:
{chr(10).join(facts_to_verify)}

RELEVANT CASE FACTS:
{case_facts_context if case_facts_context else "No directly matching facts found"}

CASE FACTS SUMMARY:
Total facts available: {len(case_facts)}
Types: {', '.join(set(cf.fact_type for cf in case_facts))}

For EACH fact above:
1. Determine if it is SUPPORTED, CONTRADICTED, or UNSUPPORTED by case facts
2. If CONTRADICTED, provide the correct information
3. Categorize any issues with lowercase values: factual_error, unsupported_claim, fabricated_detail, or null

Return a verification for each fact in order."""

    # Map the lowercase string the model returns to the enum member
    type_mapping = {member.value: member for member in HallucinationType}

    try:
        response = provider_llm(
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            response_format=BatchVerificationResponse,
            # Tag the call like the other LLM calls so it shows up in metrics
            call_type='verify_fact_batch'
        )

        results = response.get('verifications') or []
        verifications = []

        for position, fact in enumerate(batch):
            if position >= len(results):
                # The model returned fewer results than facts: keep the fact in
                # the evaluation instead of silently dropping it.
                verifications.append(FactVerification(
                    letter_fact=fact,
                    verdict='UNSUPPORTED',
                    explanation="No verification result returned for this fact",
                    supporting_case_facts=[],
                    hallucination_type=HallucinationType.UNSUPPORTED_CLAIM
                ))
                continue

            result = results[position]

            # Normalise so 'supported' / ' Supported ' compare equal downstream
            verdict = str(result.get('verdict') or 'UNSUPPORTED').strip().upper()

            # Handle enum case mismatch; a supported fact never carries a type
            hallucination_type = None
            hallucination_type_str = result.get('hallucination_type')
            if hallucination_type_str and verdict != 'SUPPORTED':
                hallucination_type = type_mapping.get(str(hallucination_type_str).strip().lower())

            # Find up to three supporting case facts if supported
            supporting_facts = []
            if verdict == 'SUPPORTED':
                fact_words = fact.content.lower().split()
                for cf in case_facts:
                    cf_text = cf.content.lower()
                    if any(word in cf_text for word in fact_words):
                        supporting_facts.append(cf)
                        if len(supporting_facts) >= 3:
                            break

            verifications.append(FactVerification(
                letter_fact=fact,
                verdict=verdict,
                explanation=result.get('explanation', 'No explanation provided'),
                supporting_case_facts=supporting_facts,
                correct_info=result.get('correct_info'),
                hallucination_type=hallucination_type
            ))

        print(f"  ✅ Batch {batch_num} verified successfully")
        return verifications

    except Exception as e:
        print(f"  ❌ Failed to verify batch {batch_num}: {str(e)}")
        # Return unsupported for all facts in failed batch
        return [
            FactVerification(
                letter_fact=fact,
                verdict='UNSUPPORTED',
                explanation=f"Batch verification failed: {str(e)}",
                supporting_case_facts=[],
                hallucination_type=HallucinationType.UNSUPPORTED_CLAIM
            ) for fact in batch
        ]


def verify_facts_node(state: EvaluationState) -> EvaluationState:
    """Verify the letter facts in parallel batches.

    At most the first 20 letter facts are verified. They are split into
    batches of ``VERIFICATION_BATCH_SIZE`` and each batch is checked by
    ``verify_fact_batch`` on a thread pool of ``MAX_PARALLEL_BATCHES``
    workers.

    Args:
        state: Evaluation state; ``letter_facts`` and ``case_facts`` are read.

    Returns:
        The same state with ``verifications`` set, ordered like the facts
        that were verified.
    """
    print("  🔍 Verifying facts in parallel batches...")

    facts_to_verify = state['letter_facts'][:20]  # Limit total facts

    # Split facts into batches
    batches = [
        facts_to_verify[start:start + VERIFICATION_BATCH_SIZE]
        for start in range(0, len(facts_to_verify), VERIFICATION_BATCH_SIZE)
    ]

    print(f"  📦 Created {len(batches)} batches (size {VERIFICATION_BATCH_SIZE} each)")

    # One result slot per batch. Batches finish in arbitrary order, so results
    # are stored by batch index and flattened afterwards. This restores the
    # original order without comparing facts for equality (duplicate facts
    # would otherwise be reordered).
    results_by_batch = [[] for _ in batches]

    # Process batches in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_PARALLEL_BATCHES) as executor:
        # Submit all batch verification tasks
        future_to_batch = {
            executor.submit(verify_fact_batch, batch, state['case_facts'], i + 1): (i, batch)
            for i, batch in enumerate(batches)
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_batch):
            batch_idx, batch = future_to_batch[future]
            try:
                verifications = future.result()
            except Exception as e:
                print(f"  ❌ Batch {batch_idx+1} failed: {str(e)}")
                # Add failed verifications for this batch
                verifications = [
                    FactVerification(
                        letter_fact=fact,
                        verdict='UNSUPPORTED',
                        explanation=f"Verification failed: {str(e)}",
                        supporting_case_facts=[],
                        hallucination_type=HallucinationType.UNSUPPORTED_CLAIM
                    )
                    for fact in batch
                ]
            else:
                # Quick summary of batch results
                supported = sum(1 for v in verifications if v.verdict == 'SUPPORTED')
                print(f"     Batch {batch_idx+1}: {supported}/{len(verifications)} supported")

            results_by_batch[batch_idx] = verifications

    # Flatten in batch order, i.e. the original fact order
    state['verifications'] = [
        verification
        for batch_result in results_by_batch
        for verification in batch_result
    ]

    print(f"  ✅ Completed parallel verification of {len(state['verifications'])} facts")

    return state


def compile_results_node(state: EvaluationState) -> EvaluationState:
    """Compile verification results into the final evaluation with metrics.

    Counts verdicts, collects every non-supported fact as a hallucination,
    computes the accuracy score (percentage of verified facts that are
    SUPPORTED) and builds the detailed report and metrics snapshot.

    Args:
        state: Evaluation state; ``verifications`` and ``letter_facts`` are read.

    Returns:
        The same state with ``hallucinations``, ``score``,
        ``detailed_report`` and ``metrics_snapshot`` set.
    """
    print("  📊 Compiling evaluation results...")

    # Count verdicts
    verdict_counts = {
        'SUPPORTED': 0,
        'CONTRADICTED': 0,
        'UNSUPPORTED': 0
    }

    hallucinations = []
    hallucination_types = {}

    for verification in state['verifications']:
        # Verdicts originate from an LLM: normalise case/whitespace and treat
        # anything unrecognised as UNSUPPORTED rather than raising KeyError.
        verdict = str(verification.verdict or '').strip().upper()
        if verdict not in verdict_counts:
            verdict = 'UNSUPPORTED'
        verdict_counts[verdict] += 1

        if verdict == 'SUPPORTED':
            continue

        # Anything not supported is reported as a hallucination
        h_type = verification.hallucination_type.value if verification.hallucination_type else 'unknown'
        hallucination = {
            'fact': verification.letter_fact.content,
            'type': h_type,
            'location': verification.letter_fact.location,
            'explanation': verification.explanation,
            'verdict': verdict
        }

        if verification.correct_info:
            hallucination['correct_info'] = verification.correct_info

        hallucinations.append(hallucination)

        # Track hallucination types (keys are the lowercase enum values)
        hallucination_types[h_type] = hallucination_types.get(h_type, 0) + 1

    # Calculate score (percentage of supported facts).
    # NOTE(review): with no verified facts the score is 100, which also covers
    # the case where fact extraction failed - confirm this is intended.
    total_facts = len(state['verifications'])
    score = (verdict_counts['SUPPORTED'] / total_facts * 100) if total_facts > 0 else 100
    overall_verdict = 'PASS' if score >= 95 else 'NEEDS_IMPROVEMENT'

    # Determine which template needs work. The lookups use the same lowercase
    # keys that are stored in hallucination_types above.
    if len(hallucinations) == 0:
        template_needing_improvement = 'none'
    elif hallucination_types.get('fabricated_detail', 0) > hallucination_types.get('factual_error', 0):
        template_needing_improvement = 'specific'  # Specific details are being fabricated
    elif any('date' in h['fact'].lower() or 'amount' in h['fact'].lower() for h in hallucinations):
        template_needing_improvement = 'general'  # General structure issues
    else:
        template_needing_improvement = 'both'

    # Build evaluation metrics snapshot
    metrics_snapshot = {
        'timestamp': datetime.now().isoformat(),
        'total_facts_extracted': len(state['letter_facts']),
        'total_facts_verified': total_facts,
        'verification_summary': verdict_counts,
        'hallucination_breakdown': hallucination_types,
        'accuracy_score': round(score, 1),
        'verdict': overall_verdict,
        'template_recommendation': template_needing_improvement,
        'processing_stats': {
            'batch_size': VERIFICATION_BATCH_SIZE,
            'parallel_batches': MAX_PARALLEL_BATCHES,
            'facts_per_batch': VERIFICATION_BATCH_SIZE
        }
    }

    # Build detailed report
    detailed_report = {
        'total_facts_checked': total_facts,
        'verification_summary': verdict_counts,
        'hallucination_types': hallucination_types,
        'score': round(score, 1),
        'verdict': overall_verdict,
        'top_issues': hallucinations[:5],  # Top 5 issues
        'metrics_snapshot': metrics_snapshot
    }

    # Update state
    state['hallucinations'] = hallucinations
    state['score'] = score
    state['detailed_report'] = detailed_report
    state['metrics_snapshot'] = metrics_snapshot  # Store for iteration tracking

    print(f"  ✅ Evaluation complete: {score:.1f}% accuracy, {len(hallucinations)} issues")

    return state


def build_evaluation_workflow():
    """Assemble and compile the linear evaluation graph.

    The graph runs extract_letter_facts -> verify_facts -> compile_results
    and then ends.
    """
    graph = StateGraph(EvaluationState)

    # Stages in execution order: (node name, node function)
    stages = (
        ("extract_letter_facts", extract_letter_facts_node),
        ("verify_facts", verify_facts_node),
        ("compile_results", compile_results_node),
    )

    for stage_name, stage_fn in stages:
        graph.add_node(stage_name, stage_fn)

    # Chain the stages one after another, finishing at END
    graph.set_entry_point(stages[0][0])
    for (upstream, _), (downstream, _) in zip(stages, stages[1:]):
        graph.add_edge(upstream, downstream)
    graph.add_edge(stages[-1][0], END)

    return graph.compile()


def run_enhanced_evaluation(letter: str, case_facts: List[CaseFact]) -> Dict[str, Any]:
    """Run the enhanced evaluation workflow for one letter.

    Args:
        letter: Text of the generated letter.
        case_facts: Facts previously extracted from the case files.

    Returns:
        A dictionary in the format the main workflow expects, with the keys
        ``hallucinations``, ``template_needing_improvement``,
        ``quality_notes``, ``improvement_suggestions``, ``facts_checked``,
        ``score``, ``detailed_report`` and ``metrics_snapshot``.
    """
    print("🔬 Running enhanced evaluation workflow...")

    # Initialize evaluation state
    eval_state = {
        'letter': letter,
        'case_facts': case_facts,
        'letter_facts': [],
        'verifications': [],
        'hallucinations': [],
        'score': 0.0,
        'detailed_report': {},
        'metrics_snapshot': {}
    }

    # Build and run evaluation workflow
    eval_app = build_evaluation_workflow()
    final_eval_state = eval_app.invoke(eval_state)

    hallucinations = final_eval_state['hallucinations']
    score = final_eval_state['score']
    detailed_report = final_eval_state['detailed_report']

    # The snapshot is also embedded in the detailed report; fall back to that
    # copy in case the top-level key did not survive the graph run.
    metrics_snapshot = (
        final_eval_state.get('metrics_snapshot')
        or detailed_report.get('metrics_snapshot', {})
    )

    # Extract the evaluation result in format expected by main workflow
    return {
        'hallucinations': hallucinations,
        'template_needing_improvement': determine_template_from_results(final_eval_state),
        'quality_notes': f"Score: {score:.1f}% - {len(hallucinations)} issues found",
        'improvement_suggestions': generate_improvement_suggestions(final_eval_state),
        # Report the facts that were actually verified (the score is computed
        # over these), not every fact that was extracted.
        'facts_checked': len(final_eval_state['verifications']),
        'score': score,
        'detailed_report': detailed_report,
        'metrics_snapshot': metrics_snapshot
    }


def determine_template_from_results(eval_state: Dict) -> str:
    """Determine which template needs improvement based on evaluation.

    Args:
        eval_state: Mapping with a ``'hallucinations'`` list. Each entry is
            expected to be a dict whose ``'type'`` holds a hallucination type
            name; entries without a usable type count as ``'unknown'``.

    Returns:
        ``'none'`` when there are no hallucinations, ``'specific'`` when
        fabricated details outnumber factual errors, ``'general'`` when
        factual errors outnumber unsupported claims, otherwise ``'both'``.
    """
    hallucinations = eval_state['hallucinations']

    if not hallucinations:
        return 'none'

    # Tally by lowercase type name. The evaluation stores the lowercase enum
    # values ('fabricated_detail', ...), so comparing against uppercase names
    # would never match; normalising accepts either spelling.
    type_counts = {}
    for h in hallucinations:
        raw_type = h.get('type') if isinstance(h, dict) else None
        type_key = str(raw_type or 'unknown').strip().lower()
        type_counts[type_key] = type_counts.get(type_key, 0) + 1

    fabricated = type_counts.get('fabricated_detail', 0)
    factual_errors = type_counts.get('factual_error', 0)
    unsupported = type_counts.get('unsupported_claim', 0)

    if fabricated > factual_errors:
        return 'specific'  # Too many made-up details
    elif factual_errors > unsupported:
        return 'general'  # Structure causing errors
    else:
        return 'both'  # Both need work


def generate_improvement_suggestions(eval_state: Dict) -> str:
    """Generate specific improvement suggestions based on evaluation.

    Args:
        eval_state: Mapping with a ``'detailed_report'`` dict whose optional
            ``'hallucination_types'`` entry maps type names to counts.

    Returns:
        The applicable suggestions joined with ``"; "``, or
        ``"Templates performing well"`` when none applies.
    """
    report = eval_state['detailed_report']
    raw_counts = report.get('hallucination_types', {}) or {}

    # The report is keyed by the lowercase enum values ('fabricated_detail',
    # ...); normalise so that either spelling of a type name is recognised.
    counts = {}
    for type_name, count in raw_counts.items():
        type_key = str(type_name).strip().lower()
        counts[type_key] = counts.get(type_key, 0) + count

    # Suggestions in the order they are reported
    advice_by_type = (
        ('fabricated_detail', "Add more specific constraints to prevent fabrication"),
        ('factual_error', "Clarify fact extraction rules in templates"),
        ('unsupported_claim', "Emphasize using only provided information"),
    )
    suggestions = [
        advice for type_key, advice in advice_by_type
        if counts.get(type_key, 0) > 0
    ]

    return "; ".join(suggestions) if suggestions else "Templates performing well"

In [112]:
# Cell 6: LangGraph Workflow

# Define state - Updated to include case facts
class LetterState(TypedDict):
    """State shared by the nodes of the letter generation workflow.

    Every key that a node writes must be declared here so that it is part
    of the graph's state schema.
    """
    # Input data
    case_folder: str
    template_type: str
    client_name: str
    session_dir: Path

    # Current templates (updated each iteration)
    general_template: str
    specific_template: str

    # Case content and facts
    case_content: str
    case_facts: List[CaseFact]  # Extracted once, reused
    case_facts_extracted: bool

    # Current iteration
    iteration: int
    letter: str
    evaluation: Dict[str, Any]
    hallucinations: List[Any]  # Written by evaluate_node (entries may be dicts or strings)
    evaluation_notes: str  # Written by evaluate_node from the evaluation's quality notes

    # User interaction
    user_decision: str  # 'accept', 'ai_feedback', 'manual_feedback'
    user_feedback: Optional[str]

    # Track what was updated
    general_template_updated: bool
    specific_template_updated: bool

    # History tracking
    iterations_data: List[Dict]


# Define nodes with debugging
def draft_node(state: LetterState) -> LetterState:
    """Advance the iteration counter and draft a new letter.

    Resets the per-iteration LLM metrics, then drafts the letter from the
    current general template, specific template and case content. Any
    drafting error is printed and re-raised.
    """
    upcoming_iteration = state['iteration'] + 1
    print(f"\n🔄 DRAFT NODE: Starting iteration {upcoming_iteration}...")

    # Start this iteration with fresh LLM metrics
    reset_llm_metrics()

    try:
        state['iteration'] = upcoming_iteration
        drafted = draft_letter(
            state['general_template'],
            state['specific_template'],
            state['case_content']
        )
        state['letter'] = drafted
        print("🔄 DRAFT NODE: Completed successfully")
    except Exception as err:
        print(f"🔄 DRAFT NODE: Failed - {err}")
        raise

    return state


def evaluate_node(state: LetterState) -> LetterState:
    """Evaluate the current letter and record the result in the state.

    Uses the enhanced evaluation workflow when case facts are available and
    the simple evaluation otherwise. If evaluation raises, the traceback is
    printed and a placeholder evaluation with no hallucinations is stored.
    """
    print("\n🔄 EVALUATE NODE: Starting...")

    # Debug: report how many cached case facts are available
    cached_facts = state.get('case_facts')
    available = len(cached_facts) if cached_facts else 0
    print(f"   Case facts available: {available}")

    try:
        if cached_facts:
            # Enhanced path: verify against the cached case facts
            print("   Using enhanced evaluation workflow...")
            outcome = run_enhanced_evaluation(
                state['letter'],
                state['case_facts']
            )
        else:
            # Fallback path: original evaluation on the raw case content
            print("⚠️  No case facts available, using simple evaluation")
            outcome = evaluate_letter(
                state['letter'],
                state['case_content'],
                state['general_template'],
                state['specific_template']
            )

        state['evaluation'] = outcome
        state['hallucinations'] = outcome.get('hallucinations', [])
        state['evaluation_notes'] = outcome.get('quality_notes', '')
        print("🔄 EVALUATE NODE: Completed successfully")
    except Exception as err:
        print(f"🔄 EVALUATE NODE: Failed - {err}")
        import traceback
        traceback.print_exc()
        # Placeholder result so the review step can still run
        state['evaluation'] = {
            "hallucinations": [],
            "template_needing_improvement": "unknown",
            "quality_notes": f"Evaluation failed: {str(err)}"
        }
        state['hallucinations'] = []

    return state


def human_review_node(state: LetterState) -> LetterState:
    """Show the evaluation to the user and record their decision.

    Prints the evaluation summary and up to five issues, saves the current
    iteration data, then prompts until the user picks one of: accept the
    letter, improve templates with AI feedback, or improve templates with
    manual feedback (which must be non-empty).

    Args:
        state: Workflow state; ``evaluation``, ``iteration`` and
            ``session_dir`` are read.

    Returns:
        The same state with ``user_decision`` set, ``user_feedback`` set for
        manual feedback, and a history entry appended to ``iterations_data``.
    """
    print("\n🔄 HUMAN REVIEW NODE: Starting...")

    # Display results
    print(f"\n{'='*60}")
    print(f"ITERATION {state['iteration']} RESULTS")
    print(f"{'='*60}")

    evaluation = state['evaluation']
    hallucinations = evaluation.get('hallucinations', [])

    print("\n📊 Evaluation Summary:")
    print(f"  - Hallucinations found: {len(hallucinations)}")
    print(f"  - Facts checked: {evaluation.get('facts_checked', 'N/A')}")
    print(f"  - Evaluation score: {evaluation.get('score', 'N/A')}")
    print(f"  - Template needing work: {evaluation.get('template_needing_improvement', 'unknown')}")
    print(f"  - Quality notes: {evaluation.get('quality_notes', 'N/A')}")

    # Show detailed report if available
    if 'detailed_report' in evaluation:
        report = evaluation['detailed_report']
        print("\n📈 Detailed Analysis:")
        print(f"  - Verdict: {report.get('verdict', 'N/A')}")
        verification = report.get('verification_summary', {})
        print(f"  - Facts supported: {verification.get('SUPPORTED', 0)}")
        print(f"  - Facts contradicted: {verification.get('CONTRADICTED', 0)}")
        print(f"  - Facts unsupported: {verification.get('UNSUPPORTED', 0)}")

    if hallucinations:
        print("\n🚨 Issues detected:")
        for i, h in enumerate(hallucinations[:5], 1):
            # Handle both string and dict hallucinations
            if isinstance(h, str):
                print(f"  {i}. {h}")
            else:
                print(f"  {i}. {h.get('fact', h)}  [{h.get('type', 'unknown')}]")
                if 'explanation' in h:
                    print(f"     → {h['explanation']}")
                if 'correct_info' in h and h['correct_info'] != 'Not found in case files':
                    print(f"     ✓ Should be: {h['correct_info']}")
        if len(hallucinations) > 5:
            print(f"  ... and {len(hallucinations) - 5} more")

    # Save current iteration data
    save_iteration_data(state['session_dir'], state['iteration'], state)

    # Print letter location. The path is built outside the f-string because
    # nesting an f-string that reuses the same quote character is a
    # SyntaxError before Python 3.12.
    letter_path = state['session_dir'] / f"letter_iteration_{state['iteration']}.html"
    print(f"\n📄 Current letter saved to: {letter_path}")

    # Get user decision - Simplified UI
    print("\n📋 Options:")
    print("1. Accept letter as is")
    print("2. Use AI to improve templates based on evaluation")
    print("3. Provide manual feedback for template improvement")

    while True:
        choice = input("\nYour choice (1-3): ").strip()

        if choice == "1":
            state['user_decision'] = 'accept'
            print("✅ Letter accepted!")
            break
        elif choice == "2":
            state['user_decision'] = 'ai_feedback'
            print("🤖 Using AI feedback to improve templates...")
            break
        elif choice == "3":
            feedback = input("\nEnter your feedback for template improvement: ").strip()
            if feedback:
                state['user_feedback'] = feedback
                state['user_decision'] = 'manual_feedback'
                print("📝 Manual feedback recorded")
                break
            else:
                print("❌ No feedback provided, please try again")
        else:
            print("❌ Invalid choice, please enter 1, 2, or 3")

    # Add to history. Feedback is only attributed to this iteration when the
    # user actually chose manual feedback; otherwise text left in the state
    # by an earlier iteration would be recorded against this decision.
    if state['user_decision'] == 'manual_feedback':
        recorded_feedback = state.get('user_feedback', '')
    else:
        recorded_feedback = ''

    iteration_data = {
        'iteration': state['iteration'],
        'hallucinations_count': len(hallucinations),
        'user_decision': state['user_decision'],
        'user_feedback': recorded_feedback,
        'evaluation': evaluation
    }
    state['iterations_data'].append(iteration_data)

    print("🔄 HUMAN REVIEW NODE: Completed")
    return state


def ai_improve_templates_node(state: LetterState) -> LetterState:
    """Improve templates using AI feedback.

    Reads the evaluation's ``template_needing_improvement`` recommendation
    (defaulting to ``'both'``) and rewrites the general and/or specific
    template accordingly, setting the matching ``*_template_updated`` flags.

    Args:
        state: Workflow state; ``evaluation``, ``letter``, ``case_content``
            and both templates are read.

    Returns:
        The same state with updated templates and flags.

    Raises:
        Exception: Any error from the feedback or improvement calls is
            printed and re-raised.
    """
    print("\n🔄 AI IMPROVE TEMPLATES NODE: Starting...")

    evaluation = state['evaluation']
    which_template = evaluation.get('template_needing_improvement', 'both')

    # Reset update flags
    state['general_template_updated'] = False
    state['specific_template_updated'] = False

    try:
        # Generate AI feedback.
        # NOTE(review): the returned feedback is not passed on to the template
        # improvement calls below - confirm whether it should be.
        ai_feedback = generate_ai_feedback(
            state['letter'],
            evaluation,
            state['case_content']
        )
        print("🤖 AI feedback generated")

        # Improve templates based on evaluation
        if which_template in ['general', 'both']:
            print("📝 Improving general template...")
            state['general_template'] = improve_template_with_ai(
                state['general_template'],
                'general',
                evaluation,
                state['letter']
            )
            state['general_template_updated'] = True

        if which_template in ['specific', 'both']:
            print("📝 Improving specific template...")
            state['specific_template'] = improve_template_with_ai(
                state['specific_template'],
                'specific',
                evaluation,
                state['letter']
            )
            state['specific_template_updated'] = True

        # Recommendations such as 'none' or 'unknown' select no template; say
        # so explicitly instead of silently re-drafting with unchanged templates.
        if not (state['general_template_updated'] or state['specific_template_updated']):
            print(f"⚠️  No template selected for improvement (recommendation: {which_template}), templates unchanged")

        print("🔄 AI IMPROVE TEMPLATES NODE: Completed")
    except Exception as e:
        print(f"🔄 AI IMPROVE TEMPLATES NODE: Failed - {e}")
        raise

    return state


def manual_improve_templates_node(state: LetterState) -> LetterState:
    """Improve templates using manual feedback.

    Asks the user which template(s) to improve and applies the feedback
    stored in ``state['user_feedback']``. The ``*_template_updated`` flags
    always describe this run: they are cleared first, also when there is no
    feedback and the node returns early.

    Args:
        state: Workflow state; ``user_feedback`` and both templates are read.

    Returns:
        The same state with updated templates and flags.

    Raises:
        Exception: Any error while prompting or improving is printed and
            re-raised.
    """
    print("\n🔄 MANUAL IMPROVE TEMPLATES NODE: Starting...")

    # Reset update flags before any early return so that flags left over from
    # a previous iteration are never reported for this one.
    state['general_template_updated'] = False
    state['specific_template_updated'] = False

    feedback = state.get('user_feedback', '')
    if not feedback:
        print("⚠️  No feedback provided, skipping improvement")
        return state

    try:
        # Show options BEFORE asking for choice
        print("\nWhich template should be improved?")
        print("1. General template only")
        print("2. Specific template only")
        print("3. Both templates")

        choice = input("\nYour choice (1-3): ").strip()

        if choice in ["1", "3"]:
            print("📝 Improving general template with manual feedback...")
            state['general_template'] = improve_template_with_feedback(
                state['general_template'],
                'general',
                feedback
            )
            state['general_template_updated'] = True

        if choice in ["2", "3"]:
            print("📝 Improving specific template with manual feedback...")
            state['specific_template'] = improve_template_with_feedback(
                state['specific_template'],
                'specific',
                feedback
            )
            state['specific_template_updated'] = True

        if choice not in ["1", "2", "3"]:
            print("⚠️  Invalid choice, skipping template improvement")

        print("🔄 MANUAL IMPROVE TEMPLATES NODE: Completed")
    except Exception as e:
        print(f"🔄 MANUAL IMPROVE TEMPLATES NODE: Failed - {e}")
        raise

    return state


def save_final_node(state: LetterState) -> LetterState:
    """Save the final accepted letter and the session summary.

    Writes ``state['letter']`` to ``final_accepted_letter.html`` in the
    session directory and then calls ``save_session_summary``.

    Args:
        state: Workflow state; ``session_dir`` and ``letter`` are read.

    Returns:
        The state, unchanged.
    """
    print("\n🔄 SAVE FINAL NODE: Starting...")

    # Save final letter. The encoding is explicit (as for the other files
    # written in this notebook) so that non-ASCII characters in the letter do
    # not depend on the platform's default encoding.
    final_letter_path = state['session_dir'] / "final_accepted_letter.html"
    with open(final_letter_path, "w", encoding="utf-8") as f:
        f.write(state['letter'])

    # Save session summary
    save_session_summary(state['session_dir'], state)

    print(f"✅ Final letter saved to: {final_letter_path}")
    print("🔄 SAVE FINAL NODE: Completed")

    return state


def route_human_decision(state: LetterState) -> str:
    """Translate the reviewer's decision into the name of the next node.

    ``state['user_decision']`` (empty string when absent) is matched against
    the known decisions; anything unrecognised falls back to ``'save_final'``
    so the workflow terminates instead of looping.
    """
    decision = state.get('user_decision', '')
    routes = (
        ('accept', 'save_final'),
        ('ai_feedback', 'ai_improve_templates'),
        ('manual_feedback', 'manual_improve_templates'),
    )
    # First matching decision wins; the default mirrors the "accept" route.
    return next(
        (target for known, target in routes if decision == known),
        'save_final',
    )


# Build graph
def build_letter_graph():
    """Assemble and compile the letter-generation state graph.

    Flow: ``draft`` -> ``evaluate`` -> ``human_review``; the review step
    branches via ``route_human_decision`` to ``save_final`` (which ends the
    run) or to one of the two template-improvement nodes, both of which loop
    back to ``draft``.

    Returns:
        The compiled graph produced by ``StateGraph.compile()``.
    """
    graph = StateGraph(LetterState)

    # Register each node under the name the edges below refer to.
    for node_name, node_fn in (
        ("draft", draft_node),
        ("evaluate", evaluate_node),
        ("human_review", human_review_node),
        ("ai_improve_templates", ai_improve_templates_node),
        ("manual_improve_templates", manual_improve_templates_node),
        ("save_final", save_final_node),
    ):
        graph.add_node(node_name, node_fn)

    # Linear part of the pipeline.
    graph.set_entry_point("draft")
    graph.add_edge("draft", "evaluate")
    graph.add_edge("evaluate", "human_review")

    # The router returns the target node name itself, so the mapping is identity.
    review_targets = ("save_final", "ai_improve_templates", "manual_improve_templates")
    graph.add_conditional_edges(
        "human_review",
        route_human_decision,
        {target: target for target in review_targets},
    )

    # Either kind of template improvement triggers a fresh draft.
    for improver in ("ai_improve_templates", "manual_improve_templates"):
        graph.add_edge(improver, "draft")

    # Accepting the letter ends the run.
    graph.add_edge("save_final", END)

    return graph.compile()


# Convenience function for running a session
def run_interactive_session(case_folder: str, template_type: str = "annual_review", max_iterations: int = 10) -> Dict[str, Any]:
    """Run an interactive letter generation session with looping.

    Creates a session folder, loads the case content and both templates,
    extracts case facts once, then builds the graph and invokes it with the
    initial state.

    Args:
        case_folder: Path of the client's case folder; its last path
            component is used as the client name.
        template_type: Name passed to ``read_specific_template``.
        max_iterations: Upper bound on draft/review loops. It is translated
            into the graph's recursion limit so the bound is actually
            enforced by the workflow.

    Returns:
        The final state returned by the compiled graph.

    Raises:
        Exception: Any error raised while loading data or running the graph
            is printed with a traceback and re-raised. Exceeding the
            iteration bound presumably surfaces as LangGraph's
            ``GraphRecursionError`` -- confirm against the installed version.
    """

    print(f"\n{'='*60}")
    print(f"🚀 STARTING LETTER GENERATION SESSION")
    print(f"{'='*60}")

    # Path(...).name tolerates a trailing slash and OS-specific separators,
    # where split('/')[-1] would yield '' or the whole string.
    client_name = Path(case_folder).name
    print(f"📁 Client: {client_name}")
    print(f"📄 Template type: {template_type}")

    try:
        # Create session folder
        session_dir = create_session_folder(client_name)
        print(f"📂 Session folder: {session_dir}")

        # Read initial data
        print("\n📖 Loading data...")
        case_content = read_case_folder(case_folder)
        general_template = read_general_template()
        specific_template = read_specific_template(template_type)

        print(f"✅ Loaded {len(case_content)} chars of case data")
        print(f"✅ Loaded general template ({len(general_template)} chars)")
        print(f"✅ Loaded specific template ({len(specific_template)} chars)")

        # Extract case facts ONCE so every loop iteration reuses them
        case_facts = extract_case_facts_once(case_content, case_folder)

        # Save case facts for debugging/review
        if case_facts:
            save_case_facts(session_dir, case_facts)
        else:
            print("⚠️  WARNING: No case facts extracted! Enhanced evaluation will not be available.")

        # Initialize state
        initial_state = {
            "case_folder": case_folder,
            "template_type": template_type,
            "client_name": client_name,
            "session_dir": session_dir,
            "case_content": case_content,
            "case_facts": case_facts,
            # NOTE(review): set even when case_facts is empty -- confirm intended.
            "case_facts_extracted": True,
            "general_template": general_template,
            "specific_template": specific_template,
            "iteration": 0,
            "letter": "",
            "evaluation": {},
            "user_decision": "",
            "user_feedback": None,
            "general_template_updated": False,
            "specific_template_updated": False,
            "iterations_data": []
        }

        # Build and run workflow
        print("\n🔄 Starting workflow...")
        app = build_letter_graph()

        # One loop is four node executions (draft, evaluate, human_review,
        # then an improve node or save_final). Without an explicit limit the
        # graph runs under LangGraph's default recursion limit regardless of
        # max_iterations, so the parameter had no effect on the loop.
        nodes_per_iteration = 4
        recursion_limit = max(1, max_iterations) * nodes_per_iteration + 1
        final_state = app.invoke(
            initial_state,
            config={"recursion_limit": recursion_limit},
        )

        # Check if we hit iteration limit
        if final_state['iteration'] >= max_iterations:
            print(f"\n⚠️  Maximum iterations ({max_iterations}) reached!")

        print(f"\n{'='*60}")
        print(f"✅ SESSION COMPLETED")
        print(f"{'='*60}")
        print(f"📈 Total iterations: {final_state['iteration']}")
        print(f"📂 All files saved in: {session_dir}")

        return final_state

    except Exception as e:
        print(f"\n❌ SESSION FAILED: {type(e).__name__}: {str(e)}")
        import traceback
        traceback.print_exc()
        raise

In [113]:
# Cell 7: Display Functions (kept for compatibility)

# Note: Most display functionality is now integrated into the workflow itself.
# These functions are kept for backward compatibility and standalone use.

def preview_letter(letter: str, max_chars: int = 500) -> None:
    """Print the beginning of ``letter`` between two separator rules.

    When the letter is longer than ``max_chars`` only that many characters
    are shown, followed by ``...``; otherwise the whole letter is printed.
    """
    rule = "-" * 60
    if len(letter) > max_chars:
        shown = letter[:max_chars] + "..."
    else:
        shown = letter

    print(f"Letter Preview (first {max_chars} chars of {len(letter)} total):")
    print(rule)
    print(shown)
    print(rule)

In [114]:
# Cell 9: Simple Test Function

def quick_test_session(client_letter: str = "A") -> None:
    """Run one ``annual_review`` session for a pre-selected client folder.

    The case folder is ``data/case-files/<client_letter>``. Any exception from
    the session is reported on stdout rather than propagated.
    """
    folder = "data/case-files/{}".format(client_letter)

    try:
        outcome = run_interactive_session(folder, "annual_review")

        # Both lines stay inside the try so a malformed result is reported
        # as a failed test as well.
        print(f"\n✨ Quick test completed for Client {client_letter}")
        print(f"📂 Results saved in: {outcome['session_dir']}")

    except Exception as e:
        print(f"❌ Test failed: {e}")

In [115]:
# Cell 8: Template Tracking Utilities

def browse_session_templates(session_name: Optional[str] = None) -> None:
    """Browse the templates saved by one session.

    Without ``session_name`` the first ten session folders (reverse name
    order) under ``sessions/`` that contain a ``templates`` directory are
    listed and the user picks one by number. The chosen session's general and
    specific templates are then listed with their sizes, followed by a menu
    to view, compare or export them.

    Args:
        session_name: Folder name under ``sessions/``; when falsy the user is
            prompted to choose.
    """
    def _iteration_order(path: Path):
        # Numeric iteration first so iteration_10 sorts after iteration_2.
        match = re.match(r"iteration_(\d+)", path.name)
        return (int(match.group(1)) if match else -1, path.name)

    sessions_dir = Path("sessions")
    max_listed = 10

    if not session_name:
        # Show available sessions
        print("\n📁 AVAILABLE SESSIONS")
        print("="*60)

        # A missing sessions/ folder just means nothing has been run yet.
        if sessions_dir.is_dir():
            session_folders = sorted(sessions_dir.iterdir(), reverse=True)
        else:
            session_folders = []

        valid_sessions = [
            folder for folder in session_folders
            if folder.is_dir() and (folder / "templates").exists()
        ]

        if not valid_sessions:
            print("No sessions with templates found.")
            return

        # Only the listed sessions are selectable; accepting a number beyond
        # the list would open a session the user never saw.
        listed_sessions = valid_sessions[:max_listed]
        for i, folder in enumerate(listed_sessions, 1):
            print(f"{i}. {folder.name}")

        choice = input("\nSelect session number: ").strip()
        # isdecimal() (unlike isdigit()) only accepts text that int() can parse.
        if not choice.isdecimal() or not 1 <= int(choice) <= len(listed_sessions):
            print("Invalid choice.")
            return

        session_dir = listed_sessions[int(choice) - 1]
    else:
        session_dir = sessions_dir / session_name
        if not session_dir.exists():
            print(f"Session not found: {session_name}")
            return

    # Browse templates in selected session
    templates_dir = session_dir / "templates"
    if not templates_dir.exists():
        print("No templates found in this session.")
        return

    print(f"\n📂 TEMPLATES IN: {session_dir.name}")
    print("="*60)

    # Group templates by type, in iteration order
    template_groups = (
        ("\n📄 General Templates:", "iteration_*_general.md"),
        ("\n📄 Specific Templates:", "iteration_*_specific.md"),
    )
    for heading, pattern in template_groups:
        templates = sorted(templates_dir.glob(pattern), key=_iteration_order)
        if templates:
            print(heading)
            for template in templates:
                size = template.stat().st_size
                print(f"  - {template.name} ({size:,} bytes)")

    # Follow-up actions; "4" or any other input simply returns.
    print("\nOptions:")
    print("1. View a specific template")
    print("2. Compare two templates")
    print("3. Export final templates")
    print("4. Return")

    choice = input("\nYour choice (1-4): ").strip()

    if choice == "1":
        view_template_from_session(session_dir)
    elif choice == "2":
        compare_session_templates(session_dir)
    elif choice == "3":
        export_final_templates(session_dir)


def view_template_from_session(session_dir: Path) -> None:
    """Let the user pick one saved template and print its beginning.

    Lists every ``iteration_*.md`` file in ``session_dir / "templates"`` in
    iteration order, prompts for a number and prints at most the first 1000
    characters of the chosen file.

    Args:
        session_dir: Session folder containing a ``templates`` directory.
    """
    def _iteration_order(path: Path):
        # Numeric iteration first so iteration_10 sorts after iteration_2.
        match = re.match(r"iteration_(\d+)", path.name)
        return (int(match.group(1)) if match else -1, path.name)

    preview_limit = 1000
    templates_dir = session_dir / "templates"
    all_templates = sorted(templates_dir.glob("iteration_*.md"), key=_iteration_order)

    # Nothing to choose from: say so instead of prompting over an empty list.
    if not all_templates:
        print("No templates found in this session.")
        return

    print("\nAvailable templates:")
    for i, template in enumerate(all_templates, 1):
        print(f"{i}. {template.name}")

    choice = input("\nSelect template number: ").strip()
    # isdecimal() (unlike isdigit()) only accepts text that int() can parse.
    if not choice.isdecimal() or not 1 <= int(choice) <= len(all_templates):
        print("Invalid choice.")
        return

    template_path = all_templates[int(choice) - 1]
    print(f"\n📄 {template_path.name}")
    print("="*60)
    with open(template_path, 'r') as f:
        content = f.read()

    if len(content) > preview_limit:
        print(content[:preview_limit])
        print(f"\n... (truncated, showing {preview_limit}/{len(content)} chars)")
    else:
        print(content)


def compare_session_templates(session_dir: Path) -> None:
    """Compare two saved versions of one template type from a session.

    Prompts for the template type (general or specific), lists the matching
    ``iteration_*_<type>.md`` files in iteration order, prompts for two of
    them and prints the result of ``compare_templates`` on their contents.

    Args:
        session_dir: Session folder containing a ``templates`` directory.
    """
    def _iteration_order(path: Path):
        # Numeric iteration first so iteration_10 sorts after iteration_2.
        match = re.match(r"iteration_(\d+)", path.name)
        return (int(match.group(1)) if match else -1, path.name)

    def _ask_for_template(prompt: str):
        # Returns the chosen Path, or None for anything outside 1..len(templates).
        # isdecimal() (unlike isdigit()) only accepts text that int() can parse.
        raw = input(prompt).strip()
        if raw.isdecimal() and 1 <= int(raw) <= len(templates):
            return templates[int(raw) - 1]
        return None

    templates_dir = session_dir / "templates"

    # Let user choose template type
    print("\nTemplate type to compare:")
    print("1. General templates")
    print("2. Specific templates")

    type_choice = input("Your choice (1-2): ").strip()
    template_type = {"1": "general", "2": "specific"}.get(type_choice)
    if template_type is None:
        print("Invalid choice.")
        return

    # Find all versions of this template type
    templates = sorted(
        templates_dir.glob(f"iteration_*_{template_type}.md"),
        key=_iteration_order,
    )

    if len(templates) < 2:
        print(f"Not enough {template_type} template versions to compare.")
        return

    print(f"\nAvailable {template_type} template versions:")
    for i, template in enumerate(templates, 1):
        print(f"{i}. {template.name}")

    # Each answer is validated before the next prompt is shown.
    template1 = _ask_for_template("\nSelect first template number: ")
    if template1 is None:
        print("Invalid choice.")
        return

    template2 = _ask_for_template("Select second template number: ")
    if template2 is None:
        print("Invalid choice.")
        return

    with open(template1, 'r') as f:
        content1 = f.read()
    with open(template2, 'r') as f:
        content2 = f.read()

    print(f"\n📊 Comparing {template1.name} vs {template2.name}")
    print("="*60)
    print(compare_templates(content1, content2))


def export_final_templates(session_dir: Path) -> None:
    """Copy the latest general and specific template of a session.

    The highest-numbered ``iteration_*_general.md`` and
    ``iteration_*_specific.md`` in ``session_dir / "templates"`` are copied
    into ``exported_templates/`` (created if needed, relative to the current
    working directory) as ``<client>_<kind>_<timestamp>.md``, where
    ``<client>`` is the part of the session folder name before the first
    underscore.

    Args:
        session_dir: Session folder containing a ``templates`` directory.
    """
    def _iteration_order(path: Path):
        # Numeric iteration first: a plain name sort ranks iteration_9 above
        # iteration_10 and would export a stale template.
        match = re.match(r"iteration_(\d+)", path.name)
        return (int(match.group(1)) if match else -1, path.name)

    templates_dir = session_dir / "templates"

    # Candidates for each template kind, in export order.
    candidates_by_kind = (
        ("general", "General", list(templates_dir.glob("iteration_*_general.md"))),
        ("specific", "Specific", list(templates_dir.glob("iteration_*_specific.md"))),
    )

    if not any(candidates for _, _, candidates in candidates_by_kind):
        print("No templates found to export.")
        return

    # Create export directory
    export_dir = Path("exported_templates")
    export_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    client_name = session_dir.name.split('_')[0]

    exported = []
    for kind, label, candidates in candidates_by_kind:
        if not candidates:
            continue
        latest = max(candidates, key=_iteration_order)
        export_path = export_dir / f"{client_name}_{kind}_{timestamp}.md"
        # Byte-for-byte copy: independent of text encoding and newline style.
        export_path.write_bytes(latest.read_bytes())
        exported.append(f"{label}: {export_path.name}")

    print(f"\n✅ Templates exported to {export_dir}:")
    for item in exported:
        print(f"  - {item}")


def get_template_evolution_report(session_name: str) -> None:
    """Print how the templates changed over the iterations of a session.

    Loads every ``iteration_*.json`` in ``sessions/<session_name>`` in
    iteration order and prints, per iteration, its timestamp, hallucination
    count, which templates were updated, the user feedback (first 50
    characters) and the decision, followed by summary statistics.

    Args:
        session_name: Folder name under ``sessions/``.
    """
    def _iteration_order(path: Path):
        # Numeric iteration first: a plain name sort places iteration_10
        # before iteration_2, scrambling the timeline and the "final" entry.
        match = re.match(r"iteration_(\d+)", path.name)
        return (int(match.group(1)) if match else -1, path.name)

    def _changed(iter_data, kind: str) -> bool:
        # Tolerates a missing or null 'template_changed_this_iteration'.
        return bool((iter_data.get('template_changed_this_iteration') or {}).get(kind))

    feedback_preview_chars = 50
    session_dir = Path("sessions") / session_name

    if not session_dir.exists():
        print(f"Session not found: {session_name}")
        return

    print(f"\n📊 TEMPLATE EVOLUTION REPORT")
    print(f"Session: {session_name}")
    print("="*60)

    # Read all iteration files
    iterations = []
    for iter_file in sorted(session_dir.glob("iteration_*.json"), key=_iteration_order):
        with open(iter_file, 'r') as f:
            iterations.append(json.load(f))

    if not iterations:
        print("No iteration data found.")
        return

    # Track template changes
    print("\n📈 Template Change Timeline:")
    for iter_data in iterations:
        iter_num = iter_data['iteration']
        print(f"\nIteration {iter_num}:")
        print(f"  - Timestamp: {iter_data['timestamp']}")
        print(f"  - Hallucinations: {iter_data['hallucinations_count']}")

        if _changed(iter_data, 'general'):
            print(f"  - ✏️  General template updated")
        if _changed(iter_data, 'specific'):
            print(f"  - ✏️  Specific template updated")

        feedback = iter_data.get('user_feedback')
        if feedback:
            # Only mark the text as truncated when it really was.
            if len(feedback) > feedback_preview_chars:
                feedback = feedback[:feedback_preview_chars] + "..."
            print(f"  - 💬 User feedback: {feedback}")

        if iter_data.get('user_decision'):
            print(f"  - 🎯 Decision: {iter_data['user_decision']}")

    # Summary statistics
    total_iterations = len(iterations)
    general_changes = sum(1 for i in iterations if _changed(i, 'general'))
    specific_changes = sum(1 for i in iterations if _changed(i, 'specific'))

    print(f"\n📊 Summary Statistics:")
    print(f"  - Total iterations: {total_iterations}")
    print(f"  - General template changes: {general_changes}")
    print(f"  - Specific template changes: {specific_changes}")
    print(f"  - Final hallucinations: {iterations[-1]['hallucinations_count']}")


# Quick access functions
def list_recent_sessions_with_templates() -> None:
    """List recent sessions whose summary records template updates.

    Looks at the first 20 entries of ``sessions/`` in reverse name order,
    keeps the directories whose ``session_summary.txt`` contains both
    ``Template Evolution:`` and ``Updated``, and prints up to ten of them
    together with every summary line mentioning an updated template.
    """
    sessions_dir = Path("sessions")
    max_scanned = 20
    max_listed = 10

    print("\n📁 RECENT SESSIONS WITH TEMPLATE CHANGES")
    print("="*60)

    # A missing sessions/ folder just means nothing has been run yet.
    if sessions_dir.is_dir():
        recent_entries = sorted(sessions_dir.iterdir(), reverse=True)[:max_scanned]
    else:
        recent_entries = []

    # Keep the summary text alongside the folder so each file is read once.
    sessions_with_changes = []
    for session_dir in recent_entries:
        summary_file = session_dir / "session_summary.txt"
        if not (session_dir.is_dir() and summary_file.exists()):
            continue
        with open(summary_file, 'r') as f:
            content = f.read()
        if "Template Evolution:" in content and "Updated" in content:
            sessions_with_changes.append((session_dir, content))

    if not sessions_with_changes:
        print("No sessions with template changes found.")
        return

    for i, (session, content) in enumerate(sessions_with_changes[:max_listed], 1):
        print(f"{i}. {session.name}")

        # Show brief template change info
        for line in content.splitlines():
            if "Updated" in line and "template" in line:
                print(f"   {line.strip()}")

In [None]:
# Cell 8: Main Interactive Demo

print("🚀 AI LETTER GENERATION - INTERACTIVE MODE")
print("=" * 60)

# Client menu, emitted in one write with the same text as line-by-line prints.
print("\n".join([
    "\nSelect a client:",
    "1. Client A",
    "2. Client B",
    "3. Client C",
    "4. Client E",
]))

client_choice = input("\nEnter choice (1-4): ").strip()

if client_choice not in ("1", "2", "3", "4"):
    print("❌ Invalid choice. Please run the cell again.")
else:
    # Menu number -> (case-folder suffix, display name)
    client_map = {
        "1": ("A", "Client A"),
        "2": ("B", "Client B"),
        "3": ("C", "Client C"),
        "4": ("E", "Client E")
    }

    folder_suffix, client_name = client_map[client_choice]
    case_folder = "data/case-files/" + folder_suffix

    print(f"\n✅ Selected: {client_name}")

    try:
        # The workflow itself drives drafting, evaluation and review.
        final_state = run_interactive_session(
            case_folder,
            "annual_review",
            max_iterations=5,
        )

        print("\n🎉 Session complete!")
        print(f"📂 All files saved in: {final_state['session_dir']}")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        print("\n".join([
            "\nTroubleshooting:",
            "• Check your GOOGLE_API_KEY in .env",
            "• Verify case files exist in the folder",
            "• Ensure all dependencies are installed",
        ]))

In [None]:
# Cell 10: Simple Test Demo
# Run this cell to test the reworked chain execution

# Smoke test: checks the API connection, then exercises each pipeline
# component once (template loading, case loading, fact extraction, drafting,
# simple evaluation and, when facts were extracted, enhanced evaluation).
print("🧪 TESTING REWORKED WORKFLOW")
print("="*60)

# First test API
print("\n1️⃣ Testing API connection...")
if not test_api():
    print("❌ API test failed! Please check your setup.")
else:
    print("✅ API is working!")

    # Quick workflow test
    print("\n2️⃣ Testing workflow components...")
    try:
        # Test template reading
        general = read_general_template()
        specific = read_specific_template("annual_review")
        print(f"✅ Templates loaded: general ({len(general)} chars), specific ({len(specific)} chars)")

        # Test case reading
        case_content = read_case_folder("data/case-files/A")
        print(f"✅ Case content loaded: {len(case_content)} chars")

        # Test case fact extraction
        print("\n3️⃣ Testing case fact extraction...")
        case_facts = extract_case_facts_once(case_content[:5000], "data/case-files/A")  # Use first 5000 chars for speed
        print(f"✅ Case facts extracted: {len(case_facts)} facts")
        if case_facts:
            print("\nSample facts:")
            for fact in case_facts[:3]:
                print(f"  - {fact.fact_type}: {fact.content[:80]}...")

        # Test letter generation
        print("\n4️⃣ Testing letter generation...")
        test_letter = draft_letter(general, specific, case_content)
        print(f"✅ Letter generated: {len(test_letter)} chars")

        # Test simple evaluation
        print("\n5️⃣ Testing simple evaluation...")
        evaluation = evaluate_letter(test_letter, case_content, general, specific)
        print(f"✅ Simple evaluation complete:")
        print(f"   - Hallucinations: {len(evaluation.get('hallucinations', []))}")
        print(f"   - Template needing work: {evaluation.get('template_needing_improvement', 'unknown')}")

        # Test enhanced evaluation (needs extracted facts to check against)
        if case_facts:
            print("\n6️⃣ Testing enhanced evaluation...")
            enhanced_eval = run_enhanced_evaluation(test_letter[:3000], case_facts)  # Use first 3000 chars for speed
            print(f"✅ Enhanced evaluation complete:")
            print(f"   - Facts checked: {enhanced_eval.get('facts_checked', 0)}")
            print(f"   - Score: {enhanced_eval.get('score', 'N/A')}%")
            print(f"   - Hallucinations: {len(enhanced_eval.get('hallucinations', []))}")

        print("\n✅ ALL COMPONENTS WORKING!")
        # The interactive demo is labelled Cell 8 in this notebook; the old
        # hint pointed at a Cell 11 that does not exist.
        print("\n💡 You can now run Cell 8 for the full interactive experience")

    except Exception as e:
        print(f"\n❌ Component test failed: {e}")
        import traceback
        traceback.print_exc()