In [1]:
# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.1
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# # Long-Form Creative Writing Benchmark - Results Viewer & Metrics Calculator

# ## Imports and Setup

import json
import pandas as pd
import numpy as np
from IPython.display import HTML, display
from collections import defaultdict
import re
import os
import sys
import statistics as stats
import html # For escaping text in slop profile
from typing import Dict, List, Any, Optional, Tuple, Union
# --- ADDED: import for distance computation ---
from scipy.spatial.distance import pdist, squareform

# --- Add core directory to Python path ---
# Assumes the notebook is in the 'longform-creative-writing' directory
NOTEBOOK_DIR = os.path.dirname(os.path.abspath("__file__")) # Get current dir
CORE_DIR = os.path.join(NOTEBOOK_DIR, 'core')
#if CORE_DIR not in sys.path:
#    sys.path.insert(0, CORE_DIR)
#    print(f"Added to sys.path: {CORE_DIR}")

# --- Import metrics functions ---
#from core.metrics import calculate_complexity_index, calculate_slop_index_new, get_multi_prompt_ngrams, get_top_repetitive_words
from core.metrics import calculate_slop_index_new
try:
    # Import specific functions needed
    from core.metrics import (
        calculate_complexity_index,
        calculate_slop_index_new, # Use the newer slop index
        get_multi_prompt_ngrams,
        get_top_repetitive_words
    )

    metrics_available = True
    print("Successfully imported metrics functions from core.metrics")
except ImportError as e:
    print(f"Error importing metrics from core.metrics: {e}", file=sys.stderr)
    print("Metrics calculation will be skipped.", file=sys.stderr)
    metrics_available = False
    # Define dummy functions if import fails
    def calculate_complexity_index(text: str) -> float: return -1.0
    def calculate_slop_index_new(text: str, debug: bool = False) -> float: return -1.0
    def get_multi_prompt_ngrams(prompts_data: Dict[str, List[str]], n: int, top_k: int = 20, min_prompt_ids: int = 2) -> List[Tuple[Tuple[str, ...], int]]: return []
    def get_top_repetitive_words(texts_with_ids: List[Tuple[str, str]], top_n: int = 20, min_repetition_count: int = 5, min_prompt_ids: int = 2) -> List[Tuple[str, float]]: return []


# ## Configuration

# --- File Paths ---
# Assumes 'results' and 'data' directories are siblings of the notebook or configured relative paths
RESULTS_DIR = os.path.join(NOTEBOOK_DIR, "results")
DATA_DIR = os.path.join(NOTEBOOK_DIR, "data")
RUNS_FILE = "longform_bench_runs.json"
#RUNS_FILE = "longform_bench_runs_llama4_testing.json"
# Using a generic name for the output file containing metrics
ELO_RESULTS_WITH_METRICS_FILE = os.path.join(RESULTS_DIR, "longform_results_with_metrics.json")

# Global dictionary for criteria weights (lowercase keys for case-insensitive matching)
try:
    with open(os.path.join(DATA_DIR,'criteria_weights.json'), 'r') as f:
        CRITERIA_WEIGHTS = json.load(f)
except FileNotFoundError:
    print(f"Warning: criteria_weights.json not found in {DATA_DIR}. Using default weight 1.0.")
    CRITERIA_WEIGHTS = {}
except json.JSONDecodeError:
    print(f"Warning: Error decoding criteria_weights.json. Using default weight 1.0.")
    CRITERIA_WEIGHTS = {}


# Default weight for criteria not in the dictionary
DEFAULT_WEIGHT = 1.0

# --- Model Filtering/Naming ---
MODELS_TO_IGNORE = [
    # Add any models you want to exclude from reports and metrics
    'mistralai/ministral-3b',
    'ministral-3b'
]

# Consistent model name substitutions for display purposes
MODEL_NAME_SUBS = {
    'deepseek/deepseek-r1': 'deepseek-ai/DeepSeek-R1',
    'deepseek/deepseek-chat-v3-0324': 'deepseek-ai/DeepSeek-V3-0324',
    'anthropic/claude-3.5-sonnet': 'claude-3-5-sonnet-20241022',
    'openai/chatgpt-4o-latest': 'chatgpt-4o-latest-2025-03-27',
    'anthropic/claude-3.7-sonnet': 'claude-3-7-sonnet-20250219',
    'openai/gpt-4.5-preview': 'gpt-4.5-preview',
    'cohere/command-a': 'CohereForAI/c4ai-command-a-03-2025',
    'anthropic/claude-3.5-haiku': 'claude-3-5-haiku-20241022',
    'google/gemini-2.0-flash-001': 'gemini-2.0-flash-001',
    'openai/gpt-4o-mini': 'gpt-4o-mini',
    'mistralai/mistral-nemo': 'mistralai/Mistral-Nemo-Instruct-2407',
    'mistralai/mistral-small-3.1-24b-instruct': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
    'mistralai/mistral-small-24b-instruct-2501': 'mistralai/Mistral-Small-24B-Instruct-2501',
    'mistralai/ministral-3b': 'ministral-3b',
    'rekaai/reka-flash-3:free': 'RekaAI/reka-flash-3',
    'google/gemini-2.5-pro-preview-03-25': 'gemini-2.5-pro-preview-03-25',
    'openrouter/quasar-alpha': 'quasar-alpha',
    'openrouter/optimus-alpha': 'optimus-alpha',
    'meta-llama/llama-4-scout': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
    'meta-llama/llama-4-maverick': 'meta-llama/Llama-4-Maverick-17B-128E-Instruct',
    'x-ai/grok-3-beta': 'grok-3-beta',
    'x-ai/grok-3-mini-beta': 'grok-3-mini-beta',
    'openai/gpt-4.1-mini': 'gpt-4.1-mini',
    'openai/gpt-4.1': 'gpt-4.1',
    'openai/gpt-4.1-nano': 'gpt-4.1-nano',
    'google/gemini-2.5-flash-preview': 'gemini-2.5-flash-preview',
    'thudm/glm-4-32b': 'THUDM/GLM-4-32B-0414',
    "qwen/qwen3-4b:free": "qwen/qwen3-4b",
    "qwen/qwen3-1.7b:free": "qwen/qwen3-1.7b",
    "openai/o4-mini": "o4-mini",
    "google/gemini-2.5-pro-preview": "gemini-2.5-pro-preview-2025-05-07",
    "mistralai/mistral-medium-3": "mistral-medium-3",
    "anthropic/claude-opus-4": "claude-opus-4",
    "anthropic/claude-sonnet-4": "claude-sonnet-4",
}

# --- Scoring Configuration (Should match .env used for the run) ---
SCORE_RANGE_MIN = 0
SCORE_RANGE_MAX = 20
NUM_CHAPTERS = int(os.getenv("NUM_CHAPTERS", 8)) # Load from env if set, else default
# Ensure FINAL_SCORE_WEIGHT is loaded or defaulted correctly
try:
    FINAL_SCORE_WEIGHT = float(os.getenv("FINAL_SCORE_WEIGHT", 4.0))
except ValueError:
    FINAL_SCORE_WEIGHT = 4.0 # Default if env var is invalid
    print("Warning: Invalid FINAL_SCORE_WEIGHT in environment. Using default: 4.0")

NUM_PLANNING_STEPS = 5
TOTAL_STEPS = NUM_PLANNING_STEPS + NUM_CHAPTERS
FIRST_CHAPTER_STEP_INDEX = NUM_PLANNING_STEPS + 1

# --- Helper Functions ---

def load_json_file(file_path: str) -> Dict:
    """Load data from a JSON file."""
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {file_path}")
        return {}
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return {}

def load_text_file(filepath: str, base_dir: str) -> Optional[str]:
    """Loads text content from a file within the base directory."""
    full_path = os.path.join(base_dir, filepath)
    if not os.path.exists(full_path):
        print(f"Warning: Required data file not found: {full_path}")
        return None
    try:
        with open(full_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Warning: Error reading file {full_path}: {e}")
        return None

def load_list_from_file(filepath: str, base_dir: str) -> List[str]:
    """Loads a list of strings from a file, one item per line."""
    content = load_text_file(filepath, base_dir)
    if content is None:
        return []
    return [line.strip() for line in content.splitlines() if line.strip()]

def sanitize_model_name(model_name: str) -> str:
    """Sanitize model name for use in filenames."""
    sanitized = model_name.replace("/", "__")
    unsafe_chars = r'<>:"|?*\\'
    for char in unsafe_chars:
        sanitized = sanitized.replace(char, '-')
    # Remove leading/trailing underscores/hyphens
    sanitized = sanitized.strip('_-')
    return sanitized

def get_updated_model_name(original: str) -> str:
    """Applies model name substitutions."""
    return MODEL_NAME_SUBS.get(original, original)

# --- Scoring Logic (Simplified version from core/scoring.py for sorting) ---
# Load negative criteria needed for scoring
neg_criteria_chapter = load_list_from_file("longform_negative_criteria_chapter.txt", DATA_DIR)
neg_criteria_final = load_list_from_file("longform_negative_criteria_final.txt", DATA_DIR)

def invert_if_negative(metric: str, score: float, negative_criteria: List[str]) -> float:
    """Inverts score if metric is negative."""
    normalized_metric = metric.lower().strip()
    normalized_neg_criteria = [nc.lower().strip() for nc in negative_criteria]
    if normalized_metric in normalized_neg_criteria:
        # Ensure score is within range before inverting
        score = max(SCORE_RANGE_MIN, min(SCORE_RANGE_MAX, score))
        return SCORE_RANGE_MAX - score
    return score

def calculate_task_overall_score(task_data: Dict[str, Any]) -> Optional[float]:
    """
    Calculates the weighted average score for a single task dictionary,
    factoring in multiple final judgments (final_judge_scores as a list).
    Returns the final combined score in [0..SCORE_RANGE_MAX], or None if no scores.
    Applies negative criteria inversion and special weighting for 'forced poetry'.
    """
    chapter_scores_raw = task_data.get("chapter_judge_scores", {})
    final_scores_list = task_data.get("final_judge_scores", [])  # Expecting a List[Dict[str, float]]

    # --- Calculate Chapter Average ---
    valid_chapter_scores = [] # Stores the weighted average score for each chapter
    for chap_num_str, chap_scores_dict in chapter_scores_raw.items():
        if isinstance(chap_scores_dict, dict) and chap_scores_dict:
            chapter_weighted_sum = 0.0
            total_weight = 0.0
            for metric, value in chap_scores_dict.items():
                if isinstance(value, (int, float)):
                    processed_value = invert_if_negative(metric, value, neg_criteria_chapter)
                    weight = CRITERIA_WEIGHTS.get(metric.lower().strip(), DEFAULT_WEIGHT)
                    # Apply special power scaling for 'forced poetry or metaphor'
                    if metric.lower() == 'forced poetry or metaphor':
                        # Scale score to 0-1, apply power, scale back to 0-20
                        processed_value = ((processed_value / SCORE_RANGE_MAX) ** 1.7) * SCORE_RANGE_MAX

                    chapter_weighted_sum += processed_value * weight
                    total_weight += weight
            if total_weight > 0:
                valid_chapter_scores.append(chapter_weighted_sum / total_weight)

    avg_chapter_score = stats.mean(valid_chapter_scores) if valid_chapter_scores else None

    # --- Calculate Final Average (across multiple final_judge_scores) ---
    final_scores_averages = [] # Stores the weighted average score for each final judgment entry
    if final_scores_list and isinstance(final_scores_list, list):
        for one_final_dict in final_scores_list:
            if one_final_dict and isinstance(one_final_dict, dict):
                final_weighted_sum = 0.0
                total_weight = 0.0
                for metric, value in one_final_dict.items():
                    if isinstance(value, (int, float)):
                        processed_value = invert_if_negative(metric, value, neg_criteria_final)
                        weight = CRITERIA_WEIGHTS.get(metric.lower().strip(), DEFAULT_WEIGHT)
                        # Apply special power scaling for 'forced poetry or metaphor'
                        if metric.lower() == 'forced poetry or metaphor':
                            processed_value = ((processed_value / SCORE_RANGE_MAX) ** 1.7) * SCORE_RANGE_MAX

                        final_weighted_sum += processed_value * weight
                        total_weight += weight
                if total_weight > 0:
                    final_scores_averages.append(final_weighted_sum / total_weight)

    avg_final_score = stats.mean(final_scores_averages) if final_scores_averages else None

    # --- Combine chapter average + final average ---
    if avg_chapter_score is None and avg_final_score is None:
        return None

    total_score = 0.0
    total_wt = 0.0

    # Weight chapter average by the number of chapters that were actually scored
    if avg_chapter_score is not None and valid_chapter_scores:
        # Use NUM_CHAPTERS as the intended weight, assuming all chapters contribute equally
        # If only some chapters were scored, the average `avg_chapter_score` reflects those scored.
        # The weight should reflect the intended contribution of chapters vs final score.
        # Using NUM_CHAPTERS aligns with the idea that chapters collectively have a weight relative to the final score weight.
        total_score += avg_chapter_score * NUM_CHAPTERS
        total_wt += NUM_CHAPTERS

    if avg_final_score is not None:
        total_score += avg_final_score * FINAL_SCORE_WEIGHT
        total_wt += FINAL_SCORE_WEIGHT

    if total_wt == 0:
        return None

    # Final score is the weighted average, clamped to the score range
    final_combined_score = total_score / total_wt
    return max(SCORE_RANGE_MIN, min(SCORE_RANGE_MAX, final_combined_score))


# --- Dynamic Prompt Loading Logic (from core/benchmark.py adaptation) ---
def load_prompt_templates(data_dir: str) -> Dict[int, str]:
    """Loads planning and dynamically generated chapter prompts."""
    prompt_templates = {}
    required_files_found = True

    # Load planning prompts (1-5)
    for i in range(1, NUM_PLANNING_STEPS + 1):
        filename = f"prompt{i}.txt"
        template_content = load_text_file(filename, data_dir)
        if template_content is None: required_files_found = False
        else: prompt_templates[i] = template_content

    # Load chapter prompts
    if NUM_CHAPTERS > 0:
        first_chap_file = "prompt_chapter_first.txt"
        first_chap_content = load_text_file(first_chap_file, data_dir)
        if first_chap_content is None: required_files_found = False
        else: prompt_templates[FIRST_CHAPTER_STEP_INDEX] = first_chap_content

        intermediate_chap_file = "prompt_chapter_intermediate.txt"
        intermediate_chap_template = load_text_file(intermediate_chap_file, data_dir)
        if intermediate_chap_template is None and NUM_CHAPTERS > 1: # Intermediate needed if more than 1 chapter
             # Check if last chapter template is same as intermediate (can happen if NUM_CHAPTERS=1 setup)
             last_chap_file_check = "prompt_chapter_last.txt"
             last_chap_content_check = load_text_file(last_chap_file_check, data_dir)
             if last_chap_content_check != intermediate_chap_template: # Only error if distinct intermediate file is missing AND needed
                 print(f"Error: Intermediate chapter template file missing and needed: {os.path.join(data_dir, intermediate_chap_file)}")
                 required_files_found = False


        last_chap_file = "prompt_chapter_last.txt"
        last_chap_template = load_text_file(last_chap_file, data_dir)
        if last_chap_template is None: required_files_found = False
        elif TOTAL_STEPS >= FIRST_CHAPTER_STEP_INDEX: # Ensure last step index is valid
            try:
                # Format with chapter_number=NUM_CHAPTERS unless it's the very first chapter (step FIRST_CHAPTER_STEP_INDEX)
                chap_num_for_last = NUM_CHAPTERS if TOTAL_STEPS > FIRST_CHAPTER_STEP_INDEX else 1
                prompt_templates[TOTAL_STEPS] = last_chap_template.format(chapter_number=chap_num_for_last)
            except KeyError:
                print(f"Error: Placeholder '{{chapter_number}}' missing in {last_chap_file}.")
                required_files_found = False
        else:
             print("Warning: NUM_CHAPTERS is 0 or invalid, cannot load last chapter prompt.")


        if intermediate_chap_template is not None:
            # Intermediate steps go from FIRST + 1 up to TOTAL - 1
            for step_num in range(FIRST_CHAPTER_STEP_INDEX + 1, TOTAL_STEPS):
                chapter_number = step_num - NUM_PLANNING_STEPS
                try:
                    prompt_templates[step_num] = intermediate_chap_template.format(chapter_number=chapter_number)
                except KeyError:
                    print(f"Error: Placeholder '{{chapter_number}}' missing in {intermediate_chap_file}.")
                    required_files_found = False; break

    if not required_files_found:
        raise FileNotFoundError("One or more required prompt template files were missing or invalid.")

    print(f"Loaded {len(prompt_templates)} prompt templates for {TOTAL_STEPS} steps ({NUM_PLANNING_STEPS} planning, {NUM_CHAPTERS} chapters).")
    return prompt_templates

# Load prompts once globally for the notebook session
try:
    PROMPT_TEMPLATES = load_prompt_templates(DATA_DIR)
except FileNotFoundError as e:
    print(f"\nCRITICAL ERROR: Cannot load prompt templates. Reports cannot be generated. {e}")
    PROMPT_TEMPLATES = {} # Set to empty dict to avoid crashing later calls

# --- NEW HELPER: Calculate Per-Task Chapter/Final Averages (Scaled 0-100) ---
def calculate_task_chapter_final_averages_scaled(task_data: Dict[str, Any]) -> Tuple[Union[str, float], Union[str, float]]:
    """
    Calculates the average chapter item score and average final item score
    for a SINGLE task, scaled to 0-100. Applies negative criteria inversion
    and special 'forced poetry' weighting.
    Returns (scaled_avg_chap_score, scaled_avg_final_score) or "N/A".
    """
    chapter_item_averages = [] # Stores the average item score for each chapter in this task
    chapter_scores_raw = task_data.get("chapter_judge_scores", {})
    for chap_num_str, chap_scores_dict in chapter_scores_raw.items():
        if isinstance(chap_scores_dict, dict) and chap_scores_dict:
            processed_item_scores = []
            for metric, value in chap_scores_dict.items():
                if isinstance(value, (int, float)):
                    processed_value = invert_if_negative(metric, value, neg_criteria_chapter)
                    # Apply special power scaling for 'forced poetry or metaphor'
                    if metric.lower() == 'forced poetry or metaphor':
                        processed_value = ((processed_value / SCORE_RANGE_MAX) ** 1.7) * SCORE_RANGE_MAX
                    processed_item_scores.append(processed_value)
            if processed_item_scores:
                chapter_item_averages.append(stats.mean(processed_item_scores))

    # Average of the per-chapter item averages for this task
    avg_chapter_score_0_20 = stats.mean(chapter_item_averages) if chapter_item_averages else None

    final_item_averages = [] # Stores the average item score for each final judgment entry in this task
    final_scores_list = task_data.get("final_judge_scores", [])
    if final_scores_list and isinstance(final_scores_list, list):
        for one_final_dict in final_scores_list:
             if one_final_dict and isinstance(one_final_dict, dict):
                processed_item_scores = []
                for metric, value in one_final_dict.items():
                    if isinstance(value, (int, float)):
                        processed_value = invert_if_negative(metric, value, neg_criteria_final)
                        # Apply special power scaling for 'forced poetry or metaphor'
                        if metric.lower() == 'forced poetry or metaphor':
                            processed_value = ((processed_value / SCORE_RANGE_MAX) ** 1.7) * SCORE_RANGE_MAX
                        processed_item_scores.append(processed_value)
                if processed_item_scores:
                    final_item_averages.append(stats.mean(processed_item_scores))

    # Average of the per-final-judgment item averages for this task
    avg_final_score_0_20 = stats.mean(final_item_averages) if final_item_averages else None

    # Scale to 0-100
    scale_factor = 100.0 / SCORE_RANGE_MAX
    scaled_avg_chap = round(avg_chapter_score_0_20 * scale_factor, 1) if avg_chapter_score_0_20 is not None else "N/A"
    scaled_avg_final = round(avg_final_score_0_20 * scale_factor, 1) if avg_final_score_0_20 is not None else "N/A"

    return scaled_avg_chap, scaled_avg_final

# ## Report Generation Function

# +
def generate_model_report(model_name: str, run_key: Optional[str] = None, save_to_file: bool = False) -> HTML:
    """
    Generate an HTML report for a specific model's long-form writing task,
    including theme/font selection, dark mode, and step-by-step display.
    Iterations, planning steps, and chapter judge sections are collapsed by default.
    Includes per-task average chapter/final scores in the iteration header.

    Args:
        model_name: The name of the model to generate the report for.
        run_key: Optional specific run key to use. If None, uses the latest run for the model.
        save_to_file: Whether to save the report to an HTML file in RESULTS_DIR.

    Returns:
        An IPython.display.HTML object containing the report.
    """
    if not PROMPT_TEMPLATES:
         return HTML("<h2>Error: Prompt templates could not be loaded. Cannot generate report.</h2>")

    # --- Data Loading ---
    print(f"Generating report for model: {model_name}" + (f" (Run: {run_key})" if run_key else " (Latest Run)"))
    runs_data = load_json_file(RUNS_FILE)

    if not runs_data:
        return HTML(f"<h2>Error: Could not load runs data from {RUNS_FILE}</h2>")

    # Find the correct run key if not provided
    if run_key is None:
        matching_runs = [k for k, v in runs_data.items() if v.get("test_model") == model_name]
        if not matching_runs:
            return HTML(f"<h2>No runs found for model: {model_name}</h2>")
        # Sort runs by key (often includes timestamp or UUID part) to get latest
        matching_runs.sort()
        run_key = matching_runs[-1]
        print(f"Using latest run key found: {run_key}")

    if run_key not in runs_data:
        return HTML(f"<h2>Run key not found: {run_key}</h2>")

    run_data = runs_data[run_key]
    original_model_name = run_data.get("test_model", model_name)
    display_model_name = get_updated_model_name(original_model_name)

    longform_tasks_all_iters = run_data.get("longform_tasks", {})
    if not longform_tasks_all_iters:
        return HTML(f"<h2>No 'longform_tasks' data found for run: {run_key}</h2>")

    # --- Data Processing: Calculate scores and sort iterations ---
    iterations_data = []
    for iter_idx_str, prompts_dict in longform_tasks_all_iters.items():
        # In long-form, each 'prompt_id' under an iteration is a full task run
        for prompt_id, task_data in prompts_dict.items():
            if task_data.get("status") not in ["completed", "judged", "error"]: # Include errored tasks to show them
                 print(f"Skipping task {prompt_id} iter {iter_idx_str} with status {task_data.get('status')}")
                 continue

            overall_score = calculate_task_overall_score(task_data) # Uses weighted logic
            iterations_data.append({
                "iter_idx": int(iter_idx_str),
                "prompt_id": prompt_id,
                "writing_prompt": task_data.get("writing_prompt", "N/A"),
                "task_data": task_data,
                "overall_score_0_20": overall_score,
                # Scale score to 0-100 for display consistency
                "overall_score_0_100": round(overall_score * (100.0 / SCORE_RANGE_MAX), 1) if overall_score is not None else "N/A"
            })

    # Sort iterations primarily by index
    iterations_data.sort(key=lambda x: x["iter_idx"])

    # --- HTML Generation ---
    # (CSS is mostly identical, added small style for judge-header)
    html_output = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Longform Outputs: {display_model_name}</title>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style>
            /* --- PASTE THE FULL CSS FROM THE PREVIOUS CREATIVE WRITING NOTEBOOK HERE --- */
            /* Lora (Used for Cozy Headers) */
            @import url('https://fonts.googleapis.com/css2?family=Lora:ital,wght@0,400..700;1,400..700&display=swap');
            /* Merriweather (Used for Modern Headers - fallback) */
            @import url('https://fonts.googleapis.com/css2?family=Merriweather:ital,wght@0,300;0,400;0,700;1,300;1,400;1,700&display=swap');

            :root {{
                --theme-name: 'cozy';
                --font-body-cozy: 'Tiempos Text', Georgia, serif;
                --font-heading-cozy: 'Lora', serif;
                --font-body-modern: 'Inter', sans-serif;
                --font-heading-modern: 'Besley', 'Merriweather', serif;
                --font-ui: 'Lora', sans-serif;
                --font-body: var(--font-body-cozy);
                --font-heading: var(--font-heading-cozy);
                --bg-color: #fdfaf6;
                --text-color: #3a3a3a;
                --header-color: #5c4033;
                --subheader-color: #7a6a60;
                --border-color: #e0dcd1;
                --accent-border-color: #d3c0a5;
                --container-bg: #fffcf7;
                --iter-header-bg: #f5f0e8;
                --iter-header-hover-bg: #ede8de;
                --prompt-header-bg: #faf5ef; /* Changed for step header */
                --prompt-header-hover-bg: #f5f0e8; /* Changed for step header */
                --judge-bg: #f3f6f9;
                --judge-border: #c8d7e6;
                --judge-text: #555;
                --prompt-display-bg: #f9f6f0;
                --toggle-icon-color: #8a7a70;
                --shadow-color: rgba(0, 0, 0, 0.08);
                --link-color: #7a6a60;
                --link-hover-color: #5c4033;
                --select-text-color: var(--subheader-color);
                --select-chevron-color: var(--subheader-color);
                --select-bg: transparent;
                --select-border: none;
                --step-prompt-bg: #fefcf9; /* Slightly different bg for step prompt */
                --step-prompt-border: #e8e4da;
                --judge-header-bg: transparent; /* <<< CHANGE >>> Style for judge header */
                --judge-header-hover-bg: #f5f5f0; /* <<< CHANGE >>> Style for judge header */
                --judge-header-color: var(--subheader-color); /* <<< CHANGE >>> Style for judge header */
            }}
            body.theme-cozy.dark-mode {{
                --bg-color: #2a2527; --text-color: #fff9f2; --header-color: #f7eee0;
                --subheader-color: #e9dfd0; --border-color: #3e3936; --accent-border-color: #6a5349;
                --container-bg: #312c2e; --iter-header-bg: #342e2f; --iter-header-hover-bg: #413935;
                --prompt-header-bg: #312b2d; --prompt-header-hover-bg: #3a3234;
                --judge-bg: #2f3136; --judge-border: #4e4944; --judge-text: #fcf5eb;
                --prompt-display-bg: #302a2c; --toggle-icon-color: #c0b0a0; --shadow-color: #0c0705;
                --link-color: #d0bca8; --link-hover-color: #ebdac5;
                --select-text-color: var(--subheader-color); --select-chevron-color: var(--subheader-color);
                --step-prompt-bg: #353032; --step-prompt-border: #4a4441;
                --judge-header-hover-bg: #3f3a3c; /* <<< CHANGE >>> Dark judge header */
                --judge-header-color: var(--subheader-color); /* <<< CHANGE >>> Dark judge header */
            }}
            body.theme-modern {{
                --theme-name: 'modern';
                --font-body: var(--font-body-modern); --font-heading: var(--font-heading-modern);
                --bg-color: #ffffff; --text-color: #212529; --header-color: #000000;
                --subheader-color: #495057; --border-color: #dee2e6; --accent-border-color: #adb5bd;
                --container-bg: #ffffff; --iter-header-bg: #f8f9fa; --iter-header-hover-bg: #e9ecef;
                --prompt-header-bg: #ffffff; --prompt-header-hover-bg: #f8f9fa;
                --judge-bg: #f1f3f5; --judge-border: #ced4da; --judge-text: #343a40;
                --prompt-display-bg: #f8f9fa; --toggle-icon-color: #6c757d; --shadow-color: rgba(0, 0, 0, 0.1);
                --link-color: #007bff; --link-hover-color: #0056b3;
                --select-text-color: var(--subheader-color); --select-chevron-color: var(--subheader-color);
                --step-prompt-bg: #ffffff; --step-prompt-border: #e9ecef;
                --judge-header-hover-bg: #f1f3f5; /* <<< CHANGE >>> Modern judge header */
                --judge-header-color: var(--subheader-color); /* <<< CHANGE >>> Modern judge header */
            }}
            body.theme-modern.dark-mode {{
                --bg-color: #1a1a1a; --text-color: #e9ecef; --header-color: #ffffff;
                --subheader-color: #adb5bd; --border-color: #495057; --accent-border-color: #6c757d;
                --container-bg: #212529; --iter-header-bg: #343a40; --iter-header-hover-bg: #495057;
                --prompt-header-bg: #2c3034; --prompt-header-hover-bg: #343a40;
                --judge-bg: #343a40; --judge-border: #495057; --judge-text: #ced4da;
                --prompt-display-bg: #343a40; --toggle-icon-color: #adb5bd; --shadow-color: rgba(0, 0, 0, 0.3);
                --link-color: #69b1ff; --link-hover-color: #a8d1ff;
                --select-text-color: var(--subheader-color); --select-chevron-color: var(--subheader-color);
                --step-prompt-bg: #212529; --step-prompt-border: #343a40;
                --judge-header-hover-bg: #40454a; /* <<< CHANGE >>> Modern dark judge header */
                --judge-header-color: var(--subheader-color); /* <<< CHANGE >>> Modern dark judge header */
            }}
            body {{
                font-family: var(--font-body); line-height: 1.7; color: var(--text-color);
                background-color: var(--bg-color); max-width: 900px; margin: 30px auto;
                padding: 40px 50px; border: 1px solid var(--border-color);
                box-shadow: 0 5px 15px var(--shadow-color); transition: background-color 0.3s, color 0.3s, border-color 0.3s;
            }}
            h1, h2, h3, h4 {{
                font-family: var(--font-heading); color: var(--header-color); margin-top: 2em;
                margin-bottom: 0.8em; line-height: 1.3; transition: color 0.3s;
            }}
            h1 {{ text-align: center; font-size: 2.5em; border-bottom: 2px solid var(--accent-border-color); padding-bottom: 15px; margin-bottom: 1.5em; font-weight: 700; transition: border-color 0.3s; font-family: var(--font-ui) !important; }}
            h2 {{ font-size: 1.8em; font-weight: 700; }}
            h3 {{ font-size: 1.4em; font-style: italic; font-weight: 400; color: var(--subheader-color); }}
            h4 {{ font-size: 1.1em; font-weight: bold; color: var(--subheader-color); margin-top: 1.5em; margin-bottom: 0.5em; }} /* Style for step prompts/outputs and Final Eval */
            strong {{ font-weight: bold; color: var(--header-color); transition: color 0.3s; }}
            a {{ color: var(--link-color); text-decoration: none; transition: color 0.3s; }}
            a:hover {{ color: var(--link-hover-color); text-decoration: underline; }}
            .top-controls {{ display: flex; justify-content: space-between; align-items: center; margin-bottom: 20px; padding-bottom: 10px; border-bottom: 1px solid var(--border-color); transition: border-color 0.3s; font-family: var(--font-ui) !important; }}
            .back-button {{ font-family: var(--font-ui) !important; font-size: 1em; color: var(--select-text-color); transition: color 0.3s; }}
            .controls-right {{ display: flex; align-items: center; gap: 15px; }}
            body.theme-cozy.dark-mode {{ box-shadow: 0 5px 20px var(--shadow-color); background-image: linear-gradient(to bottom, #211f21, #232022); }}
            body.theme-cozy.dark-mode .iteration-container {{ box-shadow: 0 2px 8px #000000; border-color: var(--border-color); }}
            body.theme-cozy.dark-mode h1 {{ text-shadow: 0 1px 2px #000000; }}
            body.theme-cozy.dark-mode .content-block {{ border-color: var(--border-color); }}
            body.theme-cozy.dark-mode .prompt-text-display {{ border-left: 3px solid var(--accent-border-color); background-color: #362e2b; }}
            body.theme-cozy.dark-mode .scores-container {{ color: #b0a598; }}
            body.theme-modern {{ padding: 35px 45px; }}
            body.theme-modern h1 {{ font-weight: 600; border-bottom-width: 1px; }}
            body.theme-modern h2 {{ font-weight: 600; }}
            body.theme-modern h3 {{ font-weight: 500; font-style: normal; }}
            body.theme-modern .iteration-header {{ font-weight: 600; }}
            body.theme-modern .prompt-header {{ font-weight: 500; font-style: normal; }}
            body.theme-modern .prompt-text-display {{ border-left-width: 4px; border-radius: 3px; font-style: normal; }}
            body.theme-modern .judge-content {{ border-style: solid; border-width: 1px; }}
            body.theme-modern strong {{ font-weight: 600; }}
            .control-select-wrapper {{ position: relative; display: inline-block; }}
            .control-select {{ font-family: var(--font-ui) !important; font-size: 0.9em; color: var(--select-text-color); background-color: var(--select-bg); border: none; padding: 2px 5px 2px 18px; margin: 0; cursor: pointer; appearance: none; -webkit-appearance: none; -moz-appearance: none; transition: color 0.3s; border-radius: 0; }}
            .control-select:focus {{ outline: none; }}
            .control-select-wrapper::before {{ content: '▼'; font-size: 0.6em; color: var(--select-chevron-color); position: absolute; left: 5px; top: 50%; transform: translateY(-50%); pointer-events: none; transition: color 0.3s; }}
            .control-select option {{ background-color: var(--bg-color); color: var(--text-color); font-family: var(--font-ui); }}
            .mode-toggle {{ display: flex; align-items: center; font-family: var(--font-ui) !important; }}
            .mode-toggle .form-check-input {{ opacity: 0; width: 0; height: 0; position: absolute; }}
            .mode-toggle .form-check-label {{ font-family: var(--font-ui) !important; font-size: 0.9em; color: var(--subheader-color); cursor: pointer; transition: color 0.3s; user-select: none; padding: 2px 5px; }}
            .mode-toggle .form-check-label:hover {{ color: var(--link-hover-color); }}
            .iteration-container {{ margin: 30px 0; border: 1px solid var(--border-color); border-radius: 4px; overflow: hidden; background-color: var(--container-bg); box-shadow: 0 2px 5px rgba(0,0,0,0.05); transition: background-color 0.3s, border-color 0.3s, box-shadow 0.3s; }}
            .iteration-header {{ background: var(--iter-header-bg); padding: 12px 20px; cursor: pointer; position: relative; border-bottom: 1px solid var(--border-color); font-size: 1.2em; font-weight: 700; color: var(--header-color); transition: background-color 0.3s, border-color 0.3s, color 0.3s; }}
            .iteration-header:hover {{ background: var(--iter-header-hover-bg); }}
            .prompt-container {{ border-top: 1px dotted var(--border-color); padding-top: 15px; margin-top: 15px; transition: border-color 0.3s; }} /* Used for steps now */
            .prompt-container:first-child {{ border-top: none; margin-top: 0; padding-top: 0; }}
            .prompt-header {{ background: var(--prompt-header-bg); padding: 10px 20px; cursor: pointer; font-size: 1.1em; font-weight: 400; color: var(--subheader-color); transition: background-color 0.3s, color 0.3s; border-bottom: 1px solid var(--border-color); }} /* Style for collapsible section headers (Planning, Chapters) */
            .prompt-header:hover {{ background: var(--prompt-header-hover-bg); }}
            .content-block {{ padding: 15px 25px; border-top: 1px solid var(--border-color); background-color: var(--container-bg); transition: background-color 0.3s, border-color 0.3s; }}
            .response-content {{ white-space: pre-wrap; font-family: var(--font-body); font-size: 1.05em; line-height: 1.7; margin-bottom: 15px; color: var(--text-color); transition: color 0.3s; }}
            .judge-content {{ white-space: pre-wrap; font-family: var(--font-body); font-size: 1.0em; line-height: 1.6; background: var(--judge-bg); border: 1px dashed var(--judge-border); padding: 10px 15px; margin-top: 10px; border-radius: 3px; color: var(--judge-text); transition: background-color 0.3s, border-color 0.3s, color 0.3s; }}
             /* <<< CHANGE >>> Header for collapsible judge section */
            .judge-header {{ background: var(--judge-header-bg); padding: 6px 10px; margin-top: 10px; cursor: pointer; font-size: 0.95em; font-weight: normal; color: var(--judge-header-color); transition: background-color 0.3s, color 0.3s; border-radius: 3px 3px 0 0; border: 1px solid var(--border-color); border-bottom: none; }}
            .judge-header:hover {{ background: var(--judge-header-hover-bg); }}
            /* <<< CHANGE >>> Adjust judge content border when inside collapsible */
            .collapsible-judge-content .judge-content {{ margin-top: 0; border-radius: 0 0 3px 3px; border-top: none; }}
            .prompt-text-display {{ font-style: italic; color: var(--subheader-color); margin-bottom: 1em; padding: 10px 15px; background-color: var(--prompt-display-bg); border-left: 3px solid var(--accent-border-color); white-space: pre-wrap; font-family: var(--font-body); transition: background-color 0.3s, border-color 0.3s, color 0.3s, font-style 0.3s; }}
            .step-prompt-text {{ /* Specific style for step prompts */
                font-style: normal; /* Less emphasis than initial prompt */
                color: var(--subheader-color);
                margin-bottom: 1em;
                padding: 8px 12px;
                background-color: var(--step-prompt-bg);
                border: 1px solid var(--step-prompt-border);
                border-radius: 3px;
                white-space: pre-wrap;
                font-family: var(--font-body);
                font-size: 0.95em;
                transition: background-color 0.3s, border-color 0.3s, color 0.3s;
            }}
            .collapsible-content {{ display: none; padding: 15px 20px; background-color: var(--container-bg); transition: background-color 0.3s; }}
             /* <<< CHANGE >>> Adjust padding for nested collapsible content */
            .collapsible-content .collapsible-content {{ padding: 10px 15px; }}
            .collapsible-content.planning-steps, .collapsible-content.chapter-steps {{ padding-top: 0; padding-bottom: 0; }} /* Remove padding around step groups */
            .expanded {{ display: block; }}
            .toggle-icon {{ display: inline-block; width: 20px; text-align: center; font-weight: bold; margin-right: 8px; color: var(--toggle-icon-color); transition: color 0.3s; }}
            /* <<< MODIFIED: Style for the new scores container in the header >>> */
            .scores-container {{ margin-left: 15px; font-style: italic; color: var(--subheader-color); font-size: 0.9em; font-weight: normal; }}
            .chapter-scores {{ font-size: 0.9em; margin-top: 5px; font-style: italic; color: var(--subheader-color); }} /* Style for chapter scores (within judge section) */
            .final-scores {{ font-size: 1.0em; margin-top: 8px; font-weight: bold; color: var(--header-color); }} /* Style for final scores (within judge section) */
            h1.main-title, .back-button, .control-select, .form-check-label, .top-controls {{ font-family: var(--font-ui) !important; }}
            .iteration-header, .prompt-header, .judge-header {{ font-family: var(--font-body) !important; }} /* <<< CHANGE >>> Added judge-header */
            @media screen and (max-width: 768px) {{
                body.theme-cozy, body.theme-modern {{ max-width: 100%; margin: 10px 5px; padding: 15px 10px; }}
                body.theme-cozy h1, body.theme-modern h1 {{ font-size: 1.8em; padding-bottom: 10px; margin-bottom: 1em; }}
                body.theme-cozy h2, body.theme-modern h2 {{ font-size: 1.5em; }}
                body.theme-cozy h3, body.theme-modern h3 {{ font-size: 1.2em; }}
                body.theme-cozy .iteration-header, body.theme-modern .iteration-header {{ padding: 10px 12px; font-size: 1.1em; }} /* Adjust font size */
                body.theme-cozy .prompt-header, body.theme-modern .prompt-header {{ padding: 8px 12px; }}
                body.theme-cozy .content-block, body.theme-modern .content-block {{ padding: 10px 12px; }}
                body.theme-cozy .collapsible-content, body.theme-modern .collapsible-content {{ padding: 10px 15px; }} /* Adjust padding */
                body.theme-cozy .top-controls, body.theme-modern .top-controls {{ flex-direction: column; align-items: flex-start; gap: 10px; }}
                body.theme-cozy .controls-right, body.theme-modern .controls-right {{ width: 100%; justify-content: space-between; }}
                /* <<< MODIFIED: Adjust scores container on mobile >>> */
                .scores-container {{ display: block; margin-left: 0; margin-top: 5px; font-size: 0.85em; }}
            }}
        </style>
    </head>
    <body class="theme-cozy">
        <div class="top-controls">
            <div class="nav-left">
                <a href="javascript:history.back()" class="back-button">← Back</a>
            </div>
            <div class="controls-right">
                <div class="control-select-wrapper">
                    <select id="themeSelector" class="control-select" aria-label="Select Theme">
                        <option value="cozy">Cozy</option>
                        <option value="modern">Modern</option>
                    </select>
                </div>
                <div class="control-select-wrapper">
                    <select id="fontSelector" class="control-select" aria-label="Select Font">
                        <option value="tiempos">Tiempos Text</option> <option value="bookerly">Bookerly</option>
                        <option value="bitter">Bitter Pro</option> <option value="roboto">Roboto</option>
                        <option value="inter">Inter</option> <option value="source_sans">Source Sans 3</option>
                        <option value="open_sans">Open Sans</option> <option value="fira_sans">Fira Sans</option>
                        <option value="besley">Besley</option>
                    </select>
                </div>
                <div class="mode-toggle">
                    <input class="form-check-input" type="checkbox" id="darkModeToggle">
                    <label class="form-check-label" for="darkModeToggle" id="toggleLabel">Light</label>
                </div>
            </div>
        </div>

        <h1 class="main-title">{display_model_name}</h1>
    """

    # --- Iteration Loop ---
    if not iterations_data:
         html_output += "<h2>No completed iterations found for this run.</h2>"

    for display_idx, iter_info in enumerate(iterations_data):
        iter_idx = iter_info["iter_idx"]
        prompt_id = iter_info["prompt_id"]
        task_data = iter_info["task_data"]
        overall_score_100 = iter_info["overall_score_0_100"]
        writing_prompt = iter_info["writing_prompt"]
        status = task_data.get("status", "unknown")
        error_msg = task_data.get("error_message")

        # <<< NEW: Calculate per-task chapter/final averages (scaled 0-100) >>>
        scaled_chap_avg, scaled_final_avg = calculate_task_chapter_final_averages_scaled(task_data)
        chap_avg_display = f"Chapter Avg: {scaled_chap_avg}" if scaled_chap_avg != "N/A" else "Chap Avg: N/A"
        final_avg_display = f"Final: {scaled_final_avg}" if scaled_final_avg != "N/A" else "Final Avg: N/A"
        # <<< END NEW >>>

        

        # <<< MODIFIED: Add new averages to the header string using scores-container span >>>
        html_output += f"""
        <div class="iteration-container">
            <div class="iteration-header" onclick="toggleContent('iteration-{iter_idx}-{prompt_id}')">
                <span class="toggle-icon">+</span>
                {writing_prompt['category']} — {writing_prompt['title']} <span class="scores-container">({chap_avg_display} | {final_avg_display})</span>
            </div>
            <div id="iteration-{iter_idx}-{prompt_id}" class="collapsible-content">
                <div class="content-block">
                    <div class="prompt-text-display">
                        <strong>Initial Writing Prompt:</strong><br>{html.escape(writing_prompt['writing_prompt'])}
                    </div>
                    {f'<div style="color: red; margin-bottom: 1em;"><strong>Error:</strong> {html.escape(error_msg)}</div>' if status == "error" else ""}
        """

        # --- Prepare data for steps ---
        step_outputs = task_data.get("step_outputs", {})
        chapter_scores = task_data.get("chapter_judge_scores", {})
        chapter_raw_texts = task_data.get("chapter_raw_judge_text", {})

        # <<< CHANGE: Group Planning Steps into a Collapsible Section >>>
        planning_id = f"planning-{iter_idx}-{prompt_id}"
        html_output += f"""
                    <div class="prompt-header" onclick="toggleContent('{planning_id}')">
                        <span class="toggle-icon">+</span> Planning Phase ({NUM_PLANNING_STEPS} Steps)
                    </div>
                    <div id="{planning_id}" class="collapsible-content planning-steps">
        """
        # --- Planning Steps Loop (1 to NUM_PLANNING_STEPS) ---
        for step_num in range(1, FIRST_CHAPTER_STEP_INDEX):
            step_prompt = PROMPT_TEMPLATES.get(step_num, f"[Prompt for step {step_num} not loaded]")
            # Special handling for first prompt
            if step_num == 1:
                step_prompt = step_prompt.replace("{writing_prompt}", html.escape(writing_prompt['writing_prompt'])) # Escape prompt here

            step_output = step_outputs.get(str(step_num)) # Keys are strings from JSON

            html_output += f"""
                        <div class="prompt-container">
                            <div class="step-prompt-text">{html.escape(step_prompt)}</div>
            """
            if step_output:
                html_output += f"""
                            <div class="response-content">
<strong>Model Output:</strong><br>{html.escape(step_output)}
                            </div>
                """
            elif status not in ['error', 'generating', 'initialized'] and step_num <= task_data.get('current_step', 0):
                 html_output += "<div class='response-content'><i>Output for this step is missing.</i></div>"
            elif status in ['generating', 'initialized'] and step_num > task_data.get('current_step', 0):
                 html_output += "<div class='response-content'><i>Step not yet generated.</i></div>"

            html_output += "</div>" # Close prompt-container (step container)

        html_output += """
                    </div>
        """ # Close planning collapsible content

        # --- Chapter Steps Loop (FIRST_CHAPTER_STEP_INDEX to TOTAL_STEPS) ---
        for step_num in range(FIRST_CHAPTER_STEP_INDEX, TOTAL_STEPS + 1):
            chapter_num = step_num - NUM_PLANNING_STEPS
            step_prompt = PROMPT_TEMPLATES.get(step_num, f"[Prompt for step {step_num} (Chapter {chapter_num}) not loaded]")
            step_output = step_outputs.get(str(step_num))

            html_output += f"""
                    <div class="prompt-container">
                        <h4>Chapter {chapter_num}</h4>
                        <div class="step-prompt-text">{html.escape(step_prompt)}</div>
            """
            if step_output:
                html_output += f"""
                        <div class="response-content">
<strong>Model Output:</strong><br>{html.escape(step_output)}
                        </div>
                """
                # --- Check if this is a chapter step and add COLLAPSIBLE judging ---
                chap_scores_dict = chapter_scores.get(str(chapter_num))
                chap_raw_text = chapter_raw_texts.get(str(chapter_num))

                # <<< CHANGE: Wrap Judge Output in Collapsible Section >>>
                if chap_raw_text: # Show judge response even if scores failed parsing
                    judge_id = f"judge-{iter_idx}-{prompt_id}-chap-{chapter_num}"
                    scores_str_list = []
                    if isinstance(chap_scores_dict, dict):
                        for metric, val in chap_scores_dict.items():
                            # Escape metric name just in case
                            scores_str_list.append(f"{html.escape(metric)}: {val}")

                    html_output += f"""
                        <div class="judge-header" onclick="toggleContent('{judge_id}')">
                           <span class="toggle-icon">+</span> Judge Evaluation (Chapter {chapter_num})
                        </div>
                        <div id="{judge_id}" class="collapsible-content collapsible-judge-content">
                            <div class="judge-content">
<strong>Judge Response:</strong><br>{html.escape(chap_raw_text)}
                            </div>
                        </div>"""
                elif status not in ['error', 'generating', 'initialized'] and step_num <= task_data.get('current_step', 0):
                     # If judging should have happened but didn't
                     html_output += "<div class='judge-content'><i>Chapter judging data not found.</i></div>"

            elif status not in ['error', 'generating', 'initialized'] and step_num <= task_data.get('current_step', 0):
                 html_output += "<div class='response-content'><i>Output for this step is missing.</i></div>"
            elif status in ['generating', 'initialized'] and step_num > task_data.get('current_step', 0):
                 html_output += "<div class='response-content'><i>Step not yet generated.</i></div>"

            html_output += "</div>" # Close prompt-container (chapter step container)


        # --- Display Final Piece Judging (Remains expanded) ---
        final_scores_list = task_data.get("final_judge_scores", [])   # list of dicts
        final_texts_list = task_data.get("final_raw_judge_texts", []) # list of strings

        if final_scores_list or final_texts_list:
            html_output += """
                <hr style='border: none; border-top: 2px solid var(--accent-border-color); margin: 30px 0; transition: border-color 0.3s;'>
<h4>Final Piece Evaluation</h4>
            """
            # Grab the first ones if multiple exist (for display text)
            first_raw_text = final_texts_list[0] if len(final_texts_list) > 0 else ""
            raw_text_html = html.escape(first_raw_text) if first_raw_text else "<i>(No judge text)</i>"

            # Calculate average final score for display (using the per-task helper)
            # We already calculated scaled_final_avg earlier for the header
            avg_final_score_display = ""
            if scaled_final_avg != "N/A":
                 avg_final_score_display = f"<div class='final-scores'>Avg Final Item Score: {scaled_final_avg}/100</div>"

            html_output += f"""
                <div class="judge-content" style="margin-bottom:10px;">
<strong>Judge Response (First Entry):</strong><br>
                {raw_text_html}
                {avg_final_score_display}
                </div>
            """
        elif status == 'completed' or status == 'judged': # Check if completed or judged
            # If completed/judged but no final judgments present
             html_output += """
                <hr style='border: none; border-top: 2px solid var(--accent-border-color); margin: 30px 0; transition: border-color 0.3s;'>
                <h4>Final Piece Evaluation</h4>
                <div class='judge-content'><i>No final judgment data found.</i></div>
            """


        html_output += """
                </div>
            </div>
        </div>
        """
    # --- End Iteration Loop ---

    # --- JavaScript (Identical to before, handles toggling) ---
    html_output += """
        <script>
            // --- PASTE THE FULL JAVASCRIPT FROM THE PREVIOUS CREATIVE WRITING NOTEBOOK HERE ---
            // (No changes needed in JS for the new structure, just ensure IDs are correct)
            const body = document.body;
            const themeSelector = document.getElementById('themeSelector');
            const fontSelector = document.getElementById('fontSelector');
            const darkModeToggle = document.getElementById('darkModeToggle');
            const toggleLabel = document.getElementById('toggleLabel');
            const FONT_MAP = { 'tiempos': "'Tiempos Text', Georgia, serif", 'bookerly': "'Bookerly', Georgia, serif", 'bitter': "'Bitter Pro', Georgia, serif", 'roboto': "'Roboto', sans-serif", 'inter': "'Inter', sans-serif", 'source_sans': "'Source Sans 3', sans-serif", 'open_sans': "'Open Sans', sans-serif", 'fira_sans': "'Fira Sans', sans-serif", 'besley': "'Besley', 'Merriweather', serif" };
            const FONT_DEFINITIONS = { /* ... include full font definitions ... */
                'tiempos': { family: 'Tiempos Text', variants: [ { weight: 400, style: 'normal', url: 'fonts/tiempos_text/TiemposText-Regular.woff2' }, { weight: 400, style: 'italic', url: 'fonts/tiempos_text/TiemposText-RegularItalic.woff2' }, { weight: 700, style: 'normal', url: 'fonts/tiempos_text/TiemposText-Bold.woff2' } ], fallback: 'Georgia, serif' },
                'bookerly': { family: 'Bookerly', variants: [ { weight: 400, style: 'normal', url: 'fonts/bookerly/Bookerly.woff' }, { weight: 400, style: 'italic', url: 'fonts/bookerly/Bookerly Italic.woff' }, { weight: 700, style: 'normal', url: 'fonts/bookerly/Bookerly Bold.woff' } ], fallback: 'Georgia, serif' },
                'bitter': { family: 'Bitter Pro', variants: [ { weight: 400, style: 'normal', url: 'fonts/bitter_pro/BitterPro-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/bitter_pro/BitterPro-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/bitter_pro/BitterPro-Bold.ttf' } ], fallback: 'Georgia, serif' },
                'roboto': { family: 'Roboto', variants: [ { weight: 400, style: 'normal', url: 'fonts/roboto/static/Roboto-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/roboto/static/Roboto-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/roboto/static/Roboto-Bold.ttf' } ], fallback: 'sans-serif' },
                'inter': { family: 'Inter', variants: [ { weight: 400, style: 'normal', url: 'fonts/inter/static/Inter-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/inter/static/Inter-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/inter/static/Inter-Bold.ttf' } ], fallback: 'sans-serif' },
                'source_sans': { family: 'Source Sans 3', variants: [ { weight: 400, style: 'normal', url: 'fonts/source_sans_3/static/SourceSans3-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/source_sans_3/static/SourceSans3-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/source_sans_3/static/SourceSans3-Bold.ttf' } ], fallback: 'sans-serif' },
                'open_sans': { family: 'Open Sans', variants: [ { weight: 400, style: 'normal', url: 'fonts/open_sans/static/OpenSans-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/open_sans/static/OpenSans-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/open_sans/static/OpenSans-Bold.ttf' } ], fallback: 'sans-serif' },
                'fira_sans': { family: 'Fira Sans', variants: [ { weight: 400, style: 'normal', url: 'fonts/fira_sans/FiraSans-Regular.ttf' }, { weight: 400, style: 'italic', url: 'fonts/fira_sans/FiraSans-Italic.ttf' }, { weight: 700, style: 'normal', url: 'fonts/fira_sans/FiraSans-Bold.ttf' } ], fallback: 'sans-serif' },
                'besley': { family: 'Besley', variants: [ { weight: 400, style: 'normal', url: 'fonts/besley/Besley-VariableFont_wght.ttf' }, { weight: 400, style: 'italic', url: 'fonts/besley/Besley-Italic-VariableFont_wght.ttf' } ], fallback: 'serif' }
             };
            const SANS_FONTS = ['roboto', 'inter', 'source_sans', 'open_sans', 'fira_sans'];
            const THEME_DEFAULT_FONTS = { 'cozy': 'tiempos', 'modern': 'inter' };
            const THEME_DEFAULT_HEAD_FONTS = { 'cozy': "'Lora', serif", 'modern': "'Besley', 'Merriweather', serif" };
            const loadedFonts = new Set();
            async function loadFontFace(fontKey) { if (loadedFonts.has(fontKey)) return; const fontDef = FONT_DEFINITIONS[fontKey]; if (!fontDef) { console.warn(`Font definition not found for: ${fontKey}`); return; } try { const fontLoadPromises = fontDef.variants.map(variant => { const fontFace = new FontFace( fontDef.family, `url(${variant.url})`, { weight: variant.weight, style: variant.style } ); return fontFace.load().then(loadedFont => { document.fonts.add(loadedFont); return loadedFont; }); }); await Promise.all(fontLoadPromises); loadedFonts.add(fontKey); console.log(`Loaded font: ${fontDef.family}`); } catch (err) { console.error(`Error loading font ${fontDef.family}:`, err); } }
            function toggleContent(id) { const element = document.getElementById(id); if (!element) return; const isExpanded = element.classList.contains('expanded'); const header = element.previousElementSibling; const toggleIcon = header ? header.querySelector('.toggle-icon') : null; if (isExpanded) { element.classList.remove('expanded'); if (toggleIcon) toggleIcon.textContent = '+'; } else { element.classList.add('expanded'); if (toggleIcon) toggleIcon.textContent = '−'; } }
            const STORAGE_PREFIX = 'longform_viewer_'; /* Changed prefix */
            const KEYS = { THEME: `${STORAGE_PREFIX}theme`, FONT: `${STORAGE_PREFIX}font`, DARK_MODE: `longformViewerDarkModeEnabled` /* Changed key */ };
            function saveSettings(type, value) { localStorage.setItem(KEYS[type], value); }
            function setDarkMode(isDark) { body.classList.toggle('dark-mode', isDark); toggleLabel.textContent = isDark ? 'Dark' : 'Light'; if (darkModeToggle.checked !== isDark) { darkModeToggle.checked = isDark; } saveSettings('DARK_MODE', isDark); }
            function applyTheme(themeName) { body.classList.remove('theme-cozy', 'theme-modern'); body.classList.add(`theme-${themeName}`); if (themeSelector.value !== themeName) { themeSelector.value = themeName; } saveSettings('THEME', themeName); const savedFont = localStorage.getItem(KEYS.FONT); const defaultFont = THEME_DEFAULT_FONTS[themeName] || 'tiempos'; applyFont(savedFont || defaultFont); }
            async function applyFont(fontValue) { await loadFontFace(fontValue); const fontFamily = FONT_MAP[fontValue]; const currentTheme = localStorage.getItem(KEYS.THEME) || 'cozy'; let headingFontFamily = THEME_DEFAULT_HEAD_FONTS[currentTheme]; if (fontFamily) { body.style.setProperty('--font-body', fontFamily); if (currentTheme === 'modern') { headingFontFamily = THEME_DEFAULT_HEAD_FONTS['modern']; } else { headingFontFamily = THEME_DEFAULT_HEAD_FONTS['cozy']; } if (fontValue === 'besley') { headingFontFamily = FONT_MAP['besley']; } body.style.setProperty('--font-heading', headingFontFamily); if (fontSelector.value !== fontValue) { fontSelector.value = fontValue; } saveSettings('FONT', fontValue); } else { console.warn("Font value not found:", fontValue); const theme = localStorage.getItem(KEYS.THEME) || 'cozy'; applyFont(THEME_DEFAULT_FONTS[theme]); } }
            darkModeToggle.addEventListener('change', function() { setDarkMode(this.checked); });
            themeSelector.addEventListener('change', function() { applyTheme(this.value); });
            fontSelector.addEventListener('change', function() { applyFont(this.value); });
            async function applyInitialSettings() { const savedDarkMode = localStorage.getItem(KEYS.DARK_MODE); const prefersDark = window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches; setDarkMode(savedDarkMode !== null ? (savedDarkMode === 'true') : prefersDark); const savedTheme = localStorage.getItem(KEYS.THEME) || 'cozy'; applyTheme(savedTheme); const savedFont = localStorage.getItem(KEYS.FONT) || THEME_DEFAULT_FONTS[savedTheme]; await applyFont(savedFont); fontSelector.value = savedFont || THEME_DEFAULT_FONTS[savedTheme]; }
            applyInitialSettings();
            window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', event => { if (localStorage.getItem(KEYS.DARK_MODE) === null) { setDarkMode(event.matches); } });
        </script>
    </body>
    </html>
    """

    # Save to file if requested
    if save_to_file:
        if not os.path.exists(RESULTS_DIR):
            os.makedirs(RESULTS_DIR)
            print(f"Created results directory: {RESULTS_DIR}")
        sanitized_name = sanitize_model_name(display_model_name) # Use display name for file
        filename = os.path.join(RESULTS_DIR, f"{sanitized_name}_longform_report.html")
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(html_output)
            print(f"Report saved to {filename}")
        except IOError as e:
            print(f"Error saving report to {filename}: {e}")
        except Exception as e:
             print(f"Unexpected error saving report: {e}")

    return HTML(html_output)
# -

# ## Metrics Calculation Function

# --- ADDED: compute top-5 nearest models (via combined Jaccard features) ---
def calculate_combined_jaccard_similarities(results_data: Dict[str, Dict], top_n: int = 1500):
    """
    Builds a presence/absence matrix of "combined" features (top_repetitive_words,
    bigrams, trigrams) for each model, computes pairwise Jaccard distances,
    and stores each model’s top 5 neighbors under 'top_5_similar'.

    Args:
        results_data: The dictionary containing model metrics and features.
        top_n: The approximate total number of features (words+bigrams+trigrams)
               to consider per model for the similarity calculation.
    """
    model_names = list(results_data.keys())
    model_to_features = {}
    print(f"Calculating Jaccard similarities based on top ~{top_n} combined features...")

    for m in model_names:
        info = results_data[m]
        words = info.get("top_repetitive_words", [])
        bigrams = info.get("top_multi_prompt_bigrams", [])
        trigrams = info.get("top_multi_prompt_trigrams", [])

        # Divide top_n budget among feature types
        w_count = top_n // 3
        b_count = top_n // 3
        t_count = top_n // 3

        # Sort features by relevance (score for words, frequency for n-grams) and take top N
        words_sorted = sorted(words, key=lambda x: x.get("score", 0), reverse=True)[:w_count]
        bigrams_sorted = sorted(bigrams, key=lambda x: x.get("frequency", 0), reverse=True)[:b_count]
        trigrams_sorted = sorted(trigrams, key=lambda x: x.get("frequency", 0), reverse=True)[:t_count]

        # Extract the feature strings (word or ngram)
        word_set = set(x["word"] for x in words_sorted)
        bigram_set = set(x["ngram"] for x in bigrams_sorted)
        trigram_set = set(x["ngram"] for x in trigrams_sorted)

        # Combine into a single set for the model
        combined_set = word_set.union(bigram_set).union(trigram_set)
        model_to_features[m] = combined_set

    all_models = sorted(model_names) # Ensure consistent order for matrix
    global_vocab = set()
    for feats in model_to_features.values():
        global_vocab.update(feats)

    if len(all_models) < 2 or not global_vocab:
        print("  Skipping similarity calculation: Not enough models or features.")
        return # Cannot calculate pairwise distance

    global_vocab = sorted(list(global_vocab)) # Consistent column order
    print(f"  Building feature matrix ({len(all_models)} models x {len(global_vocab)} unique features)...")

    # Create a DataFrame (presence/absence matrix)
    df = pd.DataFrame(0, index=all_models, columns=global_vocab, dtype=np.uint8) # Use uint8 for memory
    for m in all_models:
        for ft in model_to_features[m]:
            if ft in df.columns: # Check if feature is in the global vocab (should always be)
                df.loc[m, ft] = 1

    # Calculate pairwise Jaccard distances
    print("  Calculating pairwise Jaccard distances...")
    # pdist returns a condensed distance matrix (1D array)
    dist_array = pdist(df.values, metric="jaccard")
    # squareform converts the condensed matrix into a square matrix
    dist_matrix = squareform(dist_array)

    # Find and store top 5 neighbors for each model
    print("  Finding top 5 neighbors for each model...")
    for i, m in enumerate(all_models):
        # Get distances from model 'm' (row i) to all other models
        row_dist = dist_matrix[i, :]
        # Create list of (neighbor_model, distance) pairs, excluding self (distance=0)
        pair_list = []
        for j in range(len(all_models)):
            if i != j:
                neighbor_model = all_models[j]
                distance = row_dist[j]
                # Handle potential NaN distances if a model has no features in common with vocab
                if not np.isnan(distance):
                    pair_list.append((neighbor_model, distance))

        # Sort by distance (ascending)
        pair_list.sort(key=lambda x: x[1])
        # Get the top 5
        top_5 = pair_list[:5]

        # Store the results in the main dictionary
        # Use updated model names for display in the slop profile
        results_data[m]["top_5_similar"] = [
            {"model": get_updated_model_name(neighbor_name), "distance": float(dist_val)}
            for neighbor_name, dist_val in top_5
        ]
    print("  Finished calculating model similarities.")


# --- REVISED HELPER: Calculate Global Chapter/Final Averages (0-20) for CSV ---
def compute_global_chapter_final_averages(model_name: str, runs_data: Dict) -> Tuple[List[Union[float, str]], Union[float, str]]:
    """
    Calculates the global average item score for each chapter (1-8) and the final judgment
    across all tasks for a given model. Applies negative criteria inversion and special
    'forced poetry' weighting.
    Returns a tuple: (list_of_8_chapter_averages_0_20, final_average_0_20).
    Values are floats or "N/A".
    """
    # Store sum and count of per-task item averages for each chapter/final
    sum_task_chap_avgs = [0.0] * NUM_CHAPTERS
    count_task_chap_avgs = [0] * NUM_CHAPTERS
    sum_task_final_avgs = 0.0
    count_task_final_avgs = 0

    for run_key, run_dict in runs_data.items():
        if run_dict.get("test_model") != model_name:
            continue

        longform_tasks = run_dict.get("longform_tasks", {})
        for iter_idx, iteration_data in longform_tasks.items():
            for prompt_id, task_data in iteration_data.items():
                # --- Process Chapter Scores for this task ---
                chapter_scores_raw = task_data.get("chapter_judge_scores", {})
                for chap_num_str, chap_score_dict in chapter_scores_raw.items():
                    try:
                        # Chapter numbers are 1-based in keys, convert to 0-based index
                        cidx = int(chap_num_str) - 1
                        if not (0 <= cidx < NUM_CHAPTERS): continue
                    except ValueError:
                        continue

                    if isinstance(chap_score_dict, dict) and chap_score_dict:
                        processed_item_scores = []
                        for metric, value in chap_score_dict.items():
                            if isinstance(value, (int, float)):
                                processed_value = invert_if_negative(metric, value, neg_criteria_chapter)
                                # Apply special power scaling for 'forced poetry or metaphor'
                                if metric.lower() == 'forced poetry or metaphor':
                                    processed_value = ((processed_value / SCORE_RANGE_MAX) ** 1.7) * SCORE_RANGE_MAX
                                processed_item_scores.append(processed_value)

                        if processed_item_scores:
                            task_chapter_item_avg = stats.mean(processed_item_scores)
                            sum_task_chap_avgs[cidx] += task_chapter_item_avg
                            count_task_chap_avgs[cidx] += 1

                # --- Process Final Scores for this task ---
                final_scores_list = task_data.get("final_judge_scores", [])
                task_final_item_averages = [] # Collect averages from potentially multiple final judgments in one task
                if final_scores_list and isinstance(final_scores_list, list):
                    for one_final_dict in final_scores_list:
                        if one_final_dict and isinstance(one_final_dict, dict):
                            processed_item_scores = []
                            for metric, value in one_final_dict.items():
                                if isinstance(value, (int, float)):
                                    processed_value = invert_if_negative(metric, value, neg_criteria_final)
                                    # Apply special power scaling for 'forced poetry or metaphor'
                                    if metric.lower() == 'forced poetry or metaphor':
                                        processed_value = ((processed_value / SCORE_RANGE_MAX) ** 1.7) * SCORE_RANGE_MAX
                                    processed_item_scores.append(processed_value)
                            if processed_item_scores:
                                task_final_item_averages.append(stats.mean(processed_item_scores))

                # If we got any valid final item averages for this task, average them and add to global sum/count
                if task_final_item_averages:
                    overall_task_final_avg = stats.mean(task_final_item_averages)
                    sum_task_final_avgs += overall_task_final_avg
                    count_task_final_avgs += 1

    # --- Calculate Global Averages ---
    global_chapter_avgs_0_20 = []
    # Ensure we generate exactly 8 chapter averages for the CSV header consistency
    target_num_chapters_csv = 8
    for i in range(target_num_chapters_csv):
        if i < NUM_CHAPTERS and count_task_chap_avgs[i] > 0: # Check if index is valid for actual NUM_CHAPTERS
            avg = sum_task_chap_avgs[i] / count_task_chap_avgs[i]
            # Clamp average to score range before rounding
            clamped_avg = max(SCORE_RANGE_MIN, min(SCORE_RANGE_MAX, avg))
            global_chapter_avgs_0_20.append(round(clamped_avg, 2))
        else:
            global_chapter_avgs_0_20.append("N/A")

    if count_task_final_avgs > 0:
        avg_final = sum_task_final_avgs / count_task_final_avgs
        # Clamp average to score range before rounding
        clamped_avg_final = max(SCORE_RANGE_MIN, min(SCORE_RANGE_MAX, avg_final))
        global_final_avg_0_20 = round(clamped_avg_final, 2)
    else:
        global_final_avg_0_20 = "N/A"

    return global_chapter_avgs_0_20, global_final_avg_0_20

# +
def calculate_and_print_metrics(save_updated_results: bool = True, print_slop_profile: bool = True):
    """
    Calculates aggregated metrics (length, vocab, slop, repetition), extracts top
    multi-prompt N-grams, calculates global chapter/final item averages, and computes
    model similarity based *only* on the generated chapter text.
    Merges metrics into a results dictionary, prints results in CSV format, optionally saves
    the updated results, and optionally prints the formatted slop profile string.
    """
    if not metrics_available:
        print("Metrics functions not available. Skipping calculation.")
        return

    print("\nCalculating aggregated metrics, N-grams, and scores from chapter text...")
    runs_data = load_json_file(RUNS_FILE)
    # We don't use ELO data here, we build the results dict from runs + metrics
    results_with_metrics = {} # { model_name: { metric_key: value, ... } }

    if not runs_data:
        print(f"Runs data file ('{RUNS_FILE}') is empty or not found. Cannot calculate metrics.")
        return

    # Structure: { model_name: [(chapter_text, unique_id), ...], ... }
    model_chapters_with_ids = defaultdict(list)
    # Structure: { model_name: [chapter_text1, chapter_text2, ...], ... }
    model_all_chapters_flat = defaultdict(list)
    # Structure: { model_name: { prompt_id: [chap1_text, chap2_text, ...], ... }, ... } # For N-gram context
    model_chapters_by_prompt = defaultdict(lambda: defaultdict(list))

    print("Extracting chapter text from runs...")
    processed_runs = 0
    for run_key, run_data in runs_data.items():
        model_name = run_data.get("test_model")
        if not model_name or model_name in MODELS_TO_IGNORE: continue

        longform_tasks = run_data.get("longform_tasks", {})
        if not longform_tasks: continue

        run_has_chapters = False
        for iter_idx_str, prompts_dict in longform_tasks.items():
            for prompt_id, task_data in prompts_dict.items():
                step_outputs = task_data.get("step_outputs", {})
                # Iterate through chapter steps only
                for step_num in range(FIRST_CHAPTER_STEP_INDEX, TOTAL_STEPS + 1):
                    chapter_text = step_outputs.get(str(step_num))
                    if isinstance(chapter_text, str) and chapter_text.strip():
                        unique_id = f"{prompt_id}_{iter_idx_str}_{step_num}" # ID for repetition tracking
                        model_chapters_with_ids[model_name].append((chapter_text, unique_id))
                        model_all_chapters_flat[model_name].append(chapter_text)
                        model_chapters_by_prompt[model_name][prompt_id].append(chapter_text)
                        run_has_chapters = True
        if run_has_chapters: processed_runs += 1

    print(f"Extracted chapter text for {len(model_chapters_with_ids)} models from {processed_runs} runs.")
    if not model_chapters_with_ids:
        print("No chapter text found in any run. Cannot calculate metrics.")
        return

    print("Calculating metrics and extracting N-grams per model...")
    # --- Iterate through models ---
    for model_name in list(model_chapters_with_ids.keys()): # Use list to avoid dict size change issues if needed
        chapters_with_ids = model_chapters_with_ids[model_name]
        all_chapters_flat = model_all_chapters_flat[model_name]
        chapters_by_prompt = model_chapters_by_prompt[model_name]

        print(f"\n  Processing {model_name} ({len(all_chapters_flat)} total chapters from {len(chapters_by_prompt)} initial prompts)...")

        # --- Calculate Basic Metrics (Length, Vocab, Slop) ---
        num_chapters = len(all_chapters_flat)
        if num_chapters == 0:
            print(f"      Skipping basic metrics: No chapter text found for {model_name}.")
            avg_length = 0.0
            vocab_complexity = 'N/A'
            slop_score = 'N/A'
        else:
            total_chars = sum(len(text) for text in all_chapters_flat)
            avg_length = round(total_chars / num_chapters, 1)
            all_text_combined = "\n\n".join(all_chapters_flat)

            if not all_text_combined.strip():
                 print("      Warning: Combined chapter text is empty. Cannot calculate vocab/slop.")
                 vocab_complexity = 'N/A'
                 slop_score = 'N/A'
            else:
                try:
                    vocab_complexity = calculate_complexity_index(all_text_combined)
                    vocab_complexity = round(vocab_complexity, 2) if isinstance(vocab_complexity, (int, float)) else 'Error'
                except Exception as e:
                    print(f"      ERROR calculating vocab complexity for {model_name}: {e}")
                    vocab_complexity = 'Error'
                try:
                    # Pass debug=False to avoid excessive console output here
                    slop_score = calculate_slop_index_new(all_text_combined, debug=False)
                    slop_score = round(slop_score, 2) if isinstance(slop_score, (int, float)) else 'Error'
                except Exception as e:
                    print(f"      ERROR calculating slop score for {model_name}: {e}")
                    slop_score = 'Error'
        print(f"    Metrics - Avg Chapter Len: {avg_length:.0f}, Vocab Complexity: {vocab_complexity}, Slop: {slop_score}")

        # --- Calculate Repetition/N-grams (Requires multiple prompts) ---
        repetition_score = 0.0 # N-gram based score
        top_repetitive_words = []
        top_bigrams = []
        top_trigrams = []
        has_multi_prompt_data = len(chapters_by_prompt) >= 2

        if has_multi_prompt_data and chapters_with_ids:
            print("      Calculating N-grams and Repetitive Words (multi-prompt)...")

            # Calculate total text length for normalization
            total_text_length = 0
            for text, _ in chapters_with_ids:
                if isinstance(text, str):
                    total_text_length += len(text.split())

            try:
                # N-grams need context of which prompt they came from
                top_bigram_count = 0
                top_trigram_count = 0

                top_bigrams = get_multi_prompt_ngrams(chapters_by_prompt, n=2, top_k=200, min_prompt_ids=2) # Increased top_k for similarity calc
                if top_bigrams:
                    top_bigram_count = sum(freq for _, freq in top_bigrams[:40]) # Count based on top 40

                top_trigrams = get_multi_prompt_ngrams(chapters_by_prompt, n=3, top_k=200, min_prompt_ids=2) # Increased top_k for similarity calc
                if top_trigrams:
                    top_trigram_count = sum(freq for _, freq in top_trigrams[:40]) # Count based on top 40

                # Calculate normalized repetition score
                if total_text_length > 0:
                    repetition_score = (top_bigram_count + top_trigram_count) / total_text_length * 1000
                else:
                    repetition_score = 0

                print(f"        N-gram Rep Score (normalized by text length * 1000): {repetition_score:.4f}. Found {len(top_bigrams)} Bigrams, {len(top_trigrams)} Trigrams.")
            except Exception as e:
                print(f"      ERROR calculating N-grams for {model_name}: {e}")
                repetition_score = 'Error'

            try:
                # Repetitive words use the flat list with unique IDs
                # Fix the IDs to represent the initial prompt, not the step
                new_texts_with_ids = []
                for (text, old_id) in chapters_with_ids:
                    # old_id looks like "promptID_iteration_step"
                    prompt_id_str = old_id.split("_", 1)[0]
                    new_texts_with_ids.append((text, prompt_id_str))

                # Use the corrected list
                top_repetitive_words = get_top_repetitive_words(new_texts_with_ids, top_n=1000, min_prompt_ids=2) # Increased top_n for similarity calc
                print(f"        Found {len(top_repetitive_words)} multi-prompt repetitive words.")
            except Exception as e:
                print(f"      ERROR extracting repetitive words for {model_name}: {e}")
                top_repetitive_words = [] # Set to empty on error

        elif not has_multi_prompt_data:
            print(f"      Skipping N-grams & Repetition: Only chapters from 1 initial prompt found.")
        else: # No valid chapter text with IDs
            print(f"      Skipping N-grams & Repetition: No valid chapter text entries found.")

        # --- Store Metrics ---
        results_with_metrics[model_name] = {
            'avg_chapter_length': avg_length,
            'vocab_complexity': vocab_complexity,
            'slop_score': slop_score,
            'repetition_score': round(repetition_score, 1) if isinstance(repetition_score, (int, float)) else str(repetition_score),
            'top_repetitive_words': [{"word": word, "score": float(score)} for word, score in top_repetitive_words],
            'top_multi_prompt_bigrams': [{"ngram": ' '.join(ngram), "frequency": int(freq)} for ngram, freq in top_bigrams],
            'top_multi_prompt_trigrams': [{"ngram": ' '.join(ngram), "frequency": int(freq)} for ngram, freq in top_trigrams]
        }

        # --- Print Summary for Model ---
        if top_repetitive_words:
            filtered_top_words = top_repetitive_words[:10] # Limit printout in console
            print(f"    Top multi-prompt repetitive words: " + ", ".join([f"{word} ({score:.1f}x)" for word, score in filtered_top_words]))
        elif has_multi_prompt_data:
             print("    No words met the multi-prompt repetition criteria.")

        if top_bigrams:
            print("    Top multi-prompt Bigrams:")
            for bg, freq in top_bigrams[:5]: # Limit printout
                print(f"      - {' '.join(bg)} ({freq})")
        elif has_multi_prompt_data:
            print("    No bigrams met the multi-prompt criteria.")

        if top_trigrams:
            print("    Top multi-prompt Trigrams:")
            for tg, freq in top_trigrams[:5]: # Limit printout
                print(f"      - {' '.join(tg)} ({freq})")
        elif has_multi_prompt_data:
            print("    No trigrams met the multi-prompt criteria.")


    # --- Add Overall Scores from Runs Data ---
    print("\nAdding overall benchmark scores from runs data...")
    # Use the same runs_data loaded earlier
    for run_key, run_data in runs_data.items():
         model_name = run_data.get("test_model")
         if not model_name or model_name not in results_with_metrics: continue

         # Find the calculated benchmark results within the run data
         bench_results = run_data.get("results", {}).get("benchmark_results", {})
         overall_score_100 = bench_results.get("eqbench_longform_score_0_100")
         overall_score_20 = bench_results.get("overall_score_0_20")

         # Add scores to our results dict (potentially overwriting if multiple runs exist - last one wins)
         if overall_score_100 is not None:
             results_with_metrics[model_name]['overall_score_100'] = overall_score_100
         if overall_score_20 is not None:
             results_with_metrics[model_name]['overall_score_20'] = overall_score_20


    # --- Set default values for models missing metrics or scores ---
    all_model_names = set(results_with_metrics.keys())
    default_values = {
        'avg_chapter_length': 0.0, 'vocab_complexity': 'N/A', 'slop_score': 'N/A',
        'repetition_score': 0.0, 'top_repetitive_words': [],
        'top_multi_prompt_bigrams': [], 'top_multi_prompt_trigrams': [],
        'overall_score_100': 'N/A', 'overall_score_20': 'N/A',
        'top_5_similar': [] # --- ADDED: Default for similarity ---
    }
    for model_name in all_model_names:
        for key, default_value in default_values.items():
            results_with_metrics[model_name].setdefault(key, default_value)


    # --- Print CSV Results ---
    print("\n--- Aggregated Metrics & Scores (CSV Format) ---")
    # <<< MODIFIED: Update header columns >>>
    header_cols = [
        "model_name",
        "overall_score_100",
        "avg_chapter_length",
        "vocab_complexity",
        "slop_score",
        "repetition_score",
    ]
    # Add the 8 chapter columns + final (using fixed 8 for header)
    for i in range(1, 8 + 1):
        header_cols.append(f"chapter{i}_avg")
    header_cols.append("final_judgement_avg")
    # <<< END MODIFIED >>>

    print(",".join(header_cols))

    # Sort models as before
    sorted_models = sorted(
        results_with_metrics.items(),
        key=lambda item: item[1].get("overall_score_100", -float('inf'))
            if isinstance(item[1].get("overall_score_100"), (int, float)) else -float('inf'),
        reverse=True
    )

    # Use the runs_data loaded at the start of the function
    if not runs_data:
        print("Warning: Could not load runs data for calculating chapter/final averages for CSV.")

    for model_name, data in sorted_models:
        if model_name in MODELS_TO_IGNORE:
            continue

        updated_name = get_updated_model_name(model_name)

        score_100 = data.get('overall_score_100', 'N/A')
        score_100_display = f"{score_100:.1f}" if isinstance(score_100, (int, float)) else 'N/A'
        avg_len = data.get('avg_chapter_length', 'N/A')
        avg_len_display = f"{avg_len:.0f}" if isinstance(avg_len, (int, float)) else 'N/A'
        vocab = data.get('vocab_complexity', 'N/A')
        vocab_display = f"{float(vocab):.2f}" if isinstance(vocab, (int, float)) else str(vocab)
        slop = data.get('slop_score', 'N/A')
        slop_display = f"{float(slop):.2f}" if isinstance(slop, (int, float)) else str(slop)
        repetition = data.get('repetition_score', 'N/A')
        repetition_display = f"{float(repetition):.1f}" if isinstance(repetition, (int, float)) else str(repetition)

        # <<< MODIFIED: Gather the new 9 columns using the helper >>>
        if runs_data:
            # Pass the loaded runs_data here
            c_avgs_0_20, final_avg_0_20 = compute_global_chapter_final_averages(model_name, runs_data)
        else:
            # Default if runs data failed to load
            c_avgs_0_20 = ["N/A"] * 8 # Default to 8 N/A values
            final_avg_0_20 = "N/A"

        c_avgs_str = []
        for val in c_avgs_0_20: # Should be length 8 now
            # Format to 2 decimal places if float, else keep "N/A"
            c_avgs_str.append(f"{val:.2f}" if isinstance(val, float) else "N/A")

        final_avg_str = f"{final_avg_0_20:.2f}" if isinstance(final_avg_0_20, float) else "N/A"
        # <<< END MODIFIED >>>

        safe_model_name = f"\"{updated_name}\"" if ',' in updated_name else updated_name

        # <<< MODIFIED: Construct the row with new values >>>
        row_values = [
            safe_model_name,
            score_100_display,
            avg_len_display,
            vocab_display,
            slop_display,
            repetition_display,
        ]
        row_values.extend(c_avgs_str)  # chapter1..chapter8 averages
        row_values.append(final_avg_str) # final average
        # <<< END MODIFIED >>>

        print(",".join(row_values))


    # --- ADDED: call combined Jaccard similarity function ---
    # Calculate similarities *after* all metrics are populated
    calculate_combined_jaccard_similarities(results_with_metrics, top_n=1500)


    # --- Save Updated Results Data ---
    if save_updated_results:
        print(f"\nSaving combined results with metrics (and similarity) to {ELO_RESULTS_WITH_METRICS_FILE}...")
        try:
            # Ensure results directory exists
            os.makedirs(os.path.dirname(ELO_RESULTS_WITH_METRICS_FILE), exist_ok=True)
            with open(ELO_RESULTS_WITH_METRICS_FILE, 'w', encoding='utf-8') as f:
                json.dump(results_with_metrics, f, indent=2, ensure_ascii=False)
            print("Save successful.")
        except IOError as e:
            print(f"Error saving updated results data to {ELO_RESULTS_WITH_METRICS_FILE}: {e}")
        except TypeError as e:
             print(f"Error serializing updated results data to JSON: {e}. Check for non-serializable types.")


    # --- Generate and Print Slop Profile String ---
    if print_slop_profile:
        print("\n--- Generating Slop Profile String for JS ---")
        try:
            # Assuming format_slop_profile_string is defined and updated
            slop_profile_output = format_slop_profile_string(results_with_metrics)
            print("\n----- BEGIN SLOP PROFILE STRING -----")
            print(slop_profile_output)
            print("----- END SLOP PROFILE STRING -----\n")
        except NameError:
            print("`format_slop_profile_string` function not defined. Skipping slop profile string generation.")
        except Exception as e:
            print(f"Error generating slop profile string: {e}")

# --- Slop Profile Formatting Function (Adapted from Creative Writing) ---
def format_slop_profile_string(results_data: Dict[str, Dict]) -> str:
    """
    Formats repetitive word, n-gram, and similarity data into a single multi-line string
    with HTML formatting, suitable for embedding in JS.
    """
    output_string = ""
    # Sort models by overall score (matching the CSV output)
    sorted_models = sorted(
        results_data.items(),
        key=lambda item: item[1].get("overall_score_100", -float('inf'))
            if isinstance(item[1].get("overall_score_100"), (int, float)) else -float('inf'),
        reverse=True
    )

    for model_name, data in sorted_models:
        if model_name in MODELS_TO_IGNORE: continue
        updated_name = get_updated_model_name(model_name)
        output_string += f"##### {updated_name}\n"

        # --- ADDED: Model Similarity Section ---
        top_5 = data.get("top_5_similar", [])
        if top_5:
            output_string += "<h4>Most Similar To:</h4>\n"
            output_string += "<div class='slop-similar-section'>\n"
            for item in top_5:
                # Ensure distance is formatted correctly
                dist_str = f"{item.get('distance', float('nan')):.3f}"
                output_string += f"<div class='slop-similar'>{html.escape(item.get('model', 'N/A'))} (distance={dist_str})</div>\n"
            output_string += "</div>\n"
            output_string += "\n" # Add spacing after similarity section

        # --- Top Repetitive Words ---
        rep_words = data.get('top_repetitive_words', [])
        output_string += "<h4>Top Repetitive Words</h4>\n"
        if rep_words:
            output_string += "<div class='slop-section-items'>\n"
            items_html = [f"<span class='slop-word-item'>{html.escape(item.get('word', 'N/A'))}</span>" for item in rep_words[:50]]
            output_string += " ".join(items_html) + "\n</div>\n"
        else: output_string += "<p><i>No multi-prompt repetitive words found.</i></p>\n"

        # --- Top Multi-Prompt Bigrams ---
        bigrams = data.get('top_multi_prompt_bigrams', [])
        output_string += "<h4>Top Bigrams</h4>\n"
        if bigrams:
            output_string += "<div class='slop-section-items'>\n"
            items_html = [f"<span class='slop-ngram-item'>{html.escape(item.get('ngram', 'N/A'))} ({item.get('frequency', 0)})</span>" for item in bigrams[:30]]
            output_string += " ".join(items_html) + "\n</div>\n"
        else: output_string += "<p><i>No multi-prompt bigrams found.</i></p>\n"

        # --- Top Multi-Prompt Trigrams ---
        trigrams = data.get('top_multi_prompt_trigrams', [])
        output_string += "<h4>Top Trigrams</h4>\n"
        if trigrams:
            output_string += "<div class='slop-section-items'>\n"
            items_html = [f"<span class='slop-ngram-item'>{html.escape(item.get('ngram', 'N/A'))} ({item.get('frequency', 0)})</span>" for item in trigrams[:30]]
            output_string += " ".join(items_html) + "\n</div>\n"
        else: output_string += "<p><i>No multi-prompt trigrams found.</i></p>\n"

        output_string += "\n" # Add a blank line between models
    return output_string.strip()
# -

# ## Utility Functions for Listing/Viewing
# (No changes needed in this section)

# +
def list_available_models():
    """List all models available in the results file."""
    # Reads the combined results file now
    results_data = load_json_file(RUNS_FILE) # Use the correct file name
    if False:
        if not results_data:
            # Fallback to runs file if combined file doesn't exist
            #print(f"Warning: '{ELO_RESULTS_WITH_METRICS_FILE}' not found. Listing models from '{RUNS_FILE}'.")
            runs_data = load_json_file(RUNS_FILE)
            if not runs_data:
                print("No runs data found.")
                return []
            models_in_runs = set(v.get("test_model") for v in runs_data.values() if v.get("test_model"))
            models = [(name, 'N/A') for name in sorted(list(models_in_runs))] # No score info here
        #else:
    models_with_scores = []
    for run_id, model_data in results_data.items():
        model_name = model_data['test_model']
        score = model_data.get('results', {}).get('benchmark_results', {}).get('eqbench_longform_score_0_100', float('-inf')) #.get("overall_score_100", -float('inf'))
        models_with_scores.append((model_name, score))
    # Sort by score
    #models_with_scores.sort(key=lambda x: x[1] if isinstance(x[1], (int, float)) else -float('inf'), reverse=True)
    models = models_with_scores # Use the sorted list

    print("Available models (Sorted by Overall Score):")
    displayed_count = 0
    # Store models excluding ignored ones
    available_models_list = []
    for rank, (name, score) in enumerate(models, 1):
        if name in MODELS_TO_IGNORE: continue
        score_display = f"{score:.1f}" if isinstance(score, (int, float)) else "N/A"
        print(f"{rank}. {get_updated_model_name(name)} (Score: {score_display})")
        available_models_list.append((name, score)) # Store the original name and score
        displayed_count += 1

    if displayed_count == 0: print("No models found (after filtering).")
    # Return the list of tuples (original_name, score) for further use
    return available_models_list


def list_model_runs(model_name):
    """List all runs available for a specific model."""
    runs_data = load_json_file(RUNS_FILE)
    if not runs_data:
        print("No runs data found.")
        return []
    matching_runs = []
    for key, data in runs_data.items():
        if data.get("test_model") == model_name:
            start_time = data.get("start_time", "Unknown Time")
            status = data.get("status", "Unknown Status")
            matching_runs.append((key, start_time, status))

    if not matching_runs:
        print(f"No runs found for model: {model_name}")
        return []

    print(f"\nAvailable runs for {get_updated_model_name(model_name)}:")
    matching_runs.sort(key=lambda x: x[0]) # Sort by run key
    for idx, (key, time, status) in enumerate(matching_runs, 1):
        print(f"{idx}. {key} (Started: {time}, Status: {status})")
    return [key for key, _, _ in matching_runs]

def view_model_report(model_name, run_key=None, save_to_file=False):
    """Display the HTML report for a given model and optionally save it."""
    # Ensure results directory exists if saving
    if save_to_file:
         os.makedirs(RESULTS_DIR, exist_ok=True)
    report = generate_model_report(model_name, run_key, save_to_file)
    display(report)

def save_model_report(model_name, run_key=None):
    """Generate and save the HTML report for a given model."""
     # Ensure results directory exists
    os.makedirs(RESULTS_DIR, exist_ok=True)
    # Call generate_model_report with save_to_file=True, it handles the saving
    generate_model_report(model_name, run_key, save_to_file=True)
# -

# ## Main Execution Block

# +
if __name__ == "__main__":
    # 1. List available models
    print("--- Available Models ---")
    # list_available_models now returns a list of (name, score) tuples
    models_with_scores = list_available_models()
    # Extract just the names for iteration if needed
    models = [name for name, score in models_with_scores]
    print("-" * 24)

    # 2. Calculate and print the aggregated metrics
    #    Set save_updated_results=True to save to ELO_RESULTS_WITH_METRICS_FILE
    #    Set print_slop_profile=True to see the JS string output
    if metrics_available:
        # Pass the correct flags
        calculate_and_print_metrics(save_updated_results=True, print_slop_profile=True)
        print("-" * 24)
    else:
        print("Skipping metrics calculation as functions were not imported.")

    # 3. Example: Generate and save reports for *all* available models
    print("\nGenerating and saving HTML reports for all models...")
    if models:
        for model_name in models: # Iterate through the list of names
            # No need to check MODELS_TO_IGNORE again, list_available_models already filtered
            print(f"Processing report for: {get_updated_model_name(model_name)}")
            try:
                # Pass save_to_file=True implicitly by calling save_model_report
                save_model_report(model_name)
            except Exception as e:
                print(f"  ERROR generating report for {model_name}: {e}")
                import traceback
                traceback.print_exc() # Print stack trace for debugging
        print("\nFinished saving reports.")
    else:
        print("\nNo models found to generate reports for.")

    # 4. Example: View a report directly in IPython/Jupyter (if available)
    #    Uncomment the block below to enable direct viewing in Jupyter
    # if models and 'IPython' in sys.modules:
    #     first_model_name = models[0] # Get the name of the first model
    #     print(f"\nDisplaying report for {get_updated_model_name(first_model_name)} in IPython...")
    #     view_model_report(first_model_name) # Display the first model's report
    # else:
    #      print("\nSkipping direct display (not in IPython or no models found).")

    print("\nScript finished.")
# -

# ## Manual Report Viewing
# Use this cell to view a specific model's report after running the cells above.
# It now uses the `models_with_scores` list generated in the main block.

# + active=""
# # Ensure models_with_scores exists or re-run the main block
# if 'models_with_scores' not in locals():
#      print("Model list not found. Please run the 'Main Execution Block' cell first.")
#      models_with_scores = [] # Avoid error if list is missing
#
# # Extract just the names for checking existence
# available_model_names = [name for name, score in models_with_scores]
#
# # Example: View the report for the first model found
# if models_with_scores:
#      model_to_view, _ = models_with_scores[0] # Get name and score, use name
#      print(f"Viewing report for: {get_updated_model_name(model_to_view)}")
#      view_model_report(model_to_view)
# else:
#      print("No models available to view.")
#
# # Example: View a specific model by name
# model_name_specific = "openai/gpt-4o-mini" # Replace with the actual model name from the list
# if model_name_specific in available_model_names: # Check if it exists
#       print(f"Viewing report for: {get_updated_model_name(model_name_specific)}")
#       view_model_report(model_name_specific)
# else:
#       print(f"Model '{model_name_specific}' not found in available models.")
#
# # Example: View a specific run for a model
# model_name_for_run = "openai/gpt-4o-mini" # Replace if needed
# if model_name_for_run in available_model_names:
#     available_runs = list_model_runs(model_name_for_run)
#     if available_runs:
#         run_to_view = available_runs[0] # View the first run found
#         print(f"Viewing report for: {get_updated_model_name(model_name_for_run)}, Run: {run_to_view}")
#         view_model_report(model_name_for_run, run_key=run_to_view)
#     else:
#          print(f"No runs found for model '{model_name_for_run}' to view specific run.")
# else:
#     print(f"Model '{model_name_for_run}' not found, cannot list runs.")

# -

# ## Manual Report Saving
# Use this cell to save a report for a specific model without displaying it.

# + active=""
# # Ensure models_with_scores exists or re-run the main block
# if 'models_with_scores' not in locals():
#      print("Model list not found. Please run the 'Main Execution Block' cell first.")
#      models_with_scores = [] # Avoid error if list is missing
#
# # Extract just the names for checking existence
# available_model_names_save = [name for name, score in models_with_scores]
#
# # Example: Save the report for the first model found
# if models_with_scores:
#      model_to_save, _ = models_with_scores[0]
#      print(f"Saving report for: {get_updated_model_name(model_to_save)}")
#      save_model_report(model_to_save)
# else:
#      print("No models available to save.")
#
# # Example: Save a specific model by name
# model_name_specific_save = "openai/gpt-4o-mini" # Replace with the actual model name
# if model_name_specific_save in available_model_names_save:
#       print(f"Saving report for: {get_updated_model_name(model_name_specific_save)}")
#       save_model_report(model_name_specific_save)
# else:
#       print(f"Model '{model_name_specific_save}' not found in available models.")

# -


Successfully imported metrics functions from core.metrics
Loaded 13 prompt templates for 13 steps (5 planning, 8 chapters).
--- Available Models ---
Available models (Sorted by Overall Score):
1. gpt-4o-mini (Score: 55.2)
2. google/gemma-3-4b-it (Score: 47.3)
3. gemini-2.0-flash-001 (Score: 55.1)
4. google/gemma-3-27b-it (Score: 59.3)
5. claude-3-7-sonnet-20250219 (Score: 77.6)
6. deepseek-ai/DeepSeek-R1 (Score: 74.6)
7. chatgpt-4o-latest-2025-03-27 (Score: 76.8)
8. qwen/qwq-32b (Score: 60.8)
9. deepseek-ai/DeepSeek-V3-0324 (Score: 78.1)
10. google/gemini-2.5-pro-exp-03-25:free (Score: -inf)
11. RekaAI/reka-flash-3 (Score: 51.8)
12. gemini-2.5-pro-exp-03-25 (Score: -inf)
13. sam-paech/Darkest-muse-v1 (Score: -inf)
14. quasar-alpha (Score: 73.7)
15. gemini-2.5-pro-preview-03-25 (Score: 80.2)
16. meta-llama/Llama-4-Maverick-17B-128E-Instruct (Score: 39.7)
17. meta-llama/Llama-4-Scout-17B-16E-Instruct (Score: 35.9)
18. google/gemma-3-12b-it (Score: 51.7)
19. mistralai/Mistral-Nemo-Instruc