In [1]:
# Mount Google Drive at /content/drive; the evaluation cell below reads its
# helper module and YAML config from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install third-party dependencies into the runtime.
# NOTE(review): no versions are pinned, so each run fetches whatever is latest.
!pip install unsloth
!pip install selectolax


Collecting unsloth
  Downloading unsloth-2025.11.3-py3-none-any.whl.metadata (61 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/61.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.8/61.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.11.4 (from unsloth)
  Downloading unsloth_zoo-2025.11.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting data

**Metrics Reload**

In [None]:
import importlib
import sys

# Directory that holds the project-local metrics.py. It has to be on sys.path
# before the import below, otherwise this cell fails on a fresh kernel where the
# evaluation cell (which also appends this path) has not run yet.
_METRICS_DIR = '/content/drive/MyDrive/Colab Notebooks/GenAI/evaluation'
if _METRICS_DIR not in sys.path:
    sys.path.append(_METRICS_DIR)

# Check if 'metrics' is already imported and reload if necessary, so that edits
# made to metrics.py on disk are picked up without restarting the kernel.
if 'metrics' in sys.modules:
    print('Reloading metrics module...')
    importlib.reload(sys.modules['metrics'])
else:
    print('metrics module not previously imported. No reload needed.')

# Re-bind every name the evaluation code imports from metrics. Names that are
# not re-imported here would keep pointing at the pre-reload objects.
from metrics import (
    PersonaMetrics,
    EvaluationMetrics,
    aggregate_metrics,
    compare_models,
    get_best_and_worst_examples,
)


**Eval**

In [3]:
import os
import yaml
import json
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TextStreamer
from tqdm import tqdm
import logging
import time
import sys
from typing import Any, Dict, List, Tuple, Optional
from datetime import datetime

sys.path.append('/content/drive/MyDrive/Colab Notebooks/GenAI/evaluation')

# Import our custom metric calculators
from metrics import (
    PersonaMetrics,
    EvaluationMetrics,
    aggregate_metrics,
    compare_models,
    get_best_and_worst_examples
)

# --- Setup Logging ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler()
    ],
    # basicConfig() silently does nothing when the root logger already has
    # handlers, which is common inside notebook kernels and always true when
    # this cell is re-run. force=True replaces any existing root handlers so the
    # INFO-level configuration above actually takes effect.
    force=True,
)
# Module-level logger used by every helper in this cell.
log = logging.getLogger(__name__)

# --- Helper Functions ---

def load_config(config_path: str) -> dict:
    """Load the evaluation configuration from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed top-level mapping.

    Raises:
        ValueError: If the document does not parse to a mapping (an empty file
            parses to ``None``), so the problem surfaces here instead of as an
            opaque ``TypeError`` on the first key lookup downstream.
        Exception: Any error raised while opening or parsing the file is logged
            and re-raised unchanged.
    """
    log.info(f"Loading configuration from {config_path}...")
    try:
        # Explicit encoding keeps parsing independent of the platform default.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
        if not isinstance(config, dict):
            raise ValueError(
                f"Expected a mapping at the top level of {config_path}, "
                f"got {type(config).__name__}"
            )
        log.info("Configuration loaded successfully.")
        return config
    except Exception as e:
        log.error(f"Failed to load config: {e}")
        raise


def setup_tokenizer(tokenizer: Any) -> Any:
    """Ensure the tokenizer has a padding token.

    When ``pad_token`` is unset, the tokenizer is mutated in place to reuse its
    ``eos_token`` for padding. The same tokenizer object is returned either way.
    """
    already_has_pad = tokenizer.pad_token is not None
    if already_has_pad:
        return tokenizer

    # No dedicated pad token: fall back to the end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token
    log.info(f"Set pad_token to eos_token: {tokenizer.eos_token}")
    return tokenizer


def load_models(config: dict) -> Tuple:
    """Load the baseline model and the adapter-augmented fine-tuned model.

    Both models are created from the same base checkpoint
    (``config['baseline_model']['id']``) with 4-bit loading requested. The
    fine-tuned one additionally loads the adapter stored at
    ``config['data']['tuned_model_path']`` and enables it.

    Returns:
        ``((base_model, base_tokenizer), (tuned_model, tuned_tokenizer), device)``
        where ``device`` is ``"cuda"`` when CUDA is available and ``"cpu"``
        otherwise.

    Raises:
        Exception: Any loading failure is logged and re-raised.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    log.info(f"Using device: {device}")

    baseline_cfg = config['baseline_model']
    max_seq_len = baseline_cfg['max_seq_length']
    base_model_id = baseline_cfg['id']
    tuned_model_path = config['data']['tuned_model_path']

    # Identical loader arguments are used for both model instances.
    shared_load_args = {
        "model_name": base_model_id,
        "max_seq_length": max_seq_len,
        "dtype": None,
        "load_in_4bit": True,
    }

    # Step 1: baseline model, used as-is.
    log.info(f"Loading baseline model: {base_model_id}")
    try:
        base_model, base_tokenizer = FastLanguageModel.from_pretrained(**shared_load_args)
        base_tokenizer = setup_tokenizer(base_tokenizer)
        log.info("‚úì Baseline model loaded successfully.")
    except Exception as load_error:
        log.error(f"Failed to load baseline model: {load_error}")
        raise

    # Step 2: second copy of the base model, then attach the trained adapter.
    log.info(f"Loading fine-tuned model from: {tuned_model_path}")
    try:
        tuned_model, tuned_tokenizer = FastLanguageModel.from_pretrained(**shared_load_args)

        tuned_model.load_adapter(tuned_model_path)
        tuned_model.enable_adapters()
        tuned_tokenizer = setup_tokenizer(tuned_tokenizer)

        log.info("‚úì Fine-tuned model loaded successfully.")
    except Exception as load_error:
        log.error(f"Failed to load fine-tuned model: {load_error}")
        raise

    baseline_bundle = (base_model, base_tokenizer)
    tuned_bundle = (tuned_model, tuned_tokenizer)
    return baseline_bundle, tuned_bundle, device


def load_test_data(config: dict) -> Any:
    """Load the test dataset and optionally truncate it.

    Reads the JSON file named by ``config['data']['test_file']``. When
    ``config['evaluation']['limit_samples']`` is a positive number, only the
    first ``limit_samples`` records (capped at the dataset size) are kept.
    A missing, ``null`` or zero limit means "use every record".

    Returns:
        The loaded (and possibly truncated) dataset.

    Raises:
        Exception: Any failure while loading is logged and re-raised.
    """
    test_file = config['data']['test_file']
    log.info(f"Loading test data from: {test_file}")

    try:
        dataset = load_dataset("json", data_files=test_file, split="train")

        # `limit_samples: null` in the YAML arrives here as None; `or 0` keeps the
        # comparison below from raising TypeError and treats it as "no limit".
        limit = config['evaluation'].get('limit_samples') or 0
        if limit > 0:
            log.warning(f"‚ö†Ô∏è  Limiting evaluation to {limit} samples for testing.")
            dataset = dataset.select(range(min(limit, len(dataset))))

        log.info(f"‚úì Loaded {len(dataset)} test samples.")
        return dataset

    except Exception as e:
        log.error(f"Failed to load test data: {e}")
        raise


def create_prompt_from_record(record: dict, tokenizer: Any) -> Optional[str]:
    """
    Converts a data record into a formatted prompt string for inference.
    Must match the format used in data_loader.py from Task 2.

    The prompt is a two-message chat (system + user) rendered through
    ``tokenizer.apply_chat_template`` with the generation prompt appended.

    Args:
        record: A test record. ``persona`` and ``world_facts`` are expected to be
            lists of strings and ``dialog`` a list of turns with a ``text`` field.
            A field that is missing *or present with a ``None`` value* falls back
            to its default; a bare string is treated as a single line.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The formatted prompt, or ``None`` if prompt construction failed (the
        failure is logged).
    """
    try:
        def _joined_lines(key: str, fallback: List[str]) -> str:
            # `.get(key, default)` only covers an absent key; a key that is
            # present with a None value must fall back to the default as well.
            value = record.get(key)
            if value is None:
                value = fallback
            elif isinstance(value, str):
                # Joining a bare string would interleave newlines between characters.
                value = [value]
            return "\n".join(value)

        messages = []

        # 1. System Prompt
        persona = _joined_lines("persona", ["I am a standard NPC."])
        world_facts = _joined_lines("world_facts", ["No specific context."])

        system_prompt = f"""You are a humanized video game NPC. You must speak naturally, stay in character, and respond in valid JSON format.

<Persona>
{persona}
</Persona>

<WorldFacts>
{world_facts}
</WorldFacts>

<Rules>
- Respond with a valid JSON object: {{"utterance": "...", "mood": "..."}}
- Your "utterance" must be conversational and in-character.
- Keep responses short and natural.
</Rules>
"""
        messages.append({"role": "system", "content": system_prompt})

        # 2. Player Query (first dialog turn)
        dialog = record.get("dialog")
        first_turn = dialog[0] if dialog else None
        player_query = first_turn.get("text") if first_turn else None
        if player_query is None:
            # No dialog, no first turn, or a turn without text: use a neutral greeting.
            player_query = "Hello."

        # Do NOT add "Player: " prefix - the template handles this
        messages.append({"role": "user", "content": player_query})

        # 3. Apply chat template
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        return formatted_prompt

    except Exception as e:
        log.error(f"Failed to create prompt for record {record.get('id')}: {e}")
        return None


@torch.no_grad()
def generate_response(
    model: Any,
    tokenizer: Any,
    prompt: str,
    gen_config: dict
) -> str:
    """
    Generates a single response from a model.

    Args:
        model: Model exposing ``config.max_position_embeddings``, ``device`` and
            ``generate``.
        tokenizer: Tokenizer used both to encode ``prompt`` and decode the output.
        prompt: Prompt text that has already been rendered by the chat template.
        gen_config: Generation settings. ``max_new_tokens`` and ``do_sample`` are
            always read; ``temperature`` and ``top_p`` are read only when
            ``do_sample`` is truthy.

    Returns:
        The decoded continuation with surrounding whitespace and any literal
        ``<|im_end|>`` marker removed, or ``""`` if anything fails (the error is
        logged; this is a deliberate best-effort so one bad sample does not abort
        a long evaluation run).
    """
    try:
        max_new_tokens = gen_config['max_new_tokens']

        # Calculate safe max length: leave room for the tokens to be generated.
        max_input_length = model.config.max_position_embeddings - max_new_tokens

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
            # The chat template already wrote the special tokens into `prompt`;
            # letting the tokenizer add them again can duplicate e.g. BOS.
            add_special_tokens=False,
        ).to(model.device)

        do_sample = gen_config['do_sample']
        generate_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling knobs only matter when sampling; passing them under
            # greedy decoding has no effect on the output.
            generate_kwargs["temperature"] = gen_config['temperature']
            generate_kwargs["top_p"] = gen_config['top_p']

        outputs = model.generate(**inputs, **generate_kwargs)

        # Decode only newly generated tokens
        prompt_length = inputs['input_ids'].shape[1]
        response_text = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True
        )

        # Clean up artifacts
        response_text = response_text.strip().replace("<|im_end|>", "").strip()

        return response_text

    except Exception as e:
        log.error(f"Generation failed: {e}")
        return ""


def save_detailed_results(
    results: List[Dict[str, Any]],
    output_file: str
) -> None:
    """Saves detailed generation results to JSONL file.

    Each element of ``results`` is written as one JSON object per line.

    Args:
        results: Records to serialize.
        output_file: Destination path. Its parent directory is created if
            needed; a bare filename (no directory part) writes to the current
            working directory.

    Write or serialization errors are logged rather than raised.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            for result in results:
                f.write(json.dumps(result) + "\n")
        log.info(f"‚úì Detailed results saved to: {output_file}")
    except Exception as e:
        log.error(f"Failed to save detailed results: {e}")


def save_report(
    report: Dict[str, Any],
    output_file: str
) -> None:
    """Saves final aggregated report to JSON file.

    Args:
        report: The report mapping to serialize (written with 4-space indent).
        output_file: Destination path. Its parent directory is created if
            needed; a bare filename (no directory part) writes to the current
            working directory.

    Write or serialization errors are logged rather than raised.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=4)
        log.info(f"‚úì Final report saved to: {output_file}")
    except Exception as e:
        log.error(f"Failed to save report: {e}")


def print_summary_table(baseline_report: dict, tuned_report: dict, comparison: dict) -> None:
    """Print a fixed-width baseline vs. fine-tuned comparison table to stdout.

    Each row looks up one key in both reports. A missing key (or a ``None``
    value) is shown as ``N/A``; otherwise the value is rendered either as a
    percentage with two decimals or as a float with four decimals.
    ``comparison`` is accepted but not used for the printed table.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def as_percent(value):
        return "N/A" if value is None else f"{value*100:.2f}%"

    def as_decimal(value):
        return "N/A" if value is None else f"{value:.4f}"

    # One inner list per visual section; sections are separated by a light rule.
    sections = [
        # Schema & constraints
        [
            ("Schema Validity", "avg_is_valid_schema", as_percent),
            ("Brief (<60 tokens)", "avg_is_brief", as_percent),
            ("Clean (no banlist)", "avg_is_clean", as_percent),
        ],
        # Persona faithfulness
        [
            ("Persona Contradiction", "avg_persona_contradiction", as_percent),
            ("Persona Similarity (max)", "avg_persona_similarity_max", as_decimal),
        ],
        # Hallucination
        [
            ("UCR (Hallucination)", "avg_ucr", as_percent),
            ("NEP (Grounding Precision)", "avg_nep", as_percent),
        ],
        # Diversity
        [
            ("Distinct-1 (Diversity)", "diversity_distinct_1", as_decimal),
            ("Distinct-2 (Diversity)", "diversity_distinct_2", as_decimal),
            ("Entropy", "diversity_entropy", as_decimal),
        ],
    ]

    print("\n" + heavy_rule)
    print("           EVALUATION REPORT SUMMARY")
    print(heavy_rule)
    print(f"{'Metric':<30} {'Baseline':<15} {'Fine-Tuned':<15}")
    print(light_rule)

    final_section = len(sections) - 1
    for position, rows in enumerate(sections):
        for label, key, render in rows:
            baseline_cell = render(baseline_report.get(key))
            tuned_cell = render(tuned_report.get(key))
            print(f"{label:<30} {baseline_cell:<15} {tuned_cell:<15}")
        # The last section is closed by the heavy rule instead of a light one.
        print(heavy_rule if position == final_section else light_rule)

    print()


def generate_examples_report(
    baseline_results: List[Dict],
    tuned_results: List[Dict],
    output_file: str,
    n: int = 3
) -> None:
    """Generates a report with best and worst examples.

    For each model, ``get_best_and_worst_examples`` is asked for the ``n`` best
    and worst results by ``persona_similarity_max``; the four lists are written
    to ``output_file`` as JSON.

    Args:
        baseline_results: Per-sample metric dicts for the baseline model.
        tuned_results: Per-sample metric dicts for the fine-tuned model.
        output_file: Destination JSON path; its parent directory is created if
            it does not exist, matching the other save helpers.
        n: Number of examples requested per list.

    Directory-creation, write and serialization errors are logged, not raised.
    """

    baseline_best = get_best_and_worst_examples(
        baseline_results,
        metric_key="persona_similarity_max",
        n=n
    )

    tuned_best = get_best_and_worst_examples(
        tuned_results,
        metric_key="persona_similarity_max",
        n=n
    )

    examples_report = {
        "baseline_best_examples": baseline_best["best"],
        "baseline_worst_examples": baseline_best["worst"],
        "tuned_best_examples": tuned_best["best"],
        "tuned_worst_examples": tuned_best["worst"]
    }

    try:
        # Create the target directory like the sibling save_* helpers do; a bare
        # filename has an empty dirname, which os.makedirs would reject.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(examples_report, f, indent=4)
        log.info(f"‚úì Examples report saved to: {output_file}")
    except Exception as e:
        log.error(f"Failed to save examples report: {e}")


def run_evaluation(config: dict) -> None:
    """Main evaluation pipeline.

    Runs, in order: model and test-data loading, metric-computer construction,
    a per-record generation loop over both models, metric aggregation, a
    baseline-vs-tuned comparison, report assembly, writing of three output
    files, and a printed summary table.

    Args:
        config: Parsed evaluation configuration. This function reads
            ``config['evaluation']`` (``generation_config``, ``metrics``,
            optional ``store_detailed_results``), ``config['outputs']``
            (``generation_file``, ``report_file``, optional ``examples_file``),
            ``config['data']['tuned_model_path']`` and
            ``config['baseline_model']['id']``; the loader helpers read more.

    Notes:
        - Records for which either prompt cannot be built are skipped and listed
          under ``failed_records`` in the final report.
        - ``generate_response`` returns an empty string on failure; such empty
          responses are still passed to the metric computer.
        - All result files are written only after the whole loop has finished.
    """

    start_time_total = time.time()

    # --- 1. Load Models and Data ---
    log.info("="*60)
    log.info("STEP 1: Loading Models and Data")
    log.info("="*60)

    (base_model, base_tokenizer), (tuned_model, tuned_tokenizer), device = load_models(config)
    test_dataset = load_test_data(config)

    gen_config = config['evaluation']['generation_config']

    # --- 2. Initialize Metric Computers ---
    log.info("="*60)
    log.info("STEP 2: Initializing Metric Models")
    log.info("="*60)

    persona_computer = PersonaMetrics(
        nli_model_name=config['evaluation']['metrics']['nli_model'],
        embedding_model_name=config['evaluation']['metrics']['embedding_model'],
        device=device
    )

    metrics_computer = EvaluationMetrics(
        persona_metrics_computer=persona_computer,
        sim_threshold=config['evaluation']['metrics']['persona_similarity_threshold'],
        store_detailed_results=config['evaluation'].get('store_detailed_results', True)
    )

    # --- 3. Run Generation Loop ---
    log.info("="*60)
    log.info("STEP 3: Running Generation and Evaluation")
    log.info("="*60)

    # baseline_results / tuned_results stay index-aligned: both are appended to
    # in the same iteration, and skipped records are appended to neither.
    baseline_results = []
    tuned_results = []
    generation_records = []

    failed_prompts = []

    start_time_gen = time.time()

    for idx, item in enumerate(tqdm(test_dataset, desc="Evaluating Test Set")):

        # Create prompts (one per tokenizer, since each applies its own chat template)
        base_prompt = create_prompt_from_record(item, base_tokenizer)
        tuned_prompt = create_prompt_from_record(item, tuned_tokenizer)

        # create_prompt_from_record returns None on failure; skip the record for
        # BOTH models so the two result lists remain comparable.
        if not base_prompt or not tuned_prompt:
            failed_prompts.append({
                "index": idx,
                "record_id": item.get("id"),
                "reason": "Prompt creation failed"
            })
            log.warning(f"‚ö†Ô∏è  Skipping record {item.get('id')} - prompt creation failed")
            continue

        # Generate responses
        base_response = generate_response(base_model, base_tokenizer, base_prompt, gen_config)
        tuned_response = generate_response(tuned_model, tuned_tokenizer, tuned_prompt, gen_config)

        # Compute metrics
        base_metrics = metrics_computer.compute_all(item, base_response)
        tuned_metrics = metrics_computer.compute_all(item, tuned_response)

        baseline_results.append(base_metrics)
        tuned_results.append(tuned_metrics)

        # Store combined record
        generation_record = {
            "record_id": item.get("id"),
            "source_dataset": item.get("source"),
            "persona": item.get("persona"),
            "player_query": item.get("dialog")[0].get("text") if item.get("dialog") else "",
            "baseline": {
                "response": base_response,
                "metrics": base_metrics
            },
            "tuned": {
                "response": tuned_response,
                "metrics": tuned_metrics
            }
        }
        generation_records.append(generation_record)

    end_time_gen = time.time()
    gen_time = end_time_gen - start_time_gen
    # Throughput counts only records that were actually evaluated (skips excluded).
    samples_per_sec = len(generation_records) / gen_time if gen_time > 0 else 0

    log.info(f"‚úì Generation complete: {len(generation_records)} samples in {gen_time:.2f}s ({samples_per_sec:.2f} samples/sec)")

    if failed_prompts:
        log.warning(f"‚ö†Ô∏è  {len(failed_prompts)} prompts failed to generate")

    # --- 4. Aggregate Metrics ---
    log.info("="*60)
    log.info("STEP 4: Aggregating Metrics")
    log.info("="*60)

    baseline_report = aggregate_metrics(baseline_results)
    tuned_report = aggregate_metrics(tuned_results)

    # --- 5. Statistical Comparison ---
    log.info("="*60)
    log.info("STEP 5: Statistical Comparison")
    log.info("="*60)

    comparison = compare_models(baseline_results, tuned_results)

    # --- 6. Create Final Report ---
    # total_time is captured here, so it excludes the file writes below.
    end_time_total = time.time()
    total_time = end_time_total - start_time_total

    final_report = {
        "metadata": {
            "evaluation_date": datetime.now().isoformat(),
            "evaluation_samples": len(generation_records),
            "failed_prompts": len(failed_prompts),
            "total_time_seconds": total_time,
            "generation_time_seconds": gen_time,
            "samples_per_second": samples_per_sec,
            "tuned_model_path": config['data']['tuned_model_path'],
            "baseline_model_id": config['baseline_model']['id'],
            "config": config
        },
        "baseline_model_metrics": baseline_report,
        "tuned_model_metrics": tuned_report,
        "statistical_comparison": comparison,
        "failed_records": failed_prompts
    }

    # --- 7. Save Results ---
    # (Logged as "STEP 6" because report assembly above has no log banner.)
    log.info("="*60)
    log.info("STEP 6: Saving Results")
    log.info("="*60)

    # Save detailed generations
    output_gen_file = config['outputs']['generation_file']
    save_detailed_results(generation_records, output_gen_file)

    # Save final report
    output_report_file = config['outputs']['report_file']
    save_report(final_report, output_report_file)

    # Save examples report (defaults to "<report>_examples.json" next to the report)
    examples_file = config['outputs'].get('examples_file',
        output_report_file.replace('.json', '_examples.json'))
    generate_examples_report(baseline_results, tuned_results, examples_file)

    # --- 8. Print Summary ---
    log.info("="*60)
    log.info("EVALUATION COMPLETE")
    log.info("="*60)

    print_summary_table(baseline_report, tuned_report, comparison)

    log.info(f"üìÅ Files saved:")
    log.info(f"   - Detailed generations: {output_gen_file}")
    log.info(f"   - Final report: {output_report_file}")
    log.info(f"   - Examples: {examples_file}")

    log.info(f"‚úì‚úì‚úì All done! Total time: {total_time:.2f}s ‚úì‚úì‚úì")

if __name__ == "__main__":
    # Load config and run evaluation
    # NOTE: the config path is an absolute location on the mounted Google Drive,
    # so the Drive mount cell has to run before this one.
    config_path = "/content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/eval.yaml"

    try:
        config = load_config(config_path)
        run_evaluation(config)
    except Exception as e:
        # Log a one-line summary, then re-raise so the full traceback is still shown.
        log.error(f"‚ùå Evaluation failed: {e}")
        raise

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Generating train split: 0 examples [00:00, ? examples/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Device set to use cuda:0


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Evaluating Test Set:   1%|          | 6/1000 [00:30<1:10:08,  4.23s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating Test Set: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [1:12:08<00:00,  4.33s/it]



           EVALUATION REPORT SUMMARY
Metric                         Baseline        Fine-Tuned     
------------------------------------------------------------
Schema Validity                99.90%          75.60%         
Brief (<60 tokens)             99.90%          75.50%         
Clean (no banlist)             99.90%          75.60%         
------------------------------------------------------------
Persona Contradiction          0.80%           2.78%          
Persona Similarity (max)       0.2394          0.1897         
------------------------------------------------------------
UCR (Hallucination)            61.13%          58.99%         
NEP (Grounding Precision)      38.87%          41.01%         
------------------------------------------------------------
Distinct-1 (Diversity)         0.1583          0.1671         
Distinct-2 (Diversity)         0.5819          0.6015         
Entropy                        8.6801          8.3547         



**Visualization**

In [4]:
"""
Visualization Script for Evaluation Results
Generates charts and plots for model comparison.
"""

import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Dict, List, Any
import pandas as pd
from collections import Counter

# Set style: seaborn "whitegrid" theme plus notebook-wide matplotlib defaults
# (12x6 inch figures, 10pt base font). Plot functions that pass an explicit
# figsize= to plt.subplots() override the default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10


def load_report(report_path: str) -> Dict[str, Any]:
    """Load the final evaluation report.

    Args:
        report_path: Path to the JSON report file.

    Returns:
        The parsed JSON document.
    """
    # Read with the same encoding the report is written with (UTF-8) instead of
    # relying on the platform default.
    with open(report_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_generations(gen_path: str) -> List[Dict[str, Any]]:
    """Load detailed generation results.

    Reads a JSON Lines file: one JSON document per non-blank line.

    Args:
        gen_path: Path to the ``.jsonl`` file.

    Returns:
        The parsed records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []
    with open(gen_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline added by an editor) are
                # not records; json.loads("") would raise on them.
                continue
            results.append(json.loads(line))
    return results


def plot_metric_comparison(
    report: Dict[str, Any],
    output_dir: str
) -> None:
    """Create bar chart comparing baseline vs fine-tuned metrics.

    Draws one two-bar panel per metric on a 2x3 grid and saves the figure as
    ``metric_comparison.png`` inside ``output_dir`` (created if missing).
    A panel is annotated with ``***`` when the report's statistical comparison
    flags that metric as significant.

    Args:
        report: Final report with ``baseline_model_metrics``,
            ``tuned_model_metrics`` and ``statistical_comparison`` entries.
        output_dir: Directory the PNG is written to.
    """

    baseline = report['baseline_model_metrics']
    tuned = report['tuned_model_metrics']
    comparison = report['statistical_comparison'] or {}

    # display name -> (report key, direction). The direction tag is
    # informational only; it does not influence the drawing.
    metrics_to_plot = {
        'Schema Validity': ('avg_is_valid_schema', 'higher_better'),
        'Persona Similarity': ('avg_persona_similarity_max', 'higher_better'),
        'Hallucination (UCR)': ('avg_ucr', 'lower_better'),
        'Grounding (NEP)': ('avg_nep', 'higher_better'),
        'Distinct-2': ('diversity_distinct_2', 'higher_better'),
    }

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for idx, (display_name, (metric_key, _direction)) in enumerate(metrics_to_plot.items()):
        ax = axes[idx]

        # A metric can be absent or stored as null in the report; draw it as 0
        # instead of crashing in max() / number formatting.
        baseline_val = baseline.get(metric_key) or 0
        tuned_val = tuned.get(metric_key) or 0

        # Get statistical significance. Comparison entries are keyed by the
        # metric name without its 'avg_' / 'diversity_' prefix.
        comp_key = metric_key.replace('avg_', '').replace('diversity_', '')
        is_significant = (comparison.get(comp_key) or {}).get('is_significant', False)

        x = ['Baseline', 'Fine-Tuned']
        y = [baseline_val, tuned_val]
        top = max(y)

        colors = ['#ff7f0e', '#2ca02c']
        bars = ax.bar(x, y, color=colors, alpha=0.7, edgecolor='black')

        # Add significance marker, centred between the two bars
        if is_significant:
            ax.text(0.5, top * 1.05, '***', ha='center', va='bottom',
                   fontsize=16, fontweight='bold')

        # Add value labels
        for bar, val in zip(bars, y):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{val:.3f}',
                   ha='center', va='bottom', fontsize=9)

        ax.set_ylabel('Score')
        ax.set_title(display_name, fontweight='bold')
        # Leave headroom for labels; fall back to a unit axis when both bars are
        # zero so the limits are not identical.
        ax.set_ylim(0, top * 1.15 if top > 0 else 1)

    # Remove every subplot that did not receive a metric
    for unused_ax in axes[len(metrics_to_plot):]:
        fig.delaxes(unused_ax)

    plt.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold', y=1.00)
    plt.tight_layout()

    # savefig does not create missing directories.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'metric_comparison.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved metric comparison: {output_path}")
    # Close this specific figure rather than whichever one happens to be current.
    plt.close(fig)


def plot_metric_distributions(
    generations: List[Dict[str, Any]],
    output_dir: str
) -> None:
    """Plot overlaid histograms of persona similarity and UCR for both models.

    Values that are ``None`` are left out. The two-panel figure is saved as
    ``metric_distributions.png`` in ``output_dir``; each panel also marks the
    mean of each model with a dashed vertical line.
    """

    def collect(model_key, metric_name):
        # Non-None values of one metric for one model, in record order.
        values = []
        for gen in generations:
            value = gen[model_key]['metrics'].get(metric_name)
            if value is not None:
                values.append(value)
        return values

    def draw_panel(ax, base_values, tuned_values, xlabel, title):
        # Histograms first, then the mean markers, so the legend order is stable.
        ax.hist(base_values, bins=30, alpha=0.5, label='Baseline', color='#ff7f0e', edgecolor='black')
        ax.hist(tuned_values, bins=30, alpha=0.5, label='Fine-Tuned', color='#2ca02c', edgecolor='black')
        base_mean = np.mean(base_values)
        ax.axvline(base_mean, color='#ff7f0e', linestyle='--', linewidth=2, label=f'Baseline Mean: {base_mean:.3f}')
        tuned_mean = np.mean(tuned_values)
        ax.axvline(tuned_mean, color='#2ca02c', linestyle='--', linewidth=2, label=f'Tuned Mean: {tuned_mean:.3f}')
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Extract metrics
    baseline_sim = collect('baseline', 'persona_similarity_max')
    tuned_sim = collect('tuned', 'persona_similarity_max')
    baseline_ucr = collect('baseline', 'ucr')
    tuned_ucr = collect('tuned', 'ucr')

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: persona similarity. Right: hallucination rate (UCR).
    draw_panel(axes[0], baseline_sim, tuned_sim,
               'Persona Similarity Score', 'Distribution of Persona Similarity')
    draw_panel(axes[1], baseline_ucr, tuned_ucr,
               'UCR (Hallucination Rate)', 'Distribution of Hallucination Rate')

    plt.tight_layout()
    output_path = os.path.join(output_dir, 'metric_distributions.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved metric distributions: {output_path}")
    plt.close()


def plot_mood_distribution(
    generations: List[Dict[str, Any]],
    output_dir: str
) -> None:
    """Plot a grouped bar chart of mood frequencies for both models.

    Only truthy ``mood`` values are counted. The x axis lists the sorted union
    of moods seen for either model; the figure is saved as
    ``mood_distribution.png`` in ``output_dir``.
    """

    def mood_counts(model_key):
        # Tally the non-empty moods recorded for one model.
        recorded = (g[model_key]['metrics'].get('mood') for g in generations)
        return Counter([mood for mood in recorded if mood])

    baseline_counts = mood_counts('baseline')
    tuned_counts = mood_counts('tuned')

    # Get all unique moods
    all_moods = sorted(set(baseline_counts) | set(tuned_counts))

    # Counter yields 0 for moods a model never produced.
    baseline_vals = [baseline_counts[mood] for mood in all_moods]
    tuned_vals = [tuned_counts[mood] for mood in all_moods]

    positions = np.arange(len(all_moods))
    bar_width = 0.35
    half_width = bar_width / 2

    fig, ax = plt.subplots(figsize=(14, 6))

    ax.bar(positions - half_width, baseline_vals, bar_width, label='Baseline',
           color='#ff7f0e', alpha=0.7, edgecolor='black')
    ax.bar(positions + half_width, tuned_vals, bar_width, label='Fine-Tuned',
           color='#2ca02c', alpha=0.7, edgecolor='black')

    ax.set_xlabel('Mood')
    ax.set_ylabel('Frequency')
    ax.set_title('Mood Distribution Comparison', fontweight='bold', fontsize=14)
    ax.set_xticks(positions)
    ax.set_xticklabels(all_moods, rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    output_path = os.path.join(output_dir, 'mood_distribution.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved mood distribution: {output_path}")
    plt.close()


def plot_per_source_performance(
    generations: List[Dict[str, Any]],
    output_dir: str
) -> None:
    """Plot performance breakdown by source dataset.

    Groups records by ``source_dataset`` and draws the mean
    ``persona_similarity_max`` of each model per group. Records whose source is
    missing, ``None`` or empty are grouped under ``'unknown'``. A group with no
    usable values for a model is drawn as 0. The figure is saved as
    ``performance_by_source.png`` in ``output_dir``.
    """

    # Group by source
    sources = {}
    for gen in generations:
        # The evaluation loop stores `item.get("source")`, so the key is present
        # with a None value when the source is unknown; a plain
        # `.get(key, default)` would not fall back in that case.
        source = gen.get('source_dataset') or 'unknown'
        bucket = sources.setdefault(source, {'baseline_sim': [], 'tuned_sim': []})

        base_sim = gen['baseline']['metrics'].get('persona_similarity_max')
        tuned_sim = gen['tuned']['metrics'].get('persona_similarity_max')

        if base_sim is not None:
            bucket['baseline_sim'].append(base_sim)
        if tuned_sim is not None:
            bucket['tuned_sim'].append(tuned_sim)

    # Calculate means
    source_names = list(sources.keys())
    baseline_means = [np.mean(sources[s]['baseline_sim']) if sources[s]['baseline_sim'] else 0
                      for s in source_names]
    tuned_means = [np.mean(sources[s]['tuned_sim']) if sources[s]['tuned_sim'] else 0
                   for s in source_names]

    x = np.arange(len(source_names))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))

    bars1 = ax.bar(x - width/2, baseline_means, width, label='Baseline',
                   color='#ff7f0e', alpha=0.7, edgecolor='black')
    bars2 = ax.bar(x + width/2, tuned_means, width, label='Fine-Tuned',
                   color='#2ca02c', alpha=0.7, edgecolor='black')

    # Add value labels
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.3f}',
                   ha='center', va='bottom', fontsize=8)

    ax.set_xlabel('Source Dataset')
    ax.set_ylabel('Mean Persona Similarity')
    ax.set_title('Performance by Source Dataset', fontweight='bold', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(source_names, rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    output_path = os.path.join(output_dir, 'performance_by_source.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved per-source performance: {output_path}")
    plt.close()


def plot_token_length_analysis(
    generations: List[Dict[str, Any]],
    output_dir: str,
    token_limit: int = 60
) -> None:
    """Plot overlaid histograms of response token counts for both models.

    Reads ``g['baseline']['metrics']['token_count']`` and
    ``g['tuned']['metrics']['token_count']`` from every generation record and
    saves the figure as ``token_length_distribution.png`` inside ``output_dir``.

    Args:
        generations: Generation records; each must contain ``'baseline'`` and
            ``'tuned'`` entries holding a ``'metrics'`` dict.
        output_dir: Directory the PNG is written to. Created if missing.
        token_limit: Position of the vertical "target limit" marker.
            Defaults to 60, the value that used to be hard-coded.

    Notes:
        Records whose ``token_count`` is missing, ``None`` or ``0`` are left
        out of the histogram and of the statistics.
    """

    def _token_counts(model_key: str) -> List[Any]:
        # Keep only truthy counts (drops missing / None / 0 entries).
        counts = (g[model_key]['metrics'].get('token_count') for g in generations)
        return [count for count in counts if count]

    def _stats_line(label: str, lengths: List[Any]) -> str:
        # np.mean / np.std of an empty list warn and return NaN, so say so explicitly.
        if not lengths:
            return f'{label}: no data'
        return f'{label}: Œº={np.mean(lengths):.1f}, œÉ={np.std(lengths):.1f}'

    baseline_lengths = _token_counts('baseline')
    tuned_lengths = _token_counts('tuned')

    # Allow standalone use; an empty string means "current directory".
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Use one set of bin edges for both series: with independent binning the
    # bars have different widths and their heights cannot be compared.
    bin_edges = np.histogram_bin_edges(baseline_lengths + tuned_lengths, bins=30)

    ax.hist(baseline_lengths, bins=bin_edges, alpha=0.5, label='Baseline',
            color='#ff7f0e', edgecolor='black')
    ax.hist(tuned_lengths, bins=bin_edges, alpha=0.5, label='Fine-Tuned',
            color='#2ca02c', edgecolor='black')

    ax.axvline(token_limit, color='red', linestyle='--', linewidth=2,
               label=f'Target Limit ({token_limit} tokens)')

    # Dotted mean markers, skipped for a series without any data.
    for lengths, colour in ((baseline_lengths, '#ff7f0e'), (tuned_lengths, '#2ca02c')):
        if lengths:
            ax.axvline(np.mean(lengths), color=colour, linestyle=':', linewidth=2)

    ax.set_xlabel('Token Count')
    ax.set_ylabel('Frequency')
    ax.set_title('Response Length Distribution', fontweight='bold', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Add statistics text box
    stats_text = '\n'.join([
        _stats_line('Baseline', baseline_lengths),
        _stats_line('Fine-Tuned', tuned_lengths),
    ])
    ax.text(0.95, 0.95, stats_text, transform=ax.transAxes,
            fontsize=10, verticalalignment='top', horizontalalignment='right',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    output_path = os.path.join(output_dir, 'token_length_distribution.png')
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    print(f"‚úì Saved token length analysis: {output_path}")
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def create_summary_report(
    report: Dict[str, Any],
    output_dir: str
) -> None:
    """Write a plain-text summary of an evaluation report.

    The file ``summary_report.txt`` is created inside ``output_dir`` and holds
    two sections: the run metadata and, for every metric in
    ``report['statistical_comparison']``, the baseline/fine-tuned statistics.

    Args:
        report: Evaluation report. Must contain ``'metadata'`` (with the keys
            ``evaluation_date``, ``evaluation_samples``, ``baseline_model_id``,
            ``tuned_model_path``, ``total_time_seconds``,
            ``samples_per_second``) and ``'statistical_comparison'``, a mapping
            of metric name to a dict with ``baseline_mean``, ``baseline_std``,
            ``tuned_mean``, ``tuned_std``, ``improvement``,
            ``improvement_pct``, ``p_value``, ``effect_size``, ``cohens_d``
            and ``is_significant``.
        output_dir: Directory the report is written to. Created if missing.

    Raises:
        KeyError: If a required key is absent from ``report``.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Build the whole text before touching the disk, so a malformed report
    # raises without leaving a half-written file behind.
    metadata = report['metadata']
    lines = [
        heavy_rule,
        "HUMANIZED NPC-LLM EVALUATION SUMMARY",
        heavy_rule,
        "",
        # Metadata
        "EVALUATION METADATA",
        light_rule,
        f"Date: {metadata['evaluation_date']}",
        f"Samples Evaluated: {metadata['evaluation_samples']}",
        f"Baseline Model: {metadata['baseline_model_id']}",
        f"Fine-Tuned Model: {metadata['tuned_model_path']}",
        f"Total Time: {metadata['total_time_seconds']:.2f}s",
        f"Throughput: {metadata['samples_per_second']:.2f} samples/sec",
        "",
        # Key Findings
        "KEY FINDINGS",
        light_rule,
    ]

    for metric, stats in report['statistical_comparison'].items():
        lines += [
            "",
            f"{metric.upper().replace('_', ' ')}:",
            f"  Baseline: {stats['baseline_mean']:.4f} (¬±{stats['baseline_std']:.4f})",
            f"  Fine-Tuned: {stats['tuned_mean']:.4f} (¬±{stats['tuned_std']:.4f})",
            f"  Improvement: {stats['improvement']:+.4f} ({stats['improvement_pct']:+.2f}%)",
            f"  P-value: {stats['p_value']:.4f}",
            f"  Effect Size: {stats['effect_size']} (Cohen's d = {stats['cohens_d']:.3f})",
            f"  Significant: {'YES ***' if stats['is_significant'] else 'NO'}",
        ]

    lines += ["", heavy_rule]

    # Allow standalone use; an empty string means "current directory".
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'summary_report.txt')

    # Explicit UTF-8: the report contains non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(lines) + "\n")

    print(f"‚úì Saved summary report: {output_path}")


def visualize_all(
    report_path: str,
    generations_path: str,
    output_dir: str
) -> None:
    """Produce every plot and the text summary for one evaluation run.

    Args:
        report_path: Location handed to ``load_report``.
        generations_path: Location handed to ``load_generations``.
        output_dir: Directory that receives all artefacts; created if missing.
    """
    banner = "=" * 60

    print(banner)
    print("GENERATING VISUALIZATIONS")
    print(banner)

    # Every plotting helper writes into this directory, so make sure it exists first.
    os.makedirs(output_dir, exist_ok=True)

    print("Loading data...")
    report = load_report(report_path)
    generations = load_generations(generations_path)
    print(f"‚úì Loaded report and {len(generations)} generation records\n")

    print("Creating visualizations...")
    # The comparison chart is driven by the aggregated report ...
    plot_metric_comparison(report, output_dir)
    # ... while the remaining charts work on the per-sample generation records.
    per_sample_plots = (
        plot_metric_distributions,
        plot_mood_distribution,
        plot_per_source_performance,
        plot_token_length_analysis,
    )
    for draw in per_sample_plots:
        draw(generations, output_dir)
    create_summary_report(report, output_dir)

    print(f"\n{banner}")
    print("‚úì ALL VISUALIZATIONS COMPLETE")
    print(f"‚úì Files saved to: {output_dir}")
    print(banner)


if __name__ == "__main__":
    import sys

    # Fallback locations used when no overrides are given
    # (keep them in sync with eval.yaml).
    report_path = "/content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/final_report.json"
    generations_path = "/content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/generations.jsonl"
    output_dir = "/content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations"

    # With at least three positional arguments, the first three replace all
    # defaults in order: report, generations, output directory.
    if len(sys.argv) > 3:
        report_path, generations_path, output_dir = sys.argv[1:4]

    visualize_all(report_path, generations_path, output_dir)

GENERATING VISUALIZATIONS
Loading data...
‚úì Loaded report and 1000 generation records

Creating visualizations...
‚úì Saved metric comparison: /content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations/metric_comparison.png
‚úì Saved metric distributions: /content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations/metric_distributions.png
‚úì Saved mood distribution: /content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations/mood_distribution.png
‚úì Saved per-source performance: /content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations/performance_by_source.png
‚úì Saved token length analysis: /content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations/token_length_distribution.png
‚úì Saved summary report: /content/drive/MyDrive/Colab Notebooks/GenAI/evaluation/outputs/results/visualizations/summary_report.txt

‚úì ALL VISUALIZATIONS COMPLETE
‚ú