# Model Scoring & Evaluation

This notebook evaluates two different models using the same prompt (Prompt B) to compare their performance on Easy Language compliance.

**Goals:**
1. Load English samples.
2. Run Model A (Llama 3.1 8B) and Model B (Llama 3.3 70B) with the same prompt.
3. Evaluate both outputs against specific Easy Language rules.
4. Display side-by-side comparison with scores.

## 1. Setup & Imports

In [1]:
import os
import re
import time
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from groq import Groq
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from a .env file. `usecwd=True` makes the search
# start from the current working directory, which is what a notebook needs.
load_dotenv(find_dotenv(usecwd=True))

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY environment variable not set.")

try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    # Report the problem, then re-raise: swallowing it here would leave
    # `client` undefined and surface later as a confusing NameError.
    print(f"❌ Error initializing Groq client: {e}")
    raise
else:
    print("✅ Setup complete. Groq client initialized.")

✅ Setup complete. Groq client initialized.


## 2. Evaluation Helper Functions

In [2]:
def tfidf_similarity(text1: str, text2: str) -> float:
    """Calculate TF-IDF cosine similarity between two texts.

    Args:
        text1: First text (e.g. the original).
        text2: Second text (e.g. the simplified version).

    Returns:
        Cosine similarity of the two TF-IDF vectors, rounded to 3 decimals.
        Returns 0.0 when either input is not a string, is empty/whitespace,
        or when the vectorizer cannot build a vocabulary from the inputs.
    """
    # Nothing to compare: avoid calling the vectorizer on unusable input.
    if not (isinstance(text1, str) and isinstance(text2, str)):
        return 0.0
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        vectorizer = TfidfVectorizer(lowercase=True, stop_words=None)
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised by the vectorizer when no tokens survive tokenisation
        # (e.g. both texts consist only of punctuation).
        return 0.0

    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    # Convert to a built-in float so downstream formatting/serialisation
    # does not depend on the NumPy scalar type.
    return round(float(similarity), 3)

# Easy Language Rules Definition
#
# Each rule id maps to:
#   "name"        - short label shown in the UI
#   "description" - human-readable statement of the rule
#   "check"       - callable taking the output text; returns a percentage for
#                   "short_sentences" and a bool (True = pass) for all others.

# Sentence boundaries are approximated by ., !, ? and newlines.
_SENTENCE_SPLIT_RE = re.compile(r'[.!?\n]')

# A list item is a line that STARTS (after optional indentation) with a bullet
# character or "N." followed by whitespace. Anchoring to the line start stops
# an ordinary dash inside a sentence ("costs rose - then fell") from counting.
# The bullet is written as \u2022 so the pattern survives encoding mishaps.
_LIST_ITEM_RE = re.compile(r'^[ \t]*(?:[\u2022\-*]|\d+\.)\s', re.MULTILINE)

_INTRO_RE = re.compile(
    r'^(Here\'s|Here is|This is|The following|Hier ist|In summary|To summarize)',
    re.IGNORECASE,
)

# \b keeps "id =" from matching inside longer words such as "valid = ...".
_MARKUP_RE = re.compile(r'<[^>]+>|\bid\s*=')

_PASSIVE_MARKERS = (' is ', ' are ', ' was ', ' were ')


def _long_sentence_pct(text: str) -> float:
    """Return the percentage of non-empty sentences with more than 10 words."""
    sentences = [s for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()]
    if not sentences:
        return 0.0
    long_count = sum(1 for s in sentences if len(s.split()) > 10)
    return (long_count / len(sentences)) * 100


def _few_passive_markers(text: str) -> bool:
    """Return True when fewer than 5 ' is/are/was/were ' markers occur (rough proxy)."""
    lowered = text.lower()
    return sum(lowered.count(marker) for marker in _PASSIVE_MARKERS) < 5


EASY_LANGUAGE_RULES = {
    "short_sentences": {
        "name": "Short Sentences",
        "description": "Max 15% of sentences > 10 words",
        "check": _long_sentence_pct
    },
    "uses_bullets": {
        "name": "Uses Bullet Points",
        "description": "Uses bullet points or numbered lists for steps, lists or multiple items",
        "check": lambda text: bool(_LIST_ITEM_RE.search(text))
    },
    "has_paragraphs": {
        "name": "Clear Paragraphs",
        "description": "Has blank lines between sections",
        "check": lambda text: '\n\n' in text or '\n \n' in text
    },
    "no_intro_text": {
        "name": "No Intro/Outro Text",
        "description": "No introductory or concluding text like 'Here is the simplified text'",
        # Only the start of the text is inspected.
        "check": lambda text: not bool(_INTRO_RE.match(text.strip()))
    },
    "no_xml_tags": {
        "name": "No XML/HTML Tags",
        "description": "Never output any XML/HTML tags or attributes (no <...>, no id=...)",
        "check": lambda text: not bool(_MARKUP_RE.search(text))
    },
    "keep_meaning": {
        "name": "Keep Meaning",
        "description": "Do not drop meaning - rewrite sentence by sentence, do not condense or join",
        "check": lambda text: True  # Manual review needed / Used with TF-IDF
    },
    "active_voice": {
        "name": "Active Voice",
        "description": "Uses active voice (approximation: few passive markers)",
        "check": _few_passive_markers
    }
}

def evaluate_rules(text: str, original_text: str = None) -> dict:
    """Score `text` against every rule in EASY_LANGUAGE_RULES.

    Args:
        text: The simplified text to evaluate.
        original_text: The source text; when given, "keep_meaning" is scored
            with TF-IDF similarity against it.

    Returns:
        Mapping of rule id -> {"value": ..., "pass": bool}.
    """

    def _score(rule_id: str, rule: dict) -> dict:
        # Percentage rule: passes while at most 15% of sentences are long.
        if rule_id == "short_sentences":
            long_pct = rule["check"](text)
            return {"value": long_pct, "pass": long_pct <= 15}

        # Meaning rule: needs the original text; without it there is nothing
        # to compare, so it is reported as "N/A" and counted as a pass.
        if rule_id == "keep_meaning":
            if not original_text:
                return {"value": "N/A", "pass": True}
            similarity = tfidf_similarity(original_text, text)
            # Similarity of at least 0.3 counts as a pass (adjustable threshold).
            return {"value": similarity, "pass": similarity >= 0.3}

        # Every other rule is a boolean check where a truthy result passes.
        outcome = rule["check"](text)
        return {"value": outcome, "pass": bool(outcome)}

    return {
        rule_id: _score(rule_id, rule)
        for rule_id, rule in EASY_LANGUAGE_RULES.items()
    }

## 3. Visualization Function

In [3]:
def display_side_by_side(original: str, results: dict, test_name: str):
    """Display model outputs side by side with rule evaluation.

    Args:
        original: Source text; only its first 300 characters are shown.
        results: Mapping of model label -> {"output": str, "rules": dict},
            where "rules" maps rule id -> {"value": ..., "pass": bool}.
            An empty "rules" dict renders the column without a score badge.
        test_name: Title displayed at the top of the card.

    All text supplied by the caller (title, original, model labels, model
    output) is HTML-escaped before being embedded, so output that contains
    tags such as "<example>" is shown literally instead of being interpreted
    by the browser.
    """
    # Local import keeps the dependency with the function; the page buffer is
    # named `page` so it does not shadow the standard `html` module.
    from html import escape

    preview = original[:300] + ('...' if len(original) > 300 else '')

    # Header
    page = f"""<div style='background:#1a1a2e; padding:15px; border-radius:8px; margin:10px 0;'>
    <h3 style='color:#eee; margin:0 0 10px 0;'>📄 {escape(test_name)}</h3>
    <div style='background:#16213e; padding:10px; border-radius:5px; margin-bottom:15px;'>
        <strong style='color:#888;'>Original:</strong>
        <p style='color:#aaa; margin:5px 0; font-size:13px;'>{escape(preview)}</p>
    </div>
    <div style='display:flex; gap:10px;'>"""

    # Each model column
    for model_name, data in results.items():
        output = data.get("output", "")
        rules = data.get("rules", {})

        # Score badge: green >= 80%, amber >= 60%, red otherwise.
        if rules:
            passed = sum(1 for r in rules.values() if r["pass"])
            total = len(rules)
            score_pct = (passed / total) * 100
            score_color = "#4ade80" if score_pct >= 80 else "#fbbf24" if score_pct >= 60 else "#f87171"
            score_html = f"<span style='background:{score_color}; color:#000; padding:2px 8px; border-radius:10px; font-size:12px;'>{passed}/{total} rules</span>"
        else:
            score_html = ""

        page += f"""
        <div style='flex:1; background:#0f3460; padding:12px; border-radius:6px;'>
            <div style='display:flex; justify-content:space-between; align-items:center; margin-bottom:10px;'>
                <strong style='color:#e0e0e0;'>{escape(str(model_name))}</strong>
                {score_html}
            </div>
            <div style='background:#1a1a2e; padding:10px; border-radius:4px; margin-bottom:10px; max-height:400px; overflow-y:auto;'>
                <pre style='color:#ddd; font-size:12px; white-space:pre-wrap; margin:0;'>{escape(str(output))}</pre>
            </div>
            <div style='font-size:11px;'>"""

        # Rule indicators (rule ids not present in EASY_LANGUAGE_RULES are skipped)
        for rule_id, result in rules.items():
            if rule_id not in EASY_LANGUAGE_RULES:
                continue
            icon = "✅" if result["pass"] else "❌"
            rule_name = EASY_LANGUAGE_RULES[rule_id]["name"]

            # Show the measured value for the two numeric rules
            if rule_id == "short_sentences":
                value_str = f" ({result['value']:.1f}% > 10w)"
            elif rule_id == "keep_meaning" and isinstance(result['value'], float):
                value_str = f" ({result['value']:.0%})"
            else:
                value_str = ""

            page += f"<div style='color:#aaa;'>{icon} {rule_name}{value_str}</div>"

        page += "</div></div>"

    page += "</div></div>"
    display(HTML(page))

## 4. Input Data

In [4]:
# Resolve the project root. When this code runs from a file, the root is two
# levels above that file; otherwise (e.g. interactive mode, where __file__
# is not defined) it is the parent of the current working directory.
if "__file__" in globals():
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
else:
    PROJECT_ROOT = Path(os.getcwd()).parent

# Directory holding the sample text files.
SAMPLES_DIR = PROJECT_ROOT.joinpath("data", "samples")

# English sample files, keyed by file name (looked up inside SAMPLES_DIR),
# mapped to the category label used in progress messages and result titles.
SAMPLE_CATEGORIES_EN = {
    "en_academic.txt": "Academic",
    "en_medical.txt": "Medical",
    "en_legal.txt": "Legal",
    "en_insurance.txt": "Insurance",
    "en_technical.txt": "Technical",
    "en_government.txt": "Government",
    "en_literature.txt": "Literature",
}

def get_all_samples_en(samples_dir: Path = None, categories: dict = None) -> list[dict]:
    """Load the sample texts listed in a filename -> category mapping.

    Args:
        samples_dir: Directory containing the sample files. Defaults to
            SAMPLES_DIR.
        categories: Mapping of file name -> category label. Defaults to
            SAMPLE_CATEGORIES_EN.

    Returns:
        A list of {"filename", "category", "text"} dicts, ordered by file
        name. The text is read as UTF-8 and stripped of surrounding
        whitespace. Entries whose path is not a regular file are skipped
        with a printed notice.
    """
    samples_dir = SAMPLES_DIR if samples_dir is None else Path(samples_dir)
    categories = SAMPLE_CATEGORIES_EN if categories is None else categories

    samples = []
    # Sort to ensure consistent order
    for filename in sorted(categories):
        filepath = samples_dir / filename
        # is_file() (not exists()) so a directory with the same name is not
        # handed to read_text(), which would raise.
        if not filepath.is_file():
            print(f"Skipping missing sample file: {filepath}")
            continue
        samples.append({
            "filename": filename,
            "category": categories[filename],
            "text": filepath.read_text(encoding="utf-8").strip()
        })
    return samples

samples = get_all_samples_en()
print(f"Found {len(samples)} English samples in {SAMPLES_DIR}")

# Fail fast: with no samples the evaluation loop would silently do nothing
# and the summary would report empty scores.
if not samples:
    raise FileNotFoundError(f"No sample files found in {SAMPLES_DIR}")

Found 7 English samples in /Users/alastair/Github/klartext/data/samples


## 5. Prompt Definitions (Using Prompt B)

In [5]:
# Universal Parts
# Persona section of the system prompt; it is placed first in SYSTEM_PROMPT.
PROMPT_IDENTITY = """# Identity

You are an expert in plain language writing.
You specialise in rewriting text to be accessible 
to people with learning disabilities or low literacy.
"""

# Few-shot section of the system prompt: two original/simplified pairs wrapped
# in XML-style tags. The lines starting with "#" inside the string are part of
# the prompt text sent to the model, not Python comments.
PROMPT_EXAMPLES = """# Examples
# The following are example pairs.
# Learn the style and constraints from them.
# Do NOT copy the XML tags into your output.

<examples>

  <example id="1">
    <original_text>
    Upon arrival at the facility, visitors are required to sign in at the front desk and present valid photo identification.
    </original_text>

    <simplified_text>
    When you arrive:

    * Go to the front desk.
    * Sign in with your name.
    * Show your photo ID.
    </simplified_text>
  </example>

  <example id="2">
    <original_text>
    The medication should be administered twice daily with food to minimize potential gastrointestinal discomfort.
    </original_text>

    <simplified_text>
    Take this medicine two times every day.

    * Eat food when you take it. This helps your stomach feel better.
    </simplified_text>
  </example>

</examples>
"""

# ==========================================
# PROMPT COMPONENTS (FROM PROMPT B)
# ==========================================

# Task, constraints and formatting section of the system prompt. Several of
# these constraints (bullets, blank lines, no intro text, no tags) have a
# matching automated check in EASY_LANGUAGE_RULES.
PROMPT_INSTRUCTIONS = """# Core Task 

* Rewrite the input text to be extremely simple and easy to understand.

# Constraints

* Keep the same meaning as the source text. Ensure that meaning is not dropped from any sentences.
* Organise the text by ideas. When an idea is complex, explain the idea using bullet points in simple language.
* You are making the text easier to understand, you are not condensing or reducing the text.
* Do NOT include any introductory or concluding text (e.g., "Here is the simplified text").
* Output ONLY the simplified text.
* Never output any XML/HTML tags or attributes (no <...>, no id=...).

# Structure & Formatting Rules

* Use clear structure.
* Use bullet points for steps, lists, or multiple items. Otherwise prefer short sentences.
* Add blank lines between every paragraph.
"""

# Plain-language style rules section of the system prompt.
# NOTE(review): "Remove filler words and unnecessary details. Keep only
# essential information." appears to pull against the PROMPT_INSTRUCTIONS
# constraint "you are not condensing or reducing the text" - confirm which
# one is intended before changing the prompt (edits alter the experiment).
# NOTE(review): "explain the word in parentheses the first time they appear"
# has a number mismatch ("it appears"), and "Do not explain ideas using the
# same language." is ambiguous - consider rewording in a future prompt version.
PROMPT_RULES = """# Plain Language Rules
# Sentence & Length Rules

* Use very short sentences in the output (maximum 10 words per sentence).
* If a sentence is long, break it into multiple sentences.
* Keep subjects and verbs close together.

# Vocabulary & Wording Rules

* Use simple, familiar words. Avoid technical, foreign, or formal terms.
* Explain any uncommon or necessary technical words or abbreviations in parentheses the first time they appear.
* When a word is uncommon, explain the word in parentheses the first time they appear.
* Explain complex ideas or uncommon nouns in parentheses.
* Use positive wording. Avoid negations and never use double negatives.
* Replace abstract nouns with concrete, active verbs.

# Tone & Audience Rules

* Prefer active voice. Avoid passive voice whenever possible.
* Maintain conditional language when required.
* Address the reader personally and directly when relevant.
* Use a friendly, neutral tone.
* Avoid bureaucratic, legalistic, or commanding language.

# Consistency Rules

* Remove filler words and unnecessary details. Keep only essential information.
* Do not explain ideas using the same language.
* Use the same words consistently. Do not switch terms for the same thing.
"""

# Full system prompt: identity, instructions, rules, then examples, each
# separated by a blank line.
SYSTEM_PROMPT = f"""{PROMPT_IDENTITY}

{PROMPT_INSTRUCTIONS}

{PROMPT_RULES}

{PROMPT_EXAMPLES}"""

# Plain string (not an f-string), so "{{text}}" is a literal placeholder with
# double braces; the evaluation loop fills it via str.replace("{{text}}", ...).
USER_TEMPLATE = "Rewrite this text in simple language:\n{{text}}"

## 6. Run & Evaluate

In [6]:
# Model Definitions
MODEL_A = "llama-3.1-8b-instant"     # Baseline Model
MODEL_B = "llama-3.3-70b-versatile"  # Experimental Model

# Display label -> model id. The labels are the keys stored in each
# results_history entry, so they must stay in this exact format.
MODELS = {
    f"Model A ({MODEL_A})": MODEL_A,
    f"Model B ({MODEL_B})": MODEL_B,
}


def run_model(model: str, user_content: str) -> tuple:
    """Send the shared system prompt plus `user_content` to one model.

    Args:
        model: Model id passed to the chat completions endpoint.
        user_content: The user message (template already filled in).

    Returns:
        (output, error): on success `output` is the stripped completion text
        and `error` is None; on any failure `output` is "" and `error` is an
        "Error: ..." message, so one failing call does not stop the run.
    """
    try:
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_content}
            ],
            model=model,
            temperature=0  # Reduce sampling randomness so runs are comparable
        )
        return response.choices[0].message.content.strip(), None
    except Exception as e:
        return "", f"Error: {e}"


print("Starting Evaluation Loop...")
results_history = []

for i, sample in enumerate(samples):
    print(f"Processing {i+1}/{len(samples)}: {sample['filename']} ({sample['category']})...")

    original_text = sample["text"]
    user_content = USER_TEMPLATE.replace("{{text}}", original_text)

    # --- Run and evaluate every model with the same prompt ---
    results = {}
    for label, model in MODELS.items():
        output, error = run_model(model, user_content)
        if error is None:
            results[label] = {
                "output": output,
                "rules": evaluate_rules(output, original_text)
            }
        else:
            # Show the error in the output pane but do NOT score it: an error
            # message is not model output and would distort the rule totals.
            results[label] = {"output": error, "rules": {}}

    results_history.append({
        "sample": sample['filename'],
        "results": results
    })

    # --- Visualize ---
    display_side_by_side(original_text, results, f"{sample['filename']} ({sample['category']})")
    time.sleep(1) # Rate limit kindness


Starting Evaluation Loop...
Processing 1/7: en_academic.txt (Academic)...


Processing 2/7: en_government.txt (Government)...


Processing 3/7: en_insurance.txt (Insurance)...


Processing 4/7: en_legal.txt (Legal)...


Processing 5/7: en_literature.txt (Literature)...


Processing 6/7: en_medical.txt (Medical)...


Processing 7/7: en_technical.txt (Technical)...


In [7]:
# --- Summary Section ---
# Aggregates results_history per model: rules passed across all samples, mean
# "keep_meaning" similarity, and the overall rule pass rate.
print("\n" + "="*40)
print("🏁 FINAL SCORE SUMMARY")
print("="*40)

summary_html = """
<div style='background:#1a1a2e; padding:20px; border-radius:8px; margin-top:20px;'>
    <h2 style='color:#eee; border-bottom:1px solid #333; padding-bottom:10px;'>🏆 Final Evaluation Summary</h2>
    <table style='width:100%; border-collapse:collapse; color:#ddd;'>
        <tr style='background:#16213e; text-align:left;'>
            <th style='padding:10px;'>Model / Prompt</th>
            <th style='padding:10px;'>Total Rules Passed</th>
            <th style='padding:10px;'>Average Meaning</th>
            <th style='padding:10px;'>Pass Rate</th>
        </tr>
"""

# These labels must match the keys stored in each results_history entry.
models_keys = [f"Model A ({MODEL_A})", f"Model B ({MODEL_B})"]

for model_key in models_keys:
    total_passed = 0
    total_rules = 0
    meaning_scores = []

    for item in results_history:
        if model_key not in item["results"]:
            continue
        rules = item["results"][model_key]["rules"]
        total_passed += sum(1 for r in rules.values() if r["pass"])
        total_rules += len(rules)

        # No default value: a sample without a numeric meaning score
        # (missing rule or "N/A") must be left out of the average rather
        # than counted as 0.
        val = rules.get("keep_meaning", {}).get("value")
        if isinstance(val, (int, float)) and not isinstance(val, bool):
            meaning_scores.append(val)

    avg_meaning_pct = (sum(meaning_scores) / len(meaning_scores) * 100) if meaning_scores else 0
    rule_pass_rate = (total_passed / total_rules * 100) if total_rules else 0
    # Same colour thresholds as the per-sample score badge.
    bar_color = '#4ade80' if rule_pass_rate >= 80 else '#fbbf24' if rule_pass_rate >= 60 else '#f87171'

    summary_html += f"""
        <tr style='border-bottom:1px solid #333;'>
            <td style='padding:10px; font-weight:bold;'>{model_key}</td>
            <td style='padding:10px;'>{total_passed}/{total_rules}</td>
            <td style='padding:10px;'>{avg_meaning_pct:.1f}%</td>
            <td style='padding:10px;'>
                <div style='background:#333; width:100px; height:6px; border-radius:3px;'>
                    <div style='background:{bar_color}; width:{rule_pass_rate:.1f}%; height:100%; border-radius:3px;'></div>
                </div>
                <span style='font-size:12px;'>{rule_pass_rate:.1f}%</span>
            </td>
        </tr>
    """

summary_html += "</table></div>"
display(HTML(summary_html))


🏁 FINAL SCORE SUMMARY


Model / Prompt,Total Rules Passed,Average Meaning,Pass Rate
Model A (llama-3.1-8b-instant),38/49,29.5%,
Model B (llama-3.3-70b-versatile),39/49,21.7%,
