# Final Prompt Evaluation - English

Measurement harness for English prompt iterations.
Compares Prompt A (Baseline) vs Prompt B (Experimental).

In [38]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from groq import Groq
from pybars import Compiler

# Load .env (located with find_dotenv, starting from the current working
# directory) so GROQ_API_KEY can be read from the environment below.
load_dotenv(find_dotenv(usecwd=True))

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY environment variable not set.")

# Model used for every request made by this harness.
GROQ_MODEL = "llama-3.3-70b-versatile"
try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    print(f"❌ Error initializing Groq client: {e}")
    # Re-raise instead of continuing: without `client` every later call
    # would fail with a misleading NameError far from the real cause.
    raise
else:
    print(f"✅ Setup complete. Using model: {GROQ_MODEL}")

✅ Setup complete. Using model: llama-3.3-70b-versatile


## Paths & Data Loading

In [39]:
# Project root: one level above the directory holding this file when
# `__file__` exists, otherwise one level above the current working directory.
try:
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # `__file__` is not defined in interactive sessions / notebooks.
    PROJECT_ROOT = Path.cwd().parent

# Directory that holds the plain-text sample documents.
SAMPLES_DIR = PROJECT_ROOT.joinpath("data", "samples")

# Maps each sample file name ("en_<category>.txt") to its display category.
SAMPLE_CATEGORIES_EN = {
    f"en_{category.lower()}.txt": category
    for category in (
        "Academic",
        "Medical",
        "Legal",
        "Insurance",
        "Technical",
        "Government",
    )
}

def get_all_samples_en() -> list[dict]:
    """Load every English sample file that exists in ``SAMPLES_DIR``.

    File names are taken from ``SAMPLE_CATEGORIES_EN`` and visited in sorted
    order. Names that have no file on disk are skipped without an error.

    Returns:
        One dict per file found, with the keys ``filename``, ``category``
        and ``text`` (the file contents read as UTF-8 and stripped).
    """
    candidates = (
        (name, SAMPLES_DIR / name) for name in sorted(SAMPLE_CATEGORIES_EN)
    )
    return [
        {
            "filename": name,
            "category": SAMPLE_CATEGORIES_EN[name],
            "text": path.read_text(encoding="utf-8").strip(),
        }
        for name, path in candidates
        if path.exists()
    ]

# Sanity check: report how many of the expected sample files exist on disk.
print(f"Found {len(get_all_samples_en())} English samples in {SAMPLES_DIR}")

Found 6 English samples in /Users/alastair/Github/klartext/data/samples


## Prompts (English)

In [40]:
# Identity section of the system prompt: states the model's role and the
# target audience. Shared by Prompt A and (via PROMPT_B_IDENTITY) Prompt B.
PROMPT_IDENTITY = """# Identity

You are an expert in plain language writing.
You specialise in rewriting text to be accessible 
to people with learning disabilities or low literacy.
"""

# Baseline (Prompt A) instructions: the core task, output constraints
# (no preamble, no XML/HTML tags) and formatting rules.
PROMPT_INSTRUCTIONS = """# Core Task 

* Rewrite the input text to be extremely simple and easy to understand.
* Keep the same meaning as the original text.

# Constraints

* Do NOT include any introductory or concluding text (e.g., "Here is the simplified text").
* Output ONLY the simplified text.
* Never output any XML/HTML tags or attributes (no <...>, no id=...).

# Structure & Formatting Rules

* Use clear structure.
* Use bullet points for steps, lists, or multiple items. Otherwise prefer short sentences.
* Add blank lines between every paragraph.
"""

# Baseline (Prompt A) plain-language rules: sentence length, vocabulary,
# tone and consistency. Inserted into PROMPT_A_SYSTEM after the instructions.
# The quotes around “you” are real curly quotes; they were previously stored
# as mis-decoded bytes, which put garbage characters into the prompt.
EL_RULES = """# Plain Language Rules
# Sentence & Length Rules

* Use very short sentences (maximum 10 words per sentence).
* Break up long sentences.
* Keep subjects and verbs close together.

# Vocabulary & Wording Rules

* Use simple, familiar words. Avoid technical, foreign, or formal terms.
* Explain any uncommon or necessary technical words or abbreviations in parentheses the first time they appear.
* Explain complex ideas or uncommon nouns in parentheses.
* Use positive wording. Avoid negations and never use double negatives.
* Replace abstract nouns with concrete, active verbs.

# Tone & Audience Rules

* Prefer active voice. Avoid passive voice whenever possible.
* Address the reader directly using “you”.
* Use a friendly, neutral tone.
* Avoid bureaucratic, legalistic, or commanding language.

# Consistency Rules

* Remove filler words and unnecessary details. Keep only essential information.
* Use the same words consistently. Do not switch terms for the same thing.
"""

# Few-shot section of the system prompt: two original/simplified pairs
# wrapped in XML tags. The header lines inside the string tell the model not
# to copy those tags into its output.
PROMPT_EXAMPLES = """# Examples
# The following are example pairs.
# Learn the style and constraints from them.
# Do NOT copy the XML tags into your output.

<examples>

  <example id="1">
    <original_text>
    Upon arrival at the facility, visitors are required to sign in at the front desk and present valid photo identification.
    </original_text>

    <simplified_text>
    When you arrive:

    * Go to the front desk.
    * Sign in with your name.
    * Show your photo ID.
    </simplified_text>
  </example>

  <example id="2">
    <original_text>
    The medication should be administered twice daily with food to minimize potential gastrointestinal discomfort.
    </original_text>

    <simplified_text>
    Take this medicine two times every day.

    Eat food when you take it. This helps your stomach feel better.
    </simplified_text>
  </example>

</examples>
"""

# Assemble Prompt A (Baseline): identity, instructions, rules and examples,
# separated by blank lines.
PROMPT_A_SYSTEM = f"""{PROMPT_IDENTITY}

{PROMPT_INSTRUCTIONS}

{EL_RULES}

{PROMPT_EXAMPLES}"""

# Handlebars template for the user message. This is a plain string (not an
# f-string), so the braces reach the template compiler untouched.
# Triple braces insert `text` verbatim. The double-brace form HTML-escapes the
# value (& < > " ' become entities), which would corrupt quotes, apostrophes
# and ampersands in the sample text before it is sent to the model.
PROMPT_USER_TEMPLATE = "Rewrite this text in simple language:\n{{{text}}}"

# Assemble Prompt B (Experimental)
# ------------------------------------------------------------
# To experiment, redefine any component below before assembly.
# ------------------------------------------------------------

# Identity section: reused unchanged from Prompt A.
PROMPT_B_IDENTITY = PROMPT_IDENTITY

# Experimental instructions. Compared with PROMPT_INSTRUCTIONS, the
# "keep the same meaning" bullet moves from Core Task to Constraints and is
# expanded into two bullets asking for a sentence-by-sentence rewrite that
# does not condense or join sentences. The remaining bullets are the same.
PROMPT_B_INSTRUCTIONS = """# Core Task 

* Rewrite the input text to be extremely simple and easy to understand.

# Constraints

* Do not drop meaning. Try to rewrite the text sentence by sentence to ensure meaning is not dropped.
* Keep the same meaning as the source text - you are making the text easier to understand, not condensing or joining sentences.
* Do NOT include any introductory or concluding text (e.g., "Here is the simplified text").
* Output ONLY the simplified text.
* Never output any XML/HTML tags or attributes (no <...>, no id=...).

# Structure & Formatting Rules

* Use clear structure.
* Use bullet points for steps, lists, or multiple items. Otherwise prefer short sentences.
* Add blank lines between every paragraph.
"""

# Experimental plain-language rules. Differs from EL_RULES in one bullet
# under "Tone & Audience Rules": the reader is addressed "personally and
# directly when relevant" instead of always being addressed as "you".
# NOTE(review): the "Keep only essential information" bullet below may pull
# against the "Do not drop meaning" constraint in PROMPT_B_INSTRUCTIONS;
# confirm this combination is intended for the experiment.
PROMPT_B_RULES = """# Plain Language Rules
# Sentence & Length Rules

* Use very short sentences (maximum 10 words per sentence).
* Break up long sentences.
* Keep subjects and verbs close together.

# Vocabulary & Wording Rules

* Use simple, familiar words. Avoid technical, foreign, or formal terms.
* Explain any uncommon or necessary technical words or abbreviations in parentheses the first time they appear.
* Explain complex ideas or uncommon nouns in parentheses.
* Use positive wording. Avoid negations and never use double negatives.
* Replace abstract nouns with concrete, active verbs.

# Tone & Audience Rules

* Prefer active voice. Avoid passive voice whenever possible.
* Address the reader personally and directly when relevant.
* Use a friendly, neutral tone.
* Avoid bureaucratic, legalistic, or commanding language.

# Consistency Rules

* Remove filler words and unnecessary details. Keep only essential information.
* Use the same words consistently. Do not switch terms for the same thing.
"""

# Few-shot examples: reused unchanged from Prompt A.
PROMPT_B_EXAMPLES = PROMPT_EXAMPLES

# Assemble Prompt B: the four components in order, each pair separated by
# one blank line.
PROMPT_B_SYSTEM = "\n\n".join(
    (
        PROMPT_B_IDENTITY,
        PROMPT_B_INSTRUCTIONS,
        PROMPT_B_RULES,
        PROMPT_B_EXAMPLES,
    )
)

# User-message template: same as Prompt A.
PROMPT_B_USER_TEMPLATE = PROMPT_USER_TEMPLATE

## Inference Functions

In [41]:
def render_prompt(template_source: str, text: str) -> str:
    """Render a Handlebars template with ``text`` as its only variable.

    Args:
        template_source: Template source handed to the pybars ``Compiler``.
        text: Value made available to the template under the name ``text``.

    Returns:
        The output of calling the compiled template.
    """
    # NOTE(review): Handlebars double-brace expressions normally HTML-escape
    # their value; confirm that is what the templates passed in here expect.
    render = Compiler().compile(template_source)
    context = {"text": text}
    return render(context)

def call_groq(system_prompt: str, user_prompt: str, api_key: str, model: str = GROQ_MODEL, temperature: float = 0.1, max_tokens: int = 2000) -> str:
    """Send one system + user message pair to Groq and return the reply text.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        api_key: Groq API key to use. When it is empty or equal to
            ``GROQ_API_KEY`` the shared module-level ``client`` is reused;
            any other key gets its own client so the argument is honored.
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion token limit passed to the API.

    Returns:
        The message content of the first choice in the response.
    """
    # Previously `api_key` was ignored and the shared client was always used.
    # Existing callers pass GROQ_API_KEY, so they keep the shared client.
    if not api_key or api_key == GROQ_API_KEY:
        groq_client = client
    else:
        groq_client = Groq(api_key=api_key)

    response = groq_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def run_prompt_on_samples_en(system_prompt: str, user_template: str, api_key: str, samples: list[dict] = None) -> pd.DataFrame:
    """Run one prompt over the samples and collect the model outputs.

    Args:
        system_prompt: System prompt, sent as-is (it is not rendered).
        user_template: Template rendered once per sample with the sample's
            text to produce the user message.
        api_key: Passed through to ``call_groq``.
        samples: Dicts with ``filename``, ``category`` and ``text`` keys.
            When ``None``, ``get_all_samples_en()`` supplies them.

    Returns:
        A DataFrame with one row per sample and the columns ``filename``,
        ``category``, ``original_text`` and ``output``. If rendering or the
        API call raises, ``output`` holds ``"[Error: <message>]"`` instead.
    """
    if samples is None:
        samples = get_all_samples_en()

    rows = []
    for sample in samples:
        row = {
            "filename": sample["filename"],
            "category": sample["category"],
            "original_text": sample["text"],
        }
        try:
            user_message = render_prompt(user_template, sample["text"])
            row["output"] = call_groq(system_prompt, user_message, api_key)
        except Exception as err:
            # Best effort: keep the row and record the failure in place of
            # the output so one bad sample does not abort the whole run.
            row["output"] = f"[Error: {err}]"
        rows.append(row)

    return pd.DataFrame(rows)

## Run & Compare (EN)

In [42]:
# Optional cap on how many samples are run; None runs all of them.
MAX_SAMPLES = None  # Set to int to limit, e.g. 3

samples = get_all_samples_en()
if MAX_SAMPLES is not None:
    samples = samples[:MAX_SAMPLES]

print(f"Running comparison on {len(samples)} samples...")

# Run Prompt A
print("Running Prompt A...")
df_a = run_prompt_on_samples_en(PROMPT_A_SYSTEM, PROMPT_USER_TEMPLATE, GROQ_API_KEY, samples=samples)

# Run Prompt B
print("Running Prompt B...")
df_b = run_prompt_on_samples_en(PROMPT_B_SYSTEM, PROMPT_B_USER_TEMPLATE, GROQ_API_KEY, samples=samples)

# Merge: both frames come from the same `samples` list in the same order, so
# their default indexes line up and output_b can be assigned positionally.
comparison_df = df_a[["filename", "category", "original_text", "output"]].copy()
comparison_df = comparison_df.rename(columns={"output": "output_a"})
comparison_df["output_b"] = df_b["output"]

# Display DataFrame
# (If running in VS Code or Jupyter, this renders the table)
try:
    from IPython.display import display
    display(comparison_df)
except ImportError:
    print(comparison_df)

# Print Logic for easy reading
print("\n" + "="*80)
print("DETAILED COMPARISON")
print("="*80)

# The row index is not needed, only the row contents.
for _, row in comparison_df.iterrows():
    # The folder emoji was previously stored as mis-decoded bytes.
    print(f"\n📂 File: {row['filename']} ({row['category']})")
    print("-" * 40)
    print("ORIGINAL:")
    # Truncate original for cleaner display if very long
    original = row['original_text']
    print(original[:200] + "..." if len(original) > 200 else original)
    print("-" * 40)
    print("PROMPT A OUTPUT:")
    print(row['output_a'])
    print("-" * 40)
    print("PROMPT B OUTPUT:")
    print(row['output_b'])
    print("="*80)

Running comparison on 6 samples...
Running Prompt A...
Running Prompt B...


Unnamed: 0,filename,category,original_text,output_a,output_b
0,en_academic.txt,Academic,The implementation of sustainable development ...,"You want a better world. \n\nTo get it, you ne...",We need to understand how things are connected...
1,en_government.txt,Government,Pursuant to the provisions of Section 504 of t...,You get help from the government. \nThe govern...,You have rights. \nThe law says you cannot be ...
2,en_insurance.txt,Insurance,Coverage under this policy shall be contingent...,You must follow all rules in this policy.\n\nI...,You must follow all rules in this policy.\n\nI...
3,en_legal.txt,Legal,The obligations contained herein shall remain ...,You must keep secrets forever. \n\nThis means ...,You must keep secrets forever.\n\nThis means y...
4,en_medical.txt,Medical,Patients presenting with symptoms indicative o...,You have bad heart symptoms. \n\n* Take aspiri...,You have a heart problem. \n\nYou have bad sym...
5,en_technical.txt,Technical,"Prior to initializing the system, ensure that ...","You start the system. \nFirst, plug in all par...","Turn on the system. \nFirst, plug in all devic..."



DETAILED COMPARISON

📂 File: en_academic.txt (Academic)
----------------------------------------
ORIGINAL:
The implementation of sustainable development goals necessitates a comprehensive understanding of the multifaceted interdependencies between economic growth, environmental conservation, and social equ...
----------------------------------------
PROMPT A OUTPUT:
You want a better world. 

To get it, you need to know how things are connected. 
This means knowing how money (economic growth), the earth (environment), and people (social equity) work together.

You must make big changes. 
These changes must happen in all areas of life. 

Some areas are:
* How we make energy
* How we travel
* How we grow food
----------------------------------------
PROMPT B OUTPUT:
We need to understand how things are connected. 
This means knowing how money (economic growth), the earth (environment), and people (social equity) affect each other.

To make a better world, we must make big changes. 
T

## Evaluation (TODO)

In [43]:
def evaluate_outputs(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for scoring the model outputs in ``df``; not implemented.

    TODO: Add rubric-based evaluation (format compliance, sentence length, etc.)

    Raises:
        NotImplementedError: Always, until the evaluation is written.
    """
    raise NotImplementedError