In [1]:
# Load the evaluation dataset
import json

# Load the evaluation data
with open("lora-best_model_response.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)

# Fail fast on a malformed dataset: the judge prompt indexes these keys
# directly, and a missing key would otherwise only surface later as a
# per-sample evaluation error that is counted as an incorrect answer.
REQUIRED_KEYS = ("instruction", "input", "output", "model_response")
assert isinstance(eval_data, list) and eval_data, "Evaluation dataset must be a non-empty list"
incomplete_entries = [
    idx for idx, item in enumerate(eval_data)
    if not isinstance(item, dict) or any(key not in item for key in REQUIRED_KEYS)
]
assert not incomplete_entries, (
    f"Entries missing one of {REQUIRED_KEYS} (first indices): {incomplete_entries[:5]}"
)

print(f"Loaded {len(eval_data)} evaluation samples")
print(f"\nSample entry structure:")
print(f"Keys: {list(eval_data[0].keys())}")
print(f"\nFirst sample:")
print(f"Instruction: {eval_data[0]['instruction']}")
print(f"Input: {eval_data[0]['input']}")

# Only mark the preview as truncated when something was actually cut off
expected_preview = eval_data[0]['output']
if len(expected_preview) > 100:
    expected_preview = expected_preview[:100] + "..."
print(f"Expected Output: {expected_preview}")

Loaded 50 evaluation samples

Sample entry structure:
Keys: ['instruction', 'input', 'output', 'model_response']

First sample:
Instruction: Given a sentence, add in 4 adjectives that describe the sentence.
Input: He was an astrophysicist.
Expected Output: He was a brilliant, inquisitive, ambitious, and passionate astrophysicist....


In [2]:
# Preview the first three entries of the evaluation dataset
print("=" * 80, "SAMPLE EVALUATION ENTRIES", "=" * 80, sep="\n")

for sample_number, entry in enumerate(eval_data[:3], start=1):
    print(f"\n--- Sample {sample_number} ---")
    print(f"Instruction: {entry['instruction']}")
    # The input line is shown only when the entry has a non-empty input
    if entry['input']:
        print(f"Input: {entry['input']}")
    print(f"Expected Output: {entry['output']}")
    # The stored model answer is shown only when the entry carries one
    if 'model_response' in entry:
        print(f"Previous Model Response: {entry['model_response']}")
    print()

SAMPLE EVALUATION ENTRIES

--- Sample 1 ---
Instruction: Given a sentence, add in 4 adjectives that describe the sentence.
Input: He was an astrophysicist.
Expected Output: He was a brilliant, inquisitive, ambitious, and passionate astrophysicist.
Previous Model Response: He was an astrophysicist with a passion for science.


--- Sample 2 ---
Instruction: Given a sentence, explain why this statement could be problematic.
Input: Women should stay at home and take care of the children.
Expected Output: This statement could be problematic because it implies that women are not capable of pursuing careers or other interests outside of the home. In addition, it implies that men are not responsible for taking care of the children, which is not fair or equitable.
Previous Model Response: This statement could be problematic because it implies that women should be at home and take care of the children, which is a sexist and harmful statement.


--- Sample 3 ---
Instruction: Describe the followin

In [3]:
import os
import re

from openai import OpenAI
from tqdm import tqdm

# Initialize the OpenAI-compatible client for the Lightning AI endpoint.
# The API key is read from the environment so that no credential is stored
# in (or leaked through) the notebook file itself.
api_key = os.environ.get("LIGHTNING_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the LIGHTNING_API_KEY environment variable before running this cell."
    )

client = OpenAI(
    base_url="https://lightning.ai/api/v1/",
    api_key=api_key,
)

print("OpenAI client initialized successfully!")

OpenAI client initialized successfully!


In [4]:
def create_judge_prompt(entry):
    """
    Build the text prompt that asks the LLM judge to grade one model response.

    Args:
        entry: Dictionary with the keys 'instruction', 'input', 'output'
            (the expected answer) and 'model_response'.

    Returns:
        The complete judge prompt as a single string.
    """
    # Read the four fields in a fixed order; a missing key raises KeyError.
    instruction, user_input, expected_output, model_response = (
        entry[key] for key in ("instruction", "input", "output", "model_response")
    )

    prompt_lines = [
        # The first line intentionally keeps its trailing space.
        "You are an expert evaluator assessing the quality of AI model responses. ",
        "Your task is to determine if the model's response correctly addresses the given instruction.",
        "",
        f"**Instruction:** {instruction}",
        "",
        # An empty input is presented to the judge as "N/A".
        f"**Input:** {user_input or 'N/A'}",
        "",
        f"**Expected Response:** {expected_output}",
        "",
        f"**Model's Response:** {model_response}",
        "",
        "**Evaluation Criteria:**",
        "1. Does the model's response correctly address the instruction?",
        "2. Is the response factually accurate and relevant?",
        "3. Does it align with the expected response in meaning and quality?",
        "",
        "**Instructions:**",
        "- Compare the model's response with the expected response",
        "- Consider if the model's response is semantically equivalent or of similar quality",
        "- Minor wording differences are acceptable if the meaning is correct",
        "- Output your evaluation in this EXACT format:",
        "",
        "VERDICT: [CORRECT/INCORRECT]",
        "EXPLANATION: [Brief explanation of your decision in 1-2 sentences]",
        "",
        "Respond now with your evaluation:",
    ]

    return "\n".join(prompt_lines)

# Smoke-test the prompt builder on the first sample and preview its first 500 characters
test_entry = eval_data[0]
test_prompt = create_judge_prompt(test_entry)
print(
    "Sample Judge Prompt:",
    "=" * 80,
    f"{test_prompt[:500]}...",
    "=" * 80,
    sep="\n",
)


Sample Judge Prompt:
You are an expert evaluator assessing the quality of AI model responses. 
Your task is to determine if the model's response correctly addresses the given instruction.

**Instruction:** Given a sentence, add in 4 adjectives that describe the sentence.

**Input:** He was an astrophysicist.

**Expected Response:** He was a brilliant, inquisitive, ambitious, and passionate astrophysicist.

**Model's Response:** He was an astrophysicist with a passion for science.

**Evaluation Criteria:**
1. Does th...


In [5]:
def evaluate_single_entry(entry, client, model="openai/gpt-5-nano"):
    """
    Evaluate a single entry using the LLM judge.

    Args:
        entry: Dictionary containing instruction, input, output, and model_response
        client: OpenAI client instance
        model: Model to use for judging

    Returns:
        Dictionary with:
            - is_correct (bool): True only when the judge's reply contains a
              CORRECT verdict; False for INCORRECT, for an unparseable reply,
              and for a failed evaluation.
            - explanation (str): The judge's explanation, a placeholder when
              none could be parsed, or an error description.
            - raw_response (str): The judge's full reply ("" on failure).
            - error (bool): True when the evaluation itself failed (for
              example an API error), so callers can tell failures apart from
              genuine INCORRECT verdicts.
    """
    try:
        # Create judge prompt
        prompt = create_judge_prompt(entry)

        # Call OpenAI API
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}]
                },
            ],
            # NOTE: 1 is not a low sampling temperature, so verdicts may vary
            # between runs; lower it if the chosen judge model supports that.
            temperature=1,
        )

        # Get response; a reply without text content cannot be judged
        response = completion.choices[0].message.content
        if response is None:
            raise ValueError("Judge returned no text content")

        # Parse verdict. Punctuation between the label and the verdict is
        # tolerated so that replies such as "VERDICT: [CORRECT]" or
        # "**VERDICT:** INCORRECT" are still recognised.
        verdict_match = re.search(r'VERDICT\W*(INCORRECT|CORRECT)\b', response, re.IGNORECASE)
        explanation_match = re.search(r'EXPLANATION:\s*(.+?)(?:\n\n|\Z)', response, re.DOTALL | re.IGNORECASE)

        # bool(...) keeps the result a real boolean even when no verdict was found
        is_correct = bool(verdict_match) and verdict_match.group(1).upper() == "CORRECT"
        explanation = explanation_match.group(1).strip() if explanation_match else "No explanation provided"

        return {
            "is_correct": is_correct,
            "explanation": explanation,
            "raw_response": response,
            "error": False
        }

    except Exception as e:
        # Best effort: one failed sample must not abort a long evaluation run,
        # but the failure is flagged so it is not mistaken for a verdict.
        print(f"Error evaluating entry: {e}")
        return {
            "is_correct": False,
            "explanation": f"Error during evaluation: {str(e)}",
            "raw_response": "",
            "error": True
        }

# Smoke-test the judge on the first entry (this performs one real API call)
print("Testing LLM Judge on first entry...", "=" * 80, sep="\n")
result = evaluate_single_entry(eval_data[0], client)
if result['is_correct']:
    print("Verdict: ✓ CORRECT")
else:
    print("Verdict: ✗ INCORRECT")
print(f"Explanation: {result['explanation']}", "=" * 80, sep="\n")


Testing LLM Judge on first entry...


Verdict: ✗ INCORRECT
Explanation: The model did not add four adjectives describing the sentence; it rewrites with a phrase “with a passion for science” rather than four descriptive adjectives, failing to meet the instruction.


In [6]:
def _is_failed_evaluation(result):
    """Return True when a result records a failed evaluation rather than a verdict."""
    if result.get('error'):
        return True
    # Fallback for results without an explicit flag: a failed evaluation is
    # recorded with an empty raw response and this explanation prefix.
    return (
        result.get('raw_response') == ""
        and str(result.get('explanation', "")).startswith("Error during evaluation:")
    )


def llm_judge_evaluation(eval_data, client, model="openai/gpt-5-nano", max_samples=None):
    """
    Evaluate all entries using LLM-as-a-Judge approach.

    Args:
        eval_data: List of evaluation entries
        client: OpenAI client instance
        model: Model to use for judging
        max_samples: Maximum number of samples to evaluate (None for all;
            0 evaluates nothing)

    Returns:
        Dictionary containing:
            - results: List of evaluation results for each entry
            - accuracy: Overall accuracy percentage (0 when nothing was evaluated)
            - correct_count: Number of correct responses
            - total_count: Total number of evaluated responses
            - error_count: Number of entries whose evaluation failed (for
              example an API error); these are included in total_count and
              therefore count as incorrect in the accuracy
    """
    # Compare against None explicitly so that max_samples=0 is honoured
    # instead of being treated as "no limit".
    data_to_eval = eval_data if max_samples is None else eval_data[:max_samples]

    print(f"Starting LLM-as-a-Judge evaluation on {len(data_to_eval)} samples...")
    print(f"Using model: {model}")
    print("=" * 80)

    results = []
    correct_count = 0
    error_count = 0

    # Evaluate each entry
    for i, entry in enumerate(tqdm(data_to_eval, desc="Evaluating")):
        result = evaluate_single_entry(entry, client, model)

        # Add entry info to result
        result['index'] = i
        result['instruction'] = entry['instruction']
        result['expected_output'] = entry['output']
        result['model_response'] = entry['model_response']

        results.append(result)

        if result['is_correct']:
            correct_count += 1
        elif _is_failed_evaluation(result):
            error_count += 1

    # Calculate accuracy
    total_count = len(results)
    accuracy = (correct_count / total_count * 100) if total_count > 0 else 0

    # Print summary
    print("=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    print(f"Total Samples Evaluated: {total_count}")
    print(f"Correct Responses: {correct_count}")
    print(f"Incorrect Responses: {total_count - correct_count}")
    if error_count:
        # Make it visible that some "incorrect" entries were never actually judged
        print(f"Failed Evaluations (counted as incorrect): {error_count}")
    print(f"Accuracy: {accuracy:.2f}%")
    print("=" * 80)

    return {
        "results": results,
        "accuracy": accuracy,
        "correct_count": correct_count,
        "total_count": total_count,
        "error_count": error_count
    }

# Example: Run evaluation on first 10 samples
# Uncomment to run:
# eval_results = llm_judge_evaluation(eval_data, client, max_samples=10)


In [7]:
def _preview_text(text, limit=200):
    """Return `text` cut to `limit` characters, adding "..." only when something was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


def display_evaluation_samples(eval_results, num_samples=5, show_correct=True, show_incorrect=True):
    """
    Display sample evaluation results.

    When both kinds are requested, correct results are listed before
    incorrect ones, and the combined list is then cut to `num_samples`.

    Args:
        eval_results: Dictionary returned by llm_judge_evaluation
        num_samples: Number of samples to display
        show_correct: Whether to show correct predictions
        show_incorrect: Whether to show incorrect predictions
    """
    results = eval_results['results']

    # Filter results
    filtered_results = []
    if show_correct:
        filtered_results.extend(r for r in results if r['is_correct'])
    if show_incorrect:
        filtered_results.extend(r for r in results if not r['is_correct'])

    # Limit to num_samples
    for result in filtered_results[:num_samples]:
        status = "✓ CORRECT" if result['is_correct'] else "✗ INCORRECT"
        print(f"\n{'='*80}")
        print(f"Sample {result['index'] + 1} - {status}")
        print(f"{'='*80}")
        print(f"Instruction: {result['instruction']}")
        # Long texts are shortened to 200 characters for readability
        print(f"\nExpected: {_preview_text(result['expected_output'])}")
        print(f"\nModel Response: {_preview_text(result['model_response'])}")
        print(f"\nJudge's Explanation: {result['explanation']}")
        print()

# Example usage:
# display_evaluation_samples(eval_results, num_samples=3, show_incorrect=True)


In [8]:
def save_evaluation_results(eval_results, filename="llm_judge_results.json"):
    """
    Write the evaluation results to disk as indented UTF-8 JSON and print a
    short confirmation.

    Args:
        eval_results: Dictionary returned by llm_judge_evaluation
        filename: Path of the JSON file to write
    """
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(eval_results, out_file, ensure_ascii=False, indent=2)

    confirmation = (
        f"✓ Saved evaluation results to '{filename}'",
        f"  - Total samples: {eval_results['total_count']}",
        f"  - Accuracy: {eval_results['accuracy']:.2f}%",
    )
    print("\n".join(confirmation))

def create_evaluation_report(eval_results, max_incorrect=10):
    """
    Create a detailed text report of the evaluation.

    The report contains the overall counts and accuracy followed by the
    judge's explanation for the first `max_incorrect` incorrect results.

    Args:
        eval_results: Dictionary returned by llm_judge_evaluation
        max_incorrect: Maximum number of incorrect results to list in detail
            (default 10); any remainder is summarised in one closing line

    Returns:
        String containing the formatted report
    """
    divider = "=" * 80
    total = eval_results['total_count']
    correct = eval_results['correct_count']

    report = [
        divider,
        "LLM-AS-A-JUDGE EVALUATION REPORT",
        divider,
        f"\nTotal Samples Evaluated: {total}",
        f"Correct Responses: {correct}",
        f"Incorrect Responses: {total - correct}",
        f"Overall Accuracy: {eval_results['accuracy']:.2f}%",
        "\n" + divider,
        "INCORRECT PREDICTIONS ANALYSIS",
        divider,
    ]

    incorrect_results = [r for r in eval_results['results'] if not r['is_correct']]
    # A negative limit is treated as "list none" rather than slicing from the end
    listed = incorrect_results[:max(max_incorrect, 0)]

    for position, result in enumerate(listed, start=1):
        instruction = result['instruction']
        # Add the ellipsis only when the instruction was actually shortened
        if len(instruction) > 100:
            instruction = instruction[:100] + "..."
        report.append(f"\n{position}. Sample #{result['index'] + 1}")
        report.append(f"   Instruction: {instruction}")
        report.append(f"   Judge's Explanation: {result['explanation']}")
        report.append("")

    remaining = len(incorrect_results) - len(listed)
    if remaining > 0:
        report.append(f"\n... and {remaining} more incorrect predictions")

    return "\n".join(report)

# Example usage:
# save_evaluation_results(eval_results, "my_evaluation_results.json")
# print(create_evaluation_report(eval_results))


## How to Run the Complete Evaluation

Now you have a complete LLM-as-a-Judge evaluation system! Here's how to use it:

### Step 1: Quick Test (5-10 samples)
Test on a small subset first to make sure everything works:
```python
eval_results = llm_judge_evaluation(eval_data, client, max_samples=10)
```

### Step 2: View Sample Results
```python
display_evaluation_samples(eval_results, num_samples=3, show_incorrect=True)
```

### Step 3: Full Evaluation
Run on all samples (this may take a while):
```python
eval_results = llm_judge_evaluation(eval_data, client, max_samples=None)
```

### Step 4: Save and Report
```python
save_evaluation_results(eval_results, "my_model_evaluation.json")
print(create_evaluation_report(eval_results))
```

### What the System Does:
1. ✅ Takes each entry from your evaluation dataset
2. ✅ Creates a structured prompt for the LLM judge
3. ✅ Calls the judge model through the OpenAI-compatible API to evaluate whether the model response matches the expected output
4. ✅ Parses the judge's verdict (CORRECT/INCORRECT)
5. ✅ Calculates overall accuracy percentage
6. ✅ Provides detailed explanations for each evaluation
7. ✅ Saves results to JSON for further analysis


In [10]:
# RUN EVALUATION HERE
# Each judged sample costs one API call; lower max_samples for a quick smoke test.

# Step 1: Judge up to the first 50 samples
eval_results = llm_judge_evaluation(eval_data, client, max_samples=50)

# Step 2 (optional): View some results
# display_evaluation_samples(eval_results, num_samples=3, show_incorrect=True)

# Step 3: Save results
save_evaluation_results(eval_results, filename="evaluation_results_sample.json")

# Step 4: Print report, preceded by a blank line
print(f"\n{create_evaluation_report(eval_results)}")


Starting LLM-as-a-Judge evaluation on 50 samples...
Using model: openai/gpt-5-nano


Evaluating:  98%|█████████▊| 49/50 [03:20<00:02,  2.72s/it]

Error evaluating entry: Rate limit reached. Upgrade to the next tier for more requests: https://lightning.ai/pricing/


Evaluating: 100%|██████████| 50/50 [03:24<00:00,  4.08s/it]

EVALUATION COMPLETE
Total Samples Evaluated: 50
Correct Responses: 7
Incorrect Responses: 43
Accuracy: 14.00%
✓ Saved evaluation results to 'evaluation_results_sample.json'
  - Total samples: 50
  - Accuracy: 14.00%

LLM-AS-A-JUDGE EVALUATION REPORT

Total Samples Evaluated: 50
Correct Responses: 7
Incorrect Responses: 43
Overall Accuracy: 14.00%

INCORRECT PREDICTIONS ANALYSIS

1. Sample #1
   Instruction: Given a sentence, add in 4 adjectives that describe the sentence....
   Judge's Explanation: The model did not add four adjectives describing the sentence; it rewrote it with a non-descriptive phrase and no four adjectives, thus not meeting the instruction.


2. Sample #3
   Instruction: Describe the following landscape with words...
   Judge's Explanation: The response merely restates the elements and offers shallow, conflicting descriptions (a "wide valley below" and a "small mountain in the center"), failing to provide the vivid, cohesive description and imagery present in the ex




In [None]:
# FULL EVALUATION (Run this after testing with small samples)
# This will evaluate every sample in eval_data - may take several minutes

# Uncomment to run full evaluation:
# print(f"Starting full evaluation on all {len(eval_data)} samples...")
# print("This may take 5-10 minutes depending on API speed...")
# 
# full_eval_results = llm_judge_evaluation(eval_data, client, max_samples=None)
# 
# # Save full results
# save_evaluation_results(full_eval_results, "full_evaluation_results.json")
# 
# # Generate and save report
# report = create_evaluation_report(full_eval_results)
# print("\n" + report)
# 
# with open("evaluation_report.txt", "w", encoding="utf-8") as f:
#     f.write(report)
# print("\n✓ Report saved to 'evaluation_report.txt'")


In [14]:
# Visualization: Show evaluation statistics
def plot_evaluation_stats(eval_results):
    """
    Display evaluation statistics in a formatted way.

    Prints the sample counts, the share of correct and incorrect responses,
    the overall accuracy and a 50-character ASCII bar for each class.

    Args:
        eval_results: Dictionary with 'correct_count', 'total_count' and
            'accuracy', as returned by llm_judge_evaluation. A total of 0 is
            reported as 0.0% with empty bars instead of raising
            ZeroDivisionError.
    """
    total = eval_results['total_count']
    correct = eval_results['correct_count']
    incorrect = total - correct
    accuracy = eval_results['accuracy']

    # Shares are undefined for an empty evaluation; show 0.0% in that case
    correct_pct = correct / total * 100 if total else 0.0
    incorrect_pct = incorrect / total * 100 if total else 0.0

    print("\n" + "="*80)
    print("EVALUATION STATISTICS")
    print("="*80)
    print(f"\n📊 Total Samples: {total}")
    print(f"✅ Correct: {correct} ({correct_pct:.1f}%)")
    print(f"❌ Incorrect: {incorrect} ({incorrect_pct:.1f}%)")
    print(f"\n🎯 Overall Accuracy: {accuracy:.2f}%")

    # ASCII bar chart: the two bars together fill bar_length cells, except
    # for an empty evaluation where both stay blank
    bar_length = 50
    correct_bar = int(bar_length * correct / total) if total else 0
    incorrect_bar = bar_length - correct_bar if total else 0

    print("\n" + "="*80)
    print("Visual Breakdown:")
    print("="*80)
    print(f"Correct:   [{'█' * correct_bar}{' ' * (bar_length - correct_bar)}] {correct}")
    print(f"Incorrect: [{' ' * (bar_length - incorrect_bar)}{'█' * incorrect_bar}] {incorrect}")
    print("="*80 + "\n")

# Print the statistics summary for the `eval_results` currently held in the kernel.
# NOTE(review): the saved output of this cell reports 100 samples while the run
# cell above reports 50 — presumably left over from a different run; re-run the
# notebook top to bottom to confirm.
plot_evaluation_stats(eval_results)



EVALUATION STATISTICS

📊 Total Samples: 100
✅ Correct: 9 (9.0%)
❌ Incorrect: 91 (91.0%)

🎯 Overall Accuracy: 9.00%

Visual Breakdown:
Correct:   [████                                              ] 9
Incorrect: [    ██████████████████████████████████████████████] 91

