In [1]:
import json
import os
from pathlib import Path
import pandas as pd


In [2]:
def calculate_accuracy(filepath):
    """
    Calculate accuracy from an evaluation results file.

    The file is read as UTF-8 JSON Lines: every non-blank line is parsed as
    one JSON record. A record counts as correct when its ``autoeval_label``
    is a mapping whose ``label`` value is truthy. A missing, null, or
    non-mapping ``autoeval_label`` counts as incorrect.

    Args:
        filepath (str): Path to the .eval-results file

    Returns:
        dict: On success, the keys ``filename``, ``total_questions``,
            ``correct_answers``, ``incorrect_answers``, ``accuracy``
            (correct / total) and ``accuracy_percentage`` (accuracy * 100).
            On any failure (unreadable file, invalid JSON, no records) a
            dict with a single ``error`` key is returned instead; exceptions
            are not propagated to the caller.
    """
    try:
        # Explicit encoding: without it the platform locale decides how the
        # bytes are decoded, which breaks on non-ASCII text on some systems.
        with open(filepath, 'r', encoding='utf-8') as f:
            results = [json.loads(line) for line in f if line.strip()]

        if not results:
            return {'error': 'No results found in file'}

        total = len(results)
        correct = 0
        for result in results:
            # `autoeval_label` may be present but null; treat that the same
            # as a missing label rather than failing the whole file.
            label_info = result.get('autoeval_label')
            if isinstance(label_info, dict) and label_info.get('label', False):
                correct += 1

        # `total` is at least 1 here because empty input returned above.
        accuracy = correct / total

        return {
            'filename': os.path.basename(filepath),
            'total_questions': total,
            'correct_answers': correct,
            'incorrect_answers': total - correct,
            'accuracy': accuracy,
            'accuracy_percentage': accuracy * 100
        }

    except Exception as e:
        # Deliberately broad: callers check for the 'error' key instead of
        # wrapping this call in their own try/except.
        return {'error': f'Error processing file: {str(e)}'}


In [3]:
def show_sample_results(filepath, num_samples=5):
    """
    Print a preview of the first few records in an evaluation results file.

    The file is read as UTF-8 JSON Lines (one JSON record per non-blank
    line). For each previewed record the correctness status, question id,
    ground truth and the first 100 characters of the hypothesis are printed.
    Any error while reading or printing is reported on stdout rather than
    raised.

    Args:
        filepath (str): Path to the .eval-results file
        num_samples (int): Number of samples to show; negative values are
            treated as 0
    """
    max_hypothesis_chars = 100

    try:
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as f:
            results = [json.loads(line) for line in f if line.strip()]

        # A negative count would slice from the wrong end of the list and
        # make the "more results" figure below incorrect.
        num_samples = max(0, num_samples)

        print(f"\n📋 Sample Results from {os.path.basename(filepath)}:")
        print("=" * 80)

        for i, result in enumerate(results[:num_samples], start=1):
            # `autoeval_label` may be present but null; treat as incorrect.
            label_info = result.get('autoeval_label')
            label = isinstance(label_info, dict) and label_info.get('label', False)
            status = "✅ CORRECT" if label else "❌ INCORRECT"

            # The hypothesis may be null or a non-string value; coerce it so
            # slicing cannot raise and abort the whole preview.
            hypothesis = result.get('hypothesis')
            hypothesis_text = 'N/A' if hypothesis is None else str(hypothesis)
            preview = hypothesis_text[:max_hypothesis_chars]
            if len(hypothesis_text) > max_hypothesis_chars:
                # Only mark the text as truncated when it really was.
                preview += "..."

            print(f"\n{i}. {status}")
            print(f"   Question ID: {result.get('question_id', 'N/A')}")
            print(f"   Ground Truth: {result.get('ground_truth', 'N/A')}")
            print(f"   Hypothesis: {preview}")

        if len(results) > num_samples:
            print(f"\n... and {len(results) - num_samples} more results")

    except Exception as e:
        # Best-effort helper: report the problem instead of interrupting the notebook.
        print(f"Error showing sample results: {e}")


In [17]:
# OPTION 1: analyze one specific evaluation results file.
# Point `filepath` at whichever results file should be summarized.
# NOTE(review): this is an absolute, machine-specific path and has to be
# edited before the cell can run anywhere else.
filepath = "/home/samer/Documents/LAU/Research/focus_memgpt/Focused-MemGPT/paper_experiments/longmemeval/evaluation_scripts/full runs with clustering/memgpt_hypotheses_xml_hybrid_beta0.0_cluster_20250716_223138.jsonl.eval-results-gpt-4o-mini"

# Compute the summary metrics; failures come back as a dict with an 'error' key.
result = calculate_accuracy(filepath)

if 'error' not in result:
    # One print call with a newline separator emits the report line by line.
    print(
        f"📊 Accuracy Analysis for: {result['filename']}",
        "=" * 60,
        f"Total Questions: {result['total_questions']}",
        f"Correct Answers: {result['correct_answers']}",
        f"Incorrect Answers: {result['incorrect_answers']}",
        f"Accuracy: {result['accuracy']:.4f} ({result['accuracy_percentage']:.2f}%)",
        sep="\n",
    )

    # Optional preview of a few records (disabled):
    # show_sample_results(filepath, 3)
else:
    print(f"❌ Error: {result['error']}")


📊 Accuracy Analysis for: memgpt_hypotheses_xml_hybrid_beta0.0_cluster_20250716_223138.jsonl.eval-results-gpt-4o-mini
Total Questions: 289
Correct Answers: 140
Incorrect Answers: 149
Accuracy: 0.4844 (48.44%)


In [5]:
# Find all evaluation result files below the current working directory.
# NOTE(review): the search root is '.', so what is found depends on the
# directory the kernel was started in.
eval_files = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk('.')
    for file in files
    # str.endswith accepts a tuple and matches any of the listed suffixes.
    if file.endswith(('.eval-results-gpt-4o-mini', '.eval-results-gpt-4o'))
)

print(f"Found {len(eval_files)} evaluation result files:")
print("=" * 60)

# Loop variables here are named `eval_path` / `summary` so this cell does not
# overwrite the notebook-level `filepath` / `result` set by other cells.
for i, eval_path in enumerate(eval_files, start=1):
    print(f"{i:2d}. {eval_path}")

# Quick analysis of all files
if eval_files:
    print("\n📈 Quick Overview of All Files:")
    print("=" * 60)

    all_results = []
    skipped_files = []  # (path, error message) for files that could not be summarized
    for eval_path in eval_files:
        summary = calculate_accuracy(eval_path)
        if 'error' in summary:
            # Remember the failure so it is reported instead of silently vanishing.
            skipped_files.append((eval_path, summary['error']))
            continue
        all_results.append({
            'File': os.path.basename(eval_path),
            'Questions': summary['total_questions'],
            'Correct': summary['correct_answers'],
            'Accuracy': f"{summary['accuracy_percentage']:.1f}%"
        })

    if all_results:
        df = pd.DataFrame(all_results)
        print(df.to_string(index=False))

    if skipped_files:
        print(f"\n⚠️ Skipped {len(skipped_files)} file(s) that could not be processed:")
        for eval_path, error_message in skipped_files:
            print(f"  - {eval_path}: {error_message}")


Found 39 evaluation result files:
 1. ./archived/memgpt_hypotheses.jsonl.eval-results-gpt-4o-mini
 2. ./archived/memgpt_hypotheses_fifo.jsonl.eval-results-gpt-4o-mini
 3. ./archived/memgpt_hypotheses_hybrid.jsonl.eval-results-gpt-4o-mini
 4. ./archived/memgpt_hypotheses_hybrid_beta1.0.jsonl.eval-results-gpt-4o-mini
 5. ./archived/memgpt_hypotheses_prompted_hybrid_beta1.0_20250715_122759.jsonl.eval-results-gpt-4o-mini
 6. ./archived/memgpt_hypotheses_prompted_hybrid_beta1.0_test.jsonl.eval-results-gpt-4o-mini
 7. ./archived/memgpt_hypotheses_prompted_hybrid_beta1.0_test_20250714_222150.jsonl.eval-results-gpt-4o-mini
 8. ./archived/memgpt_hypotheses_prompted_hybrid_beta1.0_test_20250714_223803.jsonl.eval-results-gpt-4o-mini
 9. ./archived/old_beta/memgpt_hypotheses_hybrid_beta0.0.jsonl.eval-results-gpt-4o-mini
10. ./archived/old_beta/memgpt_hypotheses_hybrid_beta0.1.jsonl.eval-results-gpt-4o-mini
11. ./archived/old_beta/memgpt_hypotheses_hybrid_beta0.2.jsonl.eval-results-gpt-4o-mini
12. 

In [None]:
# Pick which discovered file to inspect. Numbering is 1-based and matches the
# list printed by the file-discovery cell, which must have populated `eval_files`.
file_number = 1

if not eval_files or not (1 <= file_number <= len(eval_files)):
    # Nothing discovered, or the chosen number is outside the printed range.
    print(f"❌ Invalid file number. Please choose between 1 and {len(eval_files or [])}")
else:
    # Convert the 1-based choice to a 0-based list index.
    filepath = eval_files[file_number - 1]

    # Compute the summary metrics; failures come back as a dict with an 'error' key.
    result = calculate_accuracy(filepath)

    if 'error' not in result:
        print(
            f"📊 Detailed Analysis for File #{file_number}:",
            f"📁 {result['filename']}",
            "=" * 60,
            f"Total Questions: {result['total_questions']}",
            f"Correct Answers: {result['correct_answers']}",
            f"Incorrect Answers: {result['incorrect_answers']}",
            f"Accuracy: {result['accuracy']:.4f} ({result['accuracy_percentage']:.2f}%)",
            sep="\n",
        )

        # Follow the summary with a preview of the first five records.
        show_sample_results(filepath, 5)
    else:
        print(f"❌ Error: {result['error']}")
