This is demo code for calculating the metrics used in the MLLM-as-a-Judge paper. Convert your results into a *.jsonl* file, then use this code to calculate the metrics.

For the *Batch* setting, we use the Levenshtein distance to measure how far the predicted ranking is from the target ranking.

For the *Pair* setting, we treat the task as a classification problem and include code for both the Tie and Non-Tie cases. The metric is *Accuracy*.

For the *Score* setting, we use *Pearson Similarity* (the Pearson correlation coefficient) to measure the agreement between the predicted and target scores.

# Batch Ranking (Levenshtein Distance)

In [None]:
import json
import numpy as np
from Levenshtein import distance

def calculate_levenshtein_metrics(pred_file, gt_file):
    """
    Calculate edit distance metrics between predictions and ground truth.

    Both files are read as JSON Lines (one JSON object per line, blank lines
    ignored). Ground-truth records must provide ``id``, ``original_dataset``
    and ``human``; prediction records must provide ``id`` and ``human``.
    Ground-truth ids that have no prediction are skipped, and prediction ids
    that are absent from the ground truth are never looked at.

    Args:
        pred_file: Path to predictions file
        gt_file: Path to ground truth file

    Returns:
        Dictionary mapping each dataset name to its ``average_distance`` and
        ``num_samples``, plus an ``'overall'`` entry pooled over every
        compared sample. When no sample could be compared, the overall
        average is NaN and ``num_samples`` is 0.
    """
    # Load ground truth
    gt_data = {}
    with open(gt_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not valid JSON;
            # skip them rather than letting json.loads fail.
            if not line.strip():
                continue
            item = json.loads(line)
            gt_data[item['id']] = {
                'dataset': item['original_dataset'],
                'answers': item['human']
            }

    # Load predictions
    pred_data = {}
    with open(pred_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            pred_data[item['id']] = item['human']

    # Calculate distances per dataset
    dataset_distances = {}
    for sample_id, gt_item in gt_data.items():
        if sample_id not in pred_data:
            continue

        # Calculate distance to ground truth answer
        dist = distance(pred_data[sample_id], gt_item['answers'])
        dataset_distances.setdefault(gt_item['dataset'], []).append(dist)

    # Calculate averages
    metrics = {}
    all_distances = []

    for dataset, distances in dataset_distances.items():
        metrics[dataset] = {
            'average_distance': np.mean(distances),
            'num_samples': len(distances)
        }
        all_distances.extend(distances)

    # np.mean([]) would emit a RuntimeWarning; report NaN explicitly when
    # there was nothing to compare.
    metrics['overall'] = {
        'average_distance': np.mean(all_distances) if all_distances else float('nan'),
        'num_samples': len(all_distances)
    }

    return metrics

# Example usage: set both paths before running this cell.

gt_file = '../Dataset/Benchmark/batch.jsonl'   # User needs to provide ground truth file path
pred_file = '<your_prediction_file>'  # User needs to provide prediction file path

metrics = calculate_levenshtein_metrics(pred_file, gt_file)

# Report every dataset first; the pooled 'overall' entry is printed last.
print("\nResults per dataset:")
for dataset in [key for key in metrics if key != 'overall']:
    print(f"\n{dataset}:")
    print(f"Average Levenshtein distance: {metrics[dataset]['average_distance']:.2f}")
    print(f"Number of samples: {metrics[dataset]['num_samples']}")

print(f"\nOverall average distance: {metrics['overall']['average_distance']:.2f}")
print(f"Total samples: {metrics['overall']['num_samples']}")


# Pair Comparison (Accuracy)

In [None]:
import json
import numpy as np

def calculate_accuracy_metrics(pred_file, gt_file, include_ties=False):
    """
    Calculate accuracy metrics between predictions and ground truth.

    Both files are read as JSON Lines (one JSON object per line, blank lines
    ignored). Ground-truth records must provide ``id``, ``original_dataset``
    and ``human``; prediction records must provide ``id`` and ``human``.
    Ground-truth ids that have no prediction are skipped.

    Args:
        pred_file: Path to predictions file
        gt_file: Path to ground truth file
        include_ties: Whether to include samples with tie (C) answers in
            calculation. When False, a sample is dropped if either the
            ground truth or the prediction is ``'C'``.

    Returns:
        Dictionary mapping each dataset name to its ``accuracy`` and
        ``num_samples``, plus an ``'overall'`` entry pooled over every
        counted sample. When no sample was counted, the overall accuracy is
        NaN and ``num_samples`` is 0.
    """
    # Load ground truth
    gt_data = {}
    with open(gt_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not valid JSON;
            # skip them rather than letting json.loads fail.
            if not line.strip():
                continue
            item = json.loads(line)
            gt_data[item['id']] = {
                'dataset': item['original_dataset'],
                'answer': item['human']
            }

    # Load predictions
    pred_data = {}
    with open(pred_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            pred_data[item['id']] = item['human']

    # Count correct / total per dataset
    dataset_accuracies = {}
    for sample_id, gt_item in gt_data.items():
        if sample_id not in pred_data:
            continue

        prediction = pred_data[sample_id]
        answer = gt_item['answer']

        # Skip ties if include_ties is False
        if not include_ties and (answer == 'C' or prediction == 'C'):
            continue

        counts = dataset_accuracies.setdefault(
            gt_item['dataset'], {'correct': 0, 'total': 0}
        )

        # Compare predictions with ground truth
        if prediction == answer:
            counts['correct'] += 1
        counts['total'] += 1

    # Calculate metrics
    metrics = {}
    total_correct = 0
    total_samples = 0

    for dataset, counts in dataset_accuracies.items():
        correct = counts['correct']
        total = counts['total']

        # A dataset only gets an entry once a sample has been counted for
        # it, so total is always at least 1 here.
        metrics[dataset] = {
            'accuracy': correct / total,
            'num_samples': total
        }

        total_correct += correct
        total_samples += total

    # Calculate overall accuracy
    metrics['overall'] = {
        'accuracy': total_correct / total_samples if total_samples > 0 else float('nan'),
        'num_samples': total_samples
    }

    return metrics

# Example usage:
# NOTE: as written, pred_file points at the same file as gt_file, so the
# human labels are compared with themselves. Replace pred_file with the
# path to your own predictions.
gt_file = '../Dataset/Benchmark/pair.jsonl'   # User needs to provide ground truth file path
pred_file = '../Dataset/Benchmark/pair.jsonl'  # User needs to provide prediction file path

# First pass: samples involving a tie are left out.
metrics_no_ties = calculate_accuracy_metrics(pred_file, gt_file, include_ties=False)

print("\nResults excluding ties:")
print("\nResults per dataset:")
for dataset in [key for key in metrics_no_ties if key != 'overall']:
    print(f"\n{dataset}:")
    print(f"Accuracy: {metrics_no_ties[dataset]['accuracy']:.3f}")
    print(f"Number of samples: {metrics_no_ties[dataset]['num_samples']}")

print(f"\nOverall accuracy: {metrics_no_ties['overall']['accuracy']:.3f}")
print(f"Total samples: {metrics_no_ties['overall']['num_samples']}")

# Second pass: ties are scored like any other answer.
metrics_with_ties = calculate_accuracy_metrics(pred_file, gt_file, include_ties=True)

print("\nResults including ties:")
print("\nResults per dataset:")
for dataset in [key for key in metrics_with_ties if key != 'overall']:
    print(f"\n{dataset}:")
    print(f"Accuracy: {metrics_with_ties[dataset]['accuracy']:.3f}")
    print(f"Number of samples: {metrics_with_ties[dataset]['num_samples']}")

print(f"\nOverall accuracy: {metrics_with_ties['overall']['accuracy']:.3f}")
print(f"Total samples: {metrics_with_ties['overall']['num_samples']}")


# Score (Pearson Similarity)
You can also add more metrics to your results such as cosine similarity, MSE, MAE, Spearman's rank correlation, etc.

In [None]:
from scipy.stats import pearsonr
import json

def calculate_pearson_metrics(pred_file, gt_file):
    """
    Calculate Pearson correlation metrics between predictions and ground truth scores.

    Both files are read as JSON Lines (one JSON object per line, blank lines
    ignored). Ground-truth records must provide ``id``, ``original_dataset``
    and ``human``; prediction records must provide ``id`` and ``human``. The
    ``human`` value is converted with ``float()`` in both files.
    Ground-truth ids that have no prediction are skipped.

    Args:
        pred_file: Path to predictions file
        gt_file: Path to ground truth file

    Returns:
        Dictionary mapping each dataset name to its ``correlation``,
        ``p_value`` and ``num_samples``, plus an ``'overall'`` entry computed
        over all paired scores pooled together. Any group with fewer than two
        paired scores reports NaN for ``correlation`` and ``p_value``, since
        a correlation is undefined there.
    """
    # Load ground truth
    gt_data = {}
    with open(gt_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. extra trailing newlines) are not valid JSON;
            # skip them rather than letting json.loads fail.
            if not line.strip():
                continue
            item = json.loads(line)
            gt_data[item['id']] = {
                'dataset': item['original_dataset'],
                'score': float(item['human'])  # Convert string score to float
            }

    # Load predictions
    pred_data = {}
    with open(pred_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            pred_data[item['id']] = float(item['human'])  # Convert string score to float

    # Pair up scores per dataset
    dataset_scores = {}
    for sample_id, gt_item in gt_data.items():
        if sample_id not in pred_data:
            continue

        scores = dataset_scores.setdefault(gt_item['dataset'], {'pred': [], 'gt': []})
        scores['pred'].append(pred_data[sample_id])
        scores['gt'].append(gt_item['score'])

    # Calculate metrics
    nan = float('nan')
    metrics = {}
    all_pred = []
    all_gt = []

    for dataset, scores in dataset_scores.items():
        # pearsonr rejects inputs with fewer than two points; report NaN for
        # such a dataset instead of aborting the whole report.
        if len(scores['pred']) >= 2:
            corr, p_value = pearsonr(scores['pred'], scores['gt'])
        else:
            corr, p_value = nan, nan
        metrics[dataset] = {
            'correlation': corr,
            'p_value': p_value,
            'num_samples': len(scores['pred'])
        }
        all_pred.extend(scores['pred'])
        all_gt.extend(scores['gt'])

    # Calculate overall correlation (same small-sample guard as above)
    if len(all_pred) >= 2:
        overall_corr, overall_p = pearsonr(all_pred, all_gt)
    else:
        overall_corr, overall_p = nan, nan
    metrics['overall'] = {
        'correlation': overall_corr,
        'p_value': overall_p,
        'num_samples': len(all_pred)
    }

    return metrics

# %% Example usage
# NOTE: as written, pred_file points at the same file as gt_file, so the
# human scores are correlated with themselves. Replace pred_file with the
# path to your own predictions.

gt_file = '../Dataset/Benchmark/score.jsonl'   # User needs to provide ground truth file path
pred_file = '../Dataset/Benchmark/score.jsonl'  # User needs to provide prediction file path

# Per-dataset and pooled Pearson correlation between predictions and labels.
pearson_metrics = calculate_pearson_metrics(pred_file, gt_file)

print("\nPearson Correlation Results:")
print("\nResults per dataset:")
for dataset in [key for key in pearson_metrics if key != 'overall']:
    print(f"\n{dataset}:")
    print(f"Correlation: {pearson_metrics[dataset]['correlation']:.3f}")
    print(f"P-value: {pearson_metrics[dataset]['p_value']:.3e}")
    print(f"Number of samples: {pearson_metrics[dataset]['num_samples']}")

print(f"\nOverall correlation: {pearson_metrics['overall']['correlation']:.3f}")
print(f"Overall p-value: {pearson_metrics['overall']['p_value']:.3e}")
print(f"Total samples: {pearson_metrics['overall']['num_samples']}")
