In [None]:
import json
import pandas as pd
from datasets import load_metric

# Load the dev-set examples. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's default locale encoding.
with open('dev.json', 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

# Load the model predictions, decoded the same way as the dev data.
with open('predictions.json', 'r', encoding='utf-8') as f:
    predictions = json.load(f)

def evaluate_task(task_name, metric, dev_data, predictions):
    """Score one task's predictions against its dev-set labels.

    Args:
        task_name: Key selecting the task in both ``dev_data`` and
            ``predictions``.
        metric: Object exposing ``compute(predictions=..., references=...)``.
        dev_data: Mapping from task name to a sequence of items, each of
            which has a ``'label'`` entry.
        predictions: Mapping from task name to a sequence of items, each of
            which has a ``'prediction'`` entry.

    Returns:
        Whatever ``metric.compute`` returns for this task.

    Raises:
        ValueError: If the task has a different number of predictions than
            labels.
    """
    labels = [item['label'] for item in dev_data[task_name]]
    preds = [item['prediction'] for item in predictions[task_name]]

    # Labels and predictions are paired purely by position, so a length
    # mismatch means the two files are out of sync; fail loudly rather than
    # risk scoring misaligned pairs.
    if len(preds) != len(labels):
        raise ValueError(
            f"Task {task_name!r}: got {len(preds)} predictions "
            f"for {len(labels)} labels"
        )

    return metric.compute(predictions=preds, references=labels)

# Metric name associated with each task, keyed by task name.
# NOTE(review): this mapping is not read by the evaluation code visible in
# this notebook — confirm whether later cells rely on it.
task_to_metric = dict([
    ('sst2', 'accuracy'),
    ('qqp', 'f1'),
    ('mnli', 'accuracy'),
    ('mnli-mm', 'accuracy'),
    ('qnli', 'accuracy'),
    ('rte', 'accuracy'),
])

# Tasks to evaluate, in the mapping's insertion order.
tasks = list(task_to_metric)

# Per-task scores: task name -> dict returned by the metric.
results = {}

for task in tasks:
    # The 'mnli-mm' split is scored with the 'mnli' GLUE configuration;
    # every other task uses the configuration matching its own name.
    # NOTE(review): load_metric is reported as deprecated in newer `datasets`
    # releases in favour of the `evaluate` package — confirm installed version.
    glue_config = 'mnli' if task == 'mnli-mm' else task
    metric = load_metric("glue", glue_config)
    results[task] = evaluate_task(task, metric, dev_data, predictions)

# When both MNLI splits were scored, also record the mean of their accuracies.
if all(split in results for split in ('mnli', 'mnli-mm')):
    matched_acc = results['mnli']['accuracy']
    mismatched_acc = results['mnli-mm']['accuracy']
    combined_mnli_score = (matched_acc + mismatched_acc) / 2
    results['mnli_combined'] = {'accuracy': combined_mnli_score}

# Flatten the nested {task: {metric: value}} results into one record per score.
results_list = [
    {'task': task_name, 'metric': metric_name, 'value': value}
    for task_name, task_scores in results.items()
    for metric_name, value in task_scores.items()
]

df_results = pd.DataFrame(results_list)

# Save the table without the index column, then echo it to stdout.
df_results.to_csv('evaluation_results.csv', index=False)
print(df_results)

In [3]:
# Bare last expression: the notebook shows df_results as this cell's output.
df_results

Unnamed: 0,task,metric,value
0,sst2,accuracy,0.628378
1,qqp,accuracy,0.705128
2,qqp,f1,0.634921
3,mnli,accuracy,0.661157
4,mnli-mm,accuracy,0.530864
5,qnli,accuracy,0.513514
6,rte,accuracy,0.481481
7,mnli_combined,accuracy,0.596011
