In [1]:
from pathlib import Path
import json

# Load the JSONL file: one JSON record per non-blank line.
# Iterating the open file (rather than read_text().splitlines()) only breaks on
# real newlines; str.splitlines() also splits on characters such as U+2028 that
# may legally appear unescaped inside a JSON string and would cut a record in two.
# The encoding is pinned to UTF-8 so the result does not depend on the platform default.
with Path("llama-70b-solution-classifier-test.jsonl").open(encoding="utf-8") as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]

In [2]:
# Partition the records by the truthiness of their "is_correct" field.
correct = list(filter(lambda record: record["is_correct"], data))
incorrect = list(filter(lambda record: not record["is_correct"], data))

In [3]:
# Fraction of `data` that ended up in `correct`, shown as a percentage.
print("Accuracy: {:.2%}".format(len(correct) / len(data)))

Accuracy: 57.07%


In [30]:
from collections import defaultdict

def print_accuracy_by_pred(data, pred):
    """Print the per-category accuracy of `data`, one `key: xx.xx%` line per category.

    Args:
        data: Iterable of mapping records; each must have an "is_correct" entry.
        pred: Callable taking one record and returning its category key. Keys
            must be hashable and mutually orderable, since the output is
            sorted by key.

    The "is_correct" value is interpreted by truthiness, matching how the
    notebook splits records into `correct` / `incorrect`. Every record counts
    towards its category's total, so a non-bool flag (e.g. None) is treated as
    incorrect instead of being dropped from the denominator.
    Prints nothing when `data` is empty.
    """
    # category key -> [number of correct records, number of records seen]
    tallies = {}
    for d in data:
        tally = tallies.setdefault(pred(d), [0, 0])
        tally[0] += bool(d["is_correct"])
        tally[1] += 1

    # A category only exists once a record was counted for it, so n_total >= 1.
    for key, (n_correct, n_total) in sorted(tallies.items()):
        print(f"{key}: {n_correct / n_total:.2%}")

In [31]:
# Break accuracy down by the "<domain>/<problem_class>" pair of each record.
print_accuracy_by_pred(
    data,
    lambda row: "{}/{}".format(row["domain"], row["problem_class"]),
)

algebra/simplify_radicals: 68.42%
counting_and_statistics/mean: 50.00%
counting_and_statistics/median: 54.55%
number_theory/gcd: 53.85%


In [32]:
# Break accuracy down by each record's "steps" value.
print_accuracy_by_pred(data, pred=lambda row: row["steps"])


1: 57.14%
2: 63.64%
3: 58.14%
4: 56.52%
5: 50.00%
6: 75.00%


In [33]:
# Compare records whose "solution_error" is set against those where it is None.
print_accuracy_by_pred(
    data,
    lambda row: "no_error" if row["solution_error"] is None else "has_error",
)

has_error: 54.29%
no_error: 60.00%


In [None]:
# Show one record from `incorrect` for manual inspection: problem text,
# solution text, then the recorded solution error (each separated by a blank line).
p = incorrect[14]
print(p["problem"], p["solution"], sep="\n\n", end="\n\n")
print("Solution error:", p["solution_error"])