In [1]:
import json
import numpy as np
from tabulate import tabulate

In [2]:
# Numeric value substituted for each letter code (A-F) found in the age field
# of an evaluation, so that the age criterion can be averaged like the others.
# NOTE(review): the values look like representative ages in years per age
# bracket - confirm against the evaluation prompt.
age_dict = dict(A=1.5, B=4.5, C=6.5, D=8.5, E=11.0, F=14.5)

In [3]:
# english analysis
# Collect the judged scores of every English model from its evaluation file
# and print a "mean +/- std" table with one row per criterion and one column
# per model.
criteria = ["grammar", "creativity", "consistency", "age"]
models = ["gpt", "ts", "own_small"]
# The last criterion holds a letter code that is looked up in age_dict; all
# other criteria hold a "<numerator>/<denominator>" fraction.
age_criterion = criteria[-1]
scores = {}
for model_id in models:
    # Name the encoding explicitly: open() otherwise falls back to the
    # platform's locale encoding, while JSON text is UTF-8.
    with open(f"evaluations_en_{model_id}.json", "r", encoding="utf-8") as f:
        evaluations = json.load(f)
    # One score list per criterion, built from `criteria` instead of a second
    # hand-written copy of the names.
    scores[model_id] = {crit: [] for crit in criteria}
    for ev in evaluations:
        # After dropping ':' and ',' the text is read as alternating
        # "<criterion> <value>" tokens. split() without an argument also
        # tolerates newlines, tabs and repeated or surrounding spaces, which
        # split(" ") would turn into empty or polluted tokens.
        s = ev["eval_step2"].replace(":", "").replace(",", "").split()
        if len(s) < 2 * len(criteria):
            raise ValueError(
                f"Cannot parse evaluation of model {model_id!r}: {ev['eval_step2']!r}"
            )
        # Only the first len(criteria) pairs are read; later tokens are ignored.
        for i in range(0, 2 * len(criteria), 2):
            key, value = s[i], s[i + 1]
            if key == age_criterion:
                # Letter code -> numeric value.
                scores[model_id][key].append(age_dict[value])
            else:
                # "<nom>/<denom>" -> fraction as a float.
                nom, denom = value.split("/")
                scores[model_id][key].append(float(nom) / float(denom))
headers = ["model"] + models
t = []
for crit in criteria:
    row = [crit]
    for m in models:
        values = scores[m][crit]
        # np.std uses ddof=0 by default, i.e. the population standard deviation.
        row.append(f"{np.round(np.mean(values), 4)} +/- {np.round(np.std(values), 4)}")
    t.append(row)
print(tabulate(t, headers=headers))

model        gpt               ts                own_small
-----------  ----------------  ----------------  ----------------
grammar      0.918 +/- 0.0384  0.612 +/- 0.1851  0.187 +/- 0.0336
creativity   0.807 +/- 0.0255  0.491 +/- 0.1217  0.32 +/- 0.0775
consistency  0.965 +/- 0.0477  0.288 +/- 0.1505  0.1 +/- 0.0
age          11.0 +/- 0.0      9.215 +/- 1.4129  5.96 +/- 0.8879


In [4]:
# german analysis
# Collect the judged scores of every German model from its evaluation file
# and print a "mean +/- std" table with one row per criterion and one column
# per model.
criteria = ["Grammatik", "Kreativität", "Konsistenz", "Alter"]
models = ["gpt", "own_small", "own_big"]
# The last criterion holds a letter code that is looked up in age_dict; all
# other criteria hold a "<numerator>/<denominator>" fraction.
age_criterion = criteria[-1]
scores = {}
for model_id in models:
    # Name the encoding explicitly: open() otherwise falls back to the
    # platform's locale encoding, under which a non-ASCII key such as
    # "Kreativität" may be decoded incorrectly and then fail the lookup below.
    with open(f"evaluations_de_{model_id}.json", "r", encoding="utf-8") as f:
        evaluations = json.load(f)
    # One score list per criterion, built from `criteria` instead of a second
    # hand-written copy of the names.
    scores[model_id] = {crit: [] for crit in criteria}
    for ev in evaluations:
        # After dropping ':' and ',' the text is read as alternating
        # "<criterion> <value>" tokens. split() without an argument also
        # tolerates newlines, tabs and repeated or surrounding spaces, which
        # split(" ") would turn into empty or polluted tokens.
        s = ev["eval_step2"].replace(":", "").replace(",", "").split()
        if len(s) < 2 * len(criteria):
            raise ValueError(
                f"Cannot parse evaluation of model {model_id!r}: {ev['eval_step2']!r}"
            )
        # Only the first len(criteria) pairs are read; later tokens are ignored.
        for i in range(0, 2 * len(criteria), 2):
            key, value = s[i], s[i + 1]
            if key == age_criterion:
                # Letter code -> numeric value.
                scores[model_id][key].append(age_dict[value])
            else:
                # "<nom>/<denom>" -> fraction as a float.
                nom, denom = value.split("/")
                scores[model_id][key].append(float(nom) / float(denom))
headers = ["model"] + models
t = []
for crit in criteria:
    row = [crit]
    for m in models:
        values = scores[m][crit]
        # np.std uses ddof=0 by default, i.e. the population standard deviation.
        row.append(f"{np.round(np.mean(values), 4)} +/- {np.round(np.std(values), 4)}")
    t.append(row)
print(tabulate(t, headers=headers))

model        gpt               own_small         own_big
-----------  ----------------  ----------------  ----------------
Grammatik    0.922 +/- 0.0438  0.183 +/- 0.0448  0.566 +/- 0.1608
Kreativität  0.81 +/- 0.0332   0.302 +/- 0.0663  0.578 +/- 0.1073
Konsistenz   0.924 +/- 0.0427  0.102 +/- 0.014   0.308 +/- 0.1798
Alter        11.0 +/- 0.0      5.55 +/- 1.9805   10.71 +/- 0.906
