In [None]:
from pydantic import BaseModel


class AcronymFeedback(BaseModel):
    """Feedback recorded for one candidate acronym.

    Four criteria are stored, each as a text field paired with an integer
    ``*_score`` field, plus a ``total_score``. Every field has a default
    (empty string or ``0``), so the model can be created without arguments.
    """

    relevance: str = ""
    relevance_score: int = 0
    pronunciation: str = ""
    pronunciation_score: int = 0
    spelling: str = ""
    spelling_score: int = 0
    familiarity: str = ""
    familiarity_score: int = 0
    # Stored as its own field; this class does not derive it from the
    # per-criterion scores.
    total_score: int = 0


class AcronymIteration(BaseModel):
    """One generation attempt: its number, the acronym text and its feedback."""

    # Attempt number (presumably 1-based; other code in this file compares
    # ``best.n`` against 1 -- confirm against the generator).
    n: int = 0
    acronym: str = ""
    feedback: AcronymFeedback = AcronymFeedback()


class AcronymGenOutput(BaseModel):
    """One record of an acronym generation log (all fields required)."""

    # Input title the acronym was generated for.
    title: str
    # The attempt kept as the final answer.
    best: AcronymIteration
    # NOTE(review): presumably the number of attempts made -- confirm.
    n: int
    # All attempts; element 0 is compared with ``best`` elsewhere in this file.
    iterations: list[AcronymIteration]


class AcronymEvalOutput(BaseModel):
    """One record of an acronym evaluation log comparing two candidates.

    The scoring helpers in this file report ``feedback_a`` as the "base"
    score and ``feedback_b`` as the "self-refine" score.
    """

    title: str
    # Candidate A and the feedback it received.
    acronym_a: str
    feedback_a: AcronymFeedback
    # Candidate B and the feedback it received.
    acronym_b: str
    feedback_b: AcronymFeedback


class DialogFeedback(BaseModel):
    """Feedback recorded for one dialog response.

    Three criteria are stored, each as a text field paired with an integer
    ``*_score`` field, plus a ``total_score``. Every field has a default
    (empty string or ``0``).
    """

    consistency: str = ""
    consistency_score: int = 0
    understand: str = ""
    understand_score: int = 0
    sustain: str = ""
    sustain_score: int = 0
    # Stored as its own field; this class does not derive it from the
    # per-criterion scores.
    total_score: int = 0


class DialogIteration(BaseModel):
    """One generation attempt: its number, the response text and its feedback."""

    # Attempt number (presumably 1-based; other code in this file compares
    # ``best.n`` against 1 -- confirm against the generator).
    n: int = 0
    response: str = ""
    feedback: DialogFeedback = DialogFeedback()


class DialogGenOutput(BaseModel):
    """One record of a dialog generation log (all fields required)."""

    # Input dialog as a list of strings.
    dialog: list[str]
    # The attempt kept as the final answer.
    best: DialogIteration
    # NOTE(review): presumably the number of attempts made -- confirm.
    n: int
    # All attempts; element 0 is compared with ``best`` elsewhere in this file.
    iterations: list[DialogIteration]


class DialogEvalOutput(BaseModel):
    """One record of a dialog evaluation log comparing two responses.

    The scoring helpers in this file report ``feedback_a`` as the "base"
    score and ``feedback_b`` as the "self-refine" score.
    """

    dialog: list[str]
    # Response A and the feedback it received.
    response_a: str
    feedback_a: DialogFeedback
    # Response B and the feedback it received.
    response_b: str
    feedback_b: DialogFeedback


class SentenceFeedback(BaseModel):
    """Feedback recorded for one generated sentence.

    Two criteria are stored, each as a text field paired with an integer
    ``*_score`` field, plus a ``total_score``. Every field has a default
    (empty string or ``0``).
    """

    inclusion: str = ""
    inclusion_score: int = 0
    logical: str = ""
    logical_score: int = 0
    # Stored as its own field; this class does not derive it from the
    # per-criterion scores.
    total_score: int = 0


class SentenceIteration(BaseModel):
    """One generation attempt: its number, the sentence text and its feedback."""

    # Attempt number (presumably 1-based; other code in this file compares
    # ``best.n`` against 1 -- confirm against the generator).
    n: int = 0
    sentence: str = ""
    feedback: SentenceFeedback = SentenceFeedback()


class SentenceGenOutput(BaseModel):
    """One record of a sentence generation log (all fields required)."""

    # Input concepts as a list of strings.
    concepts: list[str]
    # The attempt kept as the final answer.
    best: SentenceIteration
    # NOTE(review): presumably the number of attempts made -- confirm.
    n: int
    # All attempts; element 0 is compared with ``best`` elsewhere in this file.
    iterations: list[SentenceIteration]


class SentenceEvalOutput(BaseModel):
    """One record of a sentence evaluation log comparing two sentences.

    The scoring helpers in this file report ``feedback_a`` as the "base"
    score and ``feedback_b`` as the "self-refine" score.
    """

    concepts: list[str]
    # Sentence A and the feedback it received.
    sentence_a: str
    feedback_a: SentenceFeedback
    # Sentence B and the feedback it received.
    sentence_b: str
    feedback_b: SentenceFeedback


class SentimentFeedback(BaseModel):
    """Feedback recorded for one reversed review.

    Two criteria are stored, each as a text field paired with an integer
    ``*_score`` field, plus a ``total_score``. Every field has a default
    (empty string or ``0``).
    """

    effective: str = ""
    effective_score: int = 0
    logical: str = ""
    logical_score: int = 0
    # Stored as its own field; this class does not derive it from the
    # per-criterion scores.
    total_score: int = 0


class SentimentIteration(BaseModel):
    """One generation attempt: its number, the reversed review and its feedback."""

    # Attempt number (presumably 1-based; other code in this file compares
    # ``best.n`` against 1 -- confirm against the generator).
    n: int = 0
    reversed_review: str = ""
    feedback: SentimentFeedback = SentimentFeedback()


class SentimentGenOutput(BaseModel):
    """One record of a sentiment-reversal generation log (all fields required)."""

    # Input review text.
    review: str
    # The attempt kept as the final answer.
    best: SentimentIteration
    # NOTE(review): presumably the number of attempts made -- confirm.
    n: int
    # All attempts; element 0 is compared with ``best`` elsewhere in this file.
    iterations: list[SentimentIteration]


class SentimentEvalOutput(BaseModel):
    """One record of a sentiment-reversal evaluation log comparing two outputs.

    The scoring helpers in this file report ``feedback_a`` as the "base"
    score and ``feedback_b`` as the "self-refine" score.
    """

    review: str
    # Reversed review A and the feedback it received.
    reversed_review_a: str
    feedback_a: SentimentFeedback
    # Reversed review B and the feedback it received.
    reversed_review_b: str
    feedback_b: SentimentFeedback

In [None]:
# Per-task settings, looked up by position throughout this notebook:
#   [0] schema used to parse generation-log lines
#   [1] schema used to parse evaluation-log lines
#   [2] name of the attribute holding the generated text (evaluation records
#       use the same name with "_a" / "_b" appended)
#   [3] divisor used to turn an average total score into a percentage
#       (presumably the maximum achievable total score -- confirm)
tasks = {
    "acronym": (
        AcronymGenOutput,
        AcronymEvalOutput,
        "acronym",
        40,
    ),
    "dialog": (
        DialogGenOutput,
        DialogEvalOutput,
        "response",
        30,
    ),
    "sentence": (
        SentenceGenOutput,
        SentenceEvalOutput,
        "sentence",
        20,
    ),
    "sentiment": (
        SentimentGenOutput,
        SentimentEvalOutput,
        "reversed_review",
        20,
    ),
}

# Result files per model, relative to outputs/<task>/.
# Each row is (model key, generation log, evaluation log); the resulting
# mapping is model key -> (generation log, evaluation log).
models = {
    model_key: (generation_log, evaluation_log)
    for model_key, generation_log, evaluation_log in (
        ("llama-2-7b", "llama-2-7b-f.jsonl", "llama-2-7b-f-e.jsonl"),
        ("mistral-7b", "mistral-7b-v0.1-f.jsonl", "mistral-7b-v0.1-f-e.jsonl"),
        ("openchat-3.5", "openchat-3.5-f.jsonl", "openchat-3.5-f-e.jsonl"),
        ("gpt-3.5", "gpt-3.5-f.jsonl", "gpt-3.5-f-e.jsonl"),
    )
}

# Evaluation logs per feedback setting, relative to outputs/<task>/.
# Outer key: feedback setting; inner key: model key; value: file name.
feedbacks = {
    "full feedback": {
        "llama-2-7b": "llama-2-7b-f-e.jsonl",
        "mistral-7b": "mistral-7b-v0.1-f-e.jsonl",
        "openchat-3.5": "openchat-3.5-f-e.jsonl",
        "gpt-3.5": "gpt-3.5-f-e.jsonl",
    },
    "generic feedback": {
        "llama-2-7b": "llama-2-7b-g-e.jsonl",
        "mistral-7b": "mistral-7b-v0.1-g-e.jsonl",
        "openchat-3.5": "openchat-3.5-g-e.jsonl",
        "gpt-3.5": "gpt-3.5-g-e.jsonl",
    },
    "no feedback": {
        "llama-2-7b": "llama-2-7b-m-e.jsonl",
        "mistral-7b": "mistral-7b-v0.1-m-e.jsonl",
        "openchat-3.5": "openchat-3.5-m-e.jsonl",
        "gpt-3.5": "gpt-3.5-m-e.jsonl",
    },
}

In [None]:
from pathlib import Path

import pandas as pd


def sum_up_scores_by_models(task, model):
    """Average the evaluation scores of one task/model pair as percentages.

    Reads ``outputs/<task>/<models[model][1]>`` line by line and parses every
    non-blank line with the task's evaluation schema (``tasks[task][1]``).

    Args:
        task: Key into ``tasks``.
        model: Key into ``models``.

    Returns:
        A ``(base_percent, refine_percent)`` tuple: the mean of
        ``feedback_a.total_score`` and of ``feedback_b.total_score``, each
        divided by ``tasks[task][3]`` and scaled to a percentage. Both values
        are NaN when the file contains no records.
    """
    eval_schema = tasks[task][1]
    count = 0
    base_total = 0
    refine_total = 0
    with open(Path("outputs") / task / models[model][1], mode="r") as f:
        for line in f:
            # A blank (e.g. trailing) line is not a record and would fail
            # JSON validation, so skip it instead of aborting the whole run.
            if not line.strip():
                continue
            obj = eval_schema.model_validate_json(line)
            count += 1
            base_total += obj.feedback_a.total_score
            refine_total += obj.feedback_b.total_score
    if count == 0:
        # No records: report NaN rather than dividing by zero, so one empty
        # file does not prevent the rest of a table from being built.
        return float("nan"), float("nan")
    base_percent = base_total / count * 100 / tasks[task][3]
    refine_percent = refine_total / count * 100 / tasks[task][3]
    return base_percent, refine_percent


def create_score_df_by_models():
    """Tabulate base and self-refine score percentages per task and model.

    Returns:
        A DataFrame indexed by task key with two string columns per model:
        ``"<model> (base)"`` holds the base percentage rounded to one decimal,
        and ``"<model> (self-refine)"`` holds the refined percentage followed,
        in parentheses, by its difference from the base.
    """
    rows = []
    for task_name in tasks:
        row = {}
        for model_name in models:
            base, refined = sum_up_scores_by_models(task_name, model_name)
            gain = round(refined - base, 1)
            row[f"{model_name} (base)"] = f"{round(base, 1)}"
            row[f"{model_name} (self-refine)"] = f"{round(refined, 1)} ({gain})"
        rows.append(row)
    return pd.DataFrame(rows, index=list(tasks))


def sum_up_scores_by_feedbacks(model, feedback, task="sentiment"):
    """Average the evaluation scores of one model under one feedback setting.

    Reads ``outputs/<task>/<feedbacks[feedback][model]>`` line by line and
    parses every non-blank line with the task's evaluation schema
    (``tasks[task][1]``).

    Args:
        model: Key into the inner mappings of ``feedbacks``.
        feedback: Key into ``feedbacks``.
        task: Key into ``tasks``. Defaults to ``"sentiment"``, the task this
            function was previously hard-wired to.

    Returns:
        A ``(base_percent, refine_percent)`` tuple: the mean of
        ``feedback_a.total_score`` and of ``feedback_b.total_score``, each
        divided by ``tasks[task][3]`` and scaled to a percentage. Both values
        are NaN when the file contains no records.
    """
    eval_schema = tasks[task][1]
    count = 0
    base_total = 0
    refine_total = 0
    with open(Path("outputs") / task / feedbacks[feedback][model], mode="r") as f:
        for line in f:
            # A blank (e.g. trailing) line is not a record and would fail
            # JSON validation, so skip it instead of aborting the whole run.
            if not line.strip():
                continue
            obj = eval_schema.model_validate_json(line)
            count += 1
            base_total += obj.feedback_a.total_score
            refine_total += obj.feedback_b.total_score
    if count == 0:
        # No records: report NaN rather than dividing by zero.
        return float("nan"), float("nan")
    base_percent = base_total / count * 100 / tasks[task][3]
    refine_percent = refine_total / count * 100 / tasks[task][3]
    return base_percent, refine_percent


def create_score_df_by_feedbacks():
    """Tabulate base and self-refine score percentages per model and feedback setting.

    Returns:
        A DataFrame indexed by model key with two string columns per feedback
        setting: ``"<feedback> (base)"`` holds the base percentage rounded to
        one decimal, and ``"<feedback> (self-refine)"`` holds the refined
        percentage followed, in parentheses, by its difference from the base.
    """
    rows = []
    for model_name in models:
        row = {}
        for setting in feedbacks:
            base, refined = sum_up_scores_by_feedbacks(model_name, setting)
            gain = round(refined - base, 1)
            row[f"{setting} (base)"] = f"{round(base, 1)}"
            row[f"{setting} (self-refine)"] = f"{round(refined, 1)} ({gain})"
        rows.append(row)
    return pd.DataFrame(rows, index=list(models))


def sum_up_refine_count_by_models(task, model):
    """Compute the percentage of records whose best attempt differs from the first.

    Reads ``outputs/<task>/<models[model][0]>`` line by line and parses every
    non-blank line with the task's generation schema (``tasks[task][0]``).
    A record counts as refined when ``best.n`` is not 1 and the text of the
    best attempt (attribute ``tasks[task][2]``) differs from the text of
    ``iterations[0]``.

    Args:
        task: Key into ``tasks``.
        model: Key into ``models``.

    Returns:
        The share of refined records as a percentage, or NaN when the file
        contains no records.
    """
    gen_schema = tasks[task][0]
    field = tasks[task][2]
    count = 0
    refine_count = 0
    with open(Path("outputs") / task / models[model][0], mode="r") as f:
        for line in f:
            # A blank (e.g. trailing) line is not a record and would fail
            # JSON validation, so skip it instead of aborting the whole run.
            if not line.strip():
                continue
            obj = gen_schema.model_validate_json(line)
            count += 1
            # Requiring different text excludes records where a later attempt
            # was selected but is textually identical to the first one.
            if obj.best.n != 1 and getattr(obj.iterations[0], field) != getattr(obj.best, field):
                refine_count += 1
    if count == 0:
        # No records: report NaN rather than dividing by zero.
        return float("nan")
    refine_percent = refine_count / count * 100
    return refine_percent


def create_refine_count_df_by_models():
    """Tabulate the refined-record percentage per task and model.

    Returns:
        A DataFrame indexed by task key with one string column per model key,
        each cell holding the percentage rounded to one decimal.
    """
    rows = [
        {
            model_name: f"{round(sum_up_refine_count_by_models(task_name, model_name), 1)}"
            for model_name in models
        }
        for task_name in tasks
    ]
    return pd.DataFrame(rows, index=list(tasks))

In [None]:
# Build the task-by-model score table and save it as CSV in the working directory.
df = create_score_df_by_models()
df.to_csv("score_by_models.csv")

In [None]:
# Build the model-by-feedback-setting score table and save it as CSV.
# Note: this rebinds the notebook-level name ``df``.
df = create_score_df_by_feedbacks()
df.to_csv("score_by_feedbacks.csv")

In [None]:
# Build the task-by-model table of refined-record percentages and save it as CSV.
# Note: this rebinds the notebook-level name ``df``.
df = create_refine_count_df_by_models()
df.to_csv("refine_counts.csv")

In [None]:
def fix_evaluation(task, model):
    """Write a corrected copy of one task/model evaluation log.

    The generation log (``models[model][0]``) and the evaluation log
    (``models[model][1]``) under ``outputs/<task>/`` are paired record by
    record. A record is a conflict when the generation log says the best
    attempt is not attempt 1 (``best.n != 1``) although its text (attribute
    ``tasks[task][2]``) equals the text of ``iterations[0]``. For such
    records the details are printed and ``feedback_b`` is replaced with
    ``feedback_a``. Every evaluation record, changed or not, is written to
    ``<evaluation log>.fix``; the input files are not modified.

    Args:
        task: Key into ``tasks``.
        model: Key into ``models``.

    Raises:
        ValueError: If the two logs hold a different number of non-blank
            lines. Nothing is written in that case.
    """
    gen_schema, eval_schema, field = tasks[task][:3]
    task_dir = Path("outputs") / task
    gen_name, eval_name = models[model][0], models[model][1]

    # Read both inputs before touching the output, so a mismatch cannot leave
    # a truncated or mis-paired ".fix" file behind.
    with open(task_dir / gen_name, mode="r") as g_f:
        gen_lines = [line for line in g_f if line.strip()]
    with open(task_dir / eval_name, mode="r") as e_f:
        # Keep the physical line number for the conflict report.
        eval_lines = [(number, line) for number, line in enumerate(e_f, start=1) if line.strip()]

    if len(gen_lines) != len(eval_lines):
        # Pairing by position is only meaningful when both logs cover the
        # same records; a plain zip would silently drop the surplus.
        raise ValueError(
            f"Record count mismatch in {task}/{model}: "
            f"{len(gen_lines)} generation records vs {len(eval_lines)} evaluation records."
        )

    count = 0
    with open(task_dir / (eval_name + ".fix"), mode="w") as w_f:
        for g_l, (line_number, e_l) in zip(gen_lines, eval_lines):
            g_obj = gen_schema.model_validate_json(g_l)
            e_obj = eval_schema.model_validate_json(e_l)
            if g_obj.best.n != 1 and getattr(g_obj.iterations[0], field) == getattr(g_obj.best, field):
                count += 1
                print(f"Found conflict in {task}/{model} ({count}). Line: {line_number}.", flush=True)
                print(f'A: {getattr(e_obj, field + "_a")}', flush=True)
                print(f'B: {getattr(e_obj, field + "_b")}', flush=True)
                print(f"Score A: {e_obj.feedback_a.total_score}", flush=True)
                print(f"Score B: {e_obj.feedback_b.total_score}", flush=True)
                # Identical text must carry identical feedback.
                e_obj.feedback_b = e_obj.feedback_a
            print(e_obj.model_dump_json(), file=w_f, flush=True)


def fix_all_evaluation():
    """Run ``fix_evaluation`` for every task/model combination, task-major."""
    combos = ((task_name, model_name) for task_name in tasks for model_name in models)
    for task_name, model_name in combos:
        fix_evaluation(task_name, model_name)


def detect_evaluation(task, model):
    """Print evaluation records whose two texts match but whose feedback differs.

    Reads ``outputs/<task>/<models[model][1]>`` and parses each line with the
    task's evaluation schema (``tasks[task][1]``). A record is reported when
    its ``<field>_a`` and ``<field>_b`` texts are equal (``<field>`` being
    ``tasks[task][2]``) while ``feedback_a`` and ``feedback_b`` are not.
    Nothing is returned and no file is written.

    Args:
        task: Key into ``tasks``.
        model: Key into ``models``.
    """
    found = 0
    with open(Path("outputs") / task / models[model][1], mode="r") as eval_file:
        for line_number, raw in enumerate(eval_file, start=1):
            record = tasks[task][1].model_validate_json(raw)
            field = tasks[task][2]
            text_a = getattr(record, field + "_a")
            text_b = getattr(record, field + "_b")
            if not (text_a == text_b and record.feedback_a != record.feedback_b):
                continue
            found += 1
            print(f"Found conflict in {task}/{model} ({found}). Line: {line_number}.", flush=True)
            print(f'A: {text_a}', flush=True)
            print(f'B: {text_b}', flush=True)
            print(f"Score A: {record.feedback_a.total_score}", flush=True)
            print(f"Score B: {record.feedback_b.total_score}", flush=True)


def detect_all_evaluation():
    """Run ``detect_evaluation`` for every task/model combination, task-major."""
    combos = ((task_name, model_name) for task_name in tasks for model_name in models)
    for task_name, model_name in combos:
        detect_evaluation(task_name, model_name)

In [None]:
# Scan every task/model evaluation log and print any records whose two texts
# match while their feedback differs. Read-only: no files are written.
detect_all_evaluation()