In [1]:
from datasets import load_dataset

# Meta-eval dataset: a fixed-seed shuffle of the Yelp test split, cut to 20 rows.
# Each row ends up with an "input" column (the review text) and a "stars"
# column computed as label + 1; the raw "label" column is dropped.
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)
    .select(range(20))
    .rename_column("text", "input")  # fastrepl needs an "input" column
    .map(
        lambda example: {"stars": example["label"] + 1, "input": example["input"]},
        remove_columns=["label"],
    )
)


def render_row(row, label_key: str, text_key: str):
    """Format one row as ``"[<label> stars]: <text>"``.

    Args:
        row: Mapping-like row that is indexed by ``label_key`` and ``text_key``.
        label_key: Key whose value is shown inside the square brackets.
        text_key: Key whose value is shown after the colon.

    Returns:
        The formatted single-line string.
    """
    return f"[{row[label_key]} stars]: {row[text_key]}"


# Preview every example with its ground-truth star rating.
for example in dataset:
    preview_line = render_row(example, label_key="stars", text_key="input")
    print(preview_line)

[5 stars]: Stayed at the Wm Penn down the street for a wedding and needed a last minute haircut. Friendly, helpful, all around good cut!
[4 stars]: We tried out the lunch specials and found them to be pretty good.  For about $6 we both got healthy portions of spicy food along with fairly good service.  Take a chance and go for the hotter dishes.  :-)
[1 stars]: Should have known better than to eat in a place so dark that you can't see the food on your plate. Underwhelming and overpriced.
[2 stars]: This place has sure changed...and not for the better. they totally re-did the menu and got rid of many yummy things they used to serve.\nThere are still the desserts but the little sliders are off the menu and those were my favorite!\nSo I bring my little guy here for a little dinner & chocolate treat. Our server tells me she is also a mom and whatnot. Ok lady..lets get this this started...I am with a 2 year old. I might be crazy but when the server says she is a parent I expect them to unde

In [2]:
from fastrepl.eval.model import LLMChainOfThoughtClassifier
from fastrepl.loop import Evaluator

# First eval: a single LLMChainOfThoughtClassifier run over `dataset`.
cot_star_classifier = LLMChainOfThoughtClassifier(
    context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
    labels={
        "FIVE_STARS": "5 stars",
        "FOUR_STARS": "4 stars",
        "THREE_STARS": "3 stars",
        "TWO_STARS": "2 stars",
        "ONE_STAR": "1 star",
    },
    model="gpt-3.5-turbo",
)
result_1 = Evaluator(dataset=dataset, evals=[cot_star_classifier]).run()

print(result_1, "\n")


def mapper(row: dict) -> dict:
    """Convert a row's label-string ``output`` into a numeric star rating.

    Args:
        row: Row with ``"input"``, ``"output"`` and ``"stars"`` keys, where
            ``"output"`` is one of the label names listed below.

    Returns:
        A dict with the same ``"input"`` and ``"stars"`` values and
        ``"output"`` replaced by an int from 1 to 5, or 0 when the label is
        ``"UNKNOWN"`` or is not in the table at all.
    """
    stars_by_label = {
        "FIVE_STARS": 5,
        "FOUR_STARS": 4,
        "THREE_STARS": 3,
        "TWO_STARS": 2,
        "ONE_STAR": 1,
        "UNKNOWN": 0,
    }
    return {
        "input": row["input"],
        # Unrecognised labels (e.g. None) share the UNKNOWN bucket instead of
        # raising KeyError and aborting the whole Dataset.map call.
        "output": stars_by_label.get(row["output"], 0),
        "stars": row["stars"],
    }


# Swap the label strings in "output" for numeric ratings, then preview the
# predicted rating next to each review.
result_1 = result_1.map(mapper)

for predicted in result_1:
    preview_line = render_row(predicted, label_key="output", text_key="input")
    print(preview_line)

Output()

Dataset({
    features: ['input', 'stars', 'output'],
    num_rows: 20
}) 



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

[5 stars]: Stayed at the Wm Penn down the street for a wedding and needed a last minute haircut. Friendly, helpful, all around good cut!
[0 stars]: We tried out the lunch specials and found them to be pretty good.  For about $6 we both got healthy portions of spicy food along with fairly good service.  Take a chance and go for the hotter dishes.  :-)
[0 stars]: Should have known better than to eat in a place so dark that you can't see the food on your plate. Underwhelming and overpriced.
[0 stars]: This place has sure changed...and not for the better. they totally re-did the menu and got rid of many yummy things they used to serve.\nThere are still the desserts but the little sliders are off the menu and those were my favorite!\nSo I bring my little guy here for a little dinner & chocolate treat. Our server tells me she is also a mom and whatnot. Ok lady..lets get this this started...I am with a 2 year old. I might be crazy but when the server says she is a parent I expect them to unde

In [3]:
# Let's see if we can get more reliable results
from fastrepl.eval.model import LLMChainOfThought, LLMClassifier

# One label set shared by both evals in this run.
labels = dict(
    FIVE_STARS="5 stars",
    FOUR_STARS="4 stars",
    THREE_STARS="3 stars",
    TWO_STARS="2 stars",
    ONE_STAR="1 star",
)

# Second eval: an LLMChainOfThought entry followed by an LLMClassifier entry.
two_step_evals = [
    LLMChainOfThought(
        context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
        labels=labels,
        model="gpt-3.5-turbo",
    ),
    LLMClassifier(labels=labels, model="gpt-3.5-turbo"),
]
result_2 = Evaluator(dataset=dataset, evals=two_step_evals).run()


print(result_2, "\n")


def mapper(row: dict) -> dict:
    """Convert a row's label-string ``output`` into a numeric star rating.

    Args:
        row: Row with ``"input"``, ``"output"`` and ``"stars"`` keys, where
            ``"output"`` is one of the label names listed below.

    Returns:
        A dict with the same ``"input"`` and ``"stars"`` values and
        ``"output"`` replaced by an int from 1 to 5, or 0 when the label is
        ``"UNKNOWN"`` or is not in the table at all.
    """
    stars_by_label = {
        "FIVE_STARS": 5,
        "FOUR_STARS": 4,
        "THREE_STARS": 3,
        "TWO_STARS": 2,
        "ONE_STAR": 1,
        "UNKNOWN": 0,
    }
    return {
        "input": row["input"],
        # Unrecognised labels (e.g. None) share the UNKNOWN bucket instead of
        # raising KeyError and aborting the whole Dataset.map call.
        "output": stars_by_label.get(row["output"], 0),
        "stars": row["stars"],
    }


# Swap the label strings in "output" for numeric ratings, then preview the
# predicted rating next to each review.
result_2 = result_2.map(mapper)

for predicted in result_2:
    preview_line = render_row(predicted, label_key="output", text_key="input")
    print(preview_line)

Output()

Dataset({
    features: ['input', 'stars', 'output'],
    num_rows: 20
}) 



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

[5 stars]: Stayed at the Wm Penn down the street for a wedding and needed a last minute haircut. Friendly, helpful, all around good cut!
[4 stars]: We tried out the lunch specials and found them to be pretty good.  For about $6 we both got healthy portions of spicy food along with fairly good service.  Take a chance and go for the hotter dishes.  :-)
[2 stars]: Should have known better than to eat in a place so dark that you can't see the food on your plate. Underwhelming and overpriced.
[2 stars]: This place has sure changed...and not for the better. they totally re-did the menu and got rid of many yummy things they used to serve.\nThere are still the desserts but the little sliders are off the menu and those were my favorite!\nSo I bring my little guy here for a little dinner & chocolate treat. Our server tells me she is also a mom and whatnot. Ok lady..lets get this this started...I am with a 2 year old. I might be crazy but when the server says she is a parent I expect them to unde

In [4]:
# Collect the "output" column from each run for scoring.
eval1_predictions, eval2_predictions = result_1["output"], result_2["output"]

In [5]:
# Display both prediction lists together (first eval, then second eval).
(eval1_predictions, eval2_predictions)

([5, 0, 0, 0, 5, 1, 0, 0, 3, 0, 0, 5, 4, 1, 0, 5, 0, 0, 5, 0],
 [5, 4, 2, 2, 5, 1, 4, 3, 4, 4, 5, 5, 4, 1, 1, 5, 5, 4, 5, 4])

In [6]:
from fastrepl.eval.metric import load_metric

# Metric object whose compute(predictions=..., references=...) is used below
# to score each evaluator's predictions against the dataset's stars.
accuracy = load_metric("accuracy")

In [7]:
# Ground-truth star ratings from the meta-eval dataset.
references = dataset["stars"]

# Print "<run number> <accuracy dict>" for each evaluator, in run order.
for run_number, run_predictions in enumerate((eval1_predictions, eval2_predictions), start=1):
    print(run_number, accuracy.compute(predictions=run_predictions, references=references))

1 {'accuracy': 0.35}
2 {'accuracy': 0.7}
