In [1]:
from datasets import load_dataset

# We have dataset for meta-eval
dataset = load_dataset("yelp_review_full", split="test")
dataset = dataset.shuffle(seed=8)
dataset = dataset.select(range(30))
dataset = dataset.rename_column("text", "input")  # fastrepl need input column
dataset = dataset.map(
    lambda row: {"reference": row["label"] + 1, "input": row["input"]},
    remove_columns=["label"],
)

for row in dataset:
    print(f"{row['reference']}: {row['input']}")

5: Stayed at the Wm Penn down the street for a wedding and needed a last minute haircut. Friendly, helpful, all around good cut!
4: We tried out the lunch specials and found them to be pretty good.  For about $6 we both got healthy portions of spicy food along with fairly good service.  Take a chance and go for the hotter dishes.  :-)
1: Should have known better than to eat in a place so dark that you can't see the food on your plate. Underwhelming and overpriced.
2: This place has sure changed...and not for the better. they totally re-did the menu and got rid of many yummy things they used to serve.\nThere are still the desserts but the little sliders are off the menu and those were my favorite!\nSo I bring my little guy here for a little dinner & chocolate treat. Our server tells me she is also a mom and whatnot. Ok lady..lets get this this started...I am with a 2 year old. I might be crazy but when the server says she is a parent I expect them to understand & TRY to be quick with th

In [2]:
labels = {
    "FIVE_STARS": "given review is likely to be 5 stars",
    "FOUR_STARS": "given review is likely to be 4 stars",
    "THREE_STARS": "given review is likely to be 3 stars",
    "TWO_STARS": "given review is likely to be 2 stars",
    "ONE_STAR": "given review is likely to be 1 star",
}


def mapper(row):
    label_value_mapping = {
        "FIVE_STARS": 5,
        "FOUR_STARS": 4,
        "THREE_STARS": 3,
        "TWO_STARS": 2,
        "ONE_STAR": 1,
        "UNKNOWN": 0,
    }

    row["prediction"] = label_value_mapping[row["output"]]
    return row

In [3]:
import fastrepl
from fastrepl.run.cache import SQLiteCache

fastrepl.cache = SQLiteCache()

In [4]:
from fastrepl.eval.model import LLMChainOfThoughtClassifier
from fastrepl.loop import Evaluator

# Let's define our first eval
result_1 = Evaluator(
    dataset=dataset,
    evals=[
        LLMChainOfThoughtClassifier(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
            labels=labels,
        )
    ],
    prediction_feature="output",
).run()

result_1 = result_1.map(mapper)
print(result_1, "\n")

for row in result_1:
    print(f"{row['prediction']}: {row['input']}")

Output()

Dataset({
    features: ['input', 'reference', 'output', 'prediction'],
    num_rows: 30
}) 

5: Stayed at the Wm Penn down the street for a wedding and needed a last minute haircut. Friendly, helpful, all around good cut!
0: We tried out the lunch specials and found them to be pretty good.  For about $6 we both got healthy portions of spicy food along with fairly good service.  Take a chance and go for the hotter dishes.  :-)
1: Should have known better than to eat in a place so dark that you can't see the food on your plate. Underwhelming and overpriced.
0: This place has sure changed...and not for the better. they totally re-did the menu and got rid of many yummy things they used to serve.\nThere are still the desserts but the little sliders are off the menu and those were my favorite!\nSo I bring my little guy here for a little dinner & chocolate treat. Our server tells me she is also a mom and whatnot. Ok lady..lets get this this started...I am with a 2 year old. I might be crazy 

In [5]:
# Let's see if we can get more reliable results
from fastrepl.eval.model import LLMChainOfThought, LLMClassifier

result_2 = Evaluator(
    dataset=dataset,
    evals=[
        LLMChainOfThought(
            model="gpt-3.5-turbo",
            labels=labels,
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
        ),
        LLMClassifier(
            model="gpt-3.5-turbo",
            labels=labels,
        ),
    ],
    prediction_feature="output",
).run()


result_2 = result_2.map(mapper)
print(result_2, "\n")

for row in result_1:
    print(f"{row['prediction']}: {row['input']}")

Output()

Dataset({
    features: ['input', 'reference', 'output', 'prediction'],
    num_rows: 30
}) 

5: Stayed at the Wm Penn down the street for a wedding and needed a last minute haircut. Friendly, helpful, all around good cut!
0: We tried out the lunch specials and found them to be pretty good.  For about $6 we both got healthy portions of spicy food along with fairly good service.  Take a chance and go for the hotter dishes.  :-)
1: Should have known better than to eat in a place so dark that you can't see the food on your plate. Underwhelming and overpriced.
0: This place has sure changed...and not for the better. they totally re-did the menu and got rid of many yummy things they used to serve.\nThere are still the desserts but the little sliders are off the menu and those were my favorite!\nSo I bring my little guy here for a little dinner & chocolate treat. Our server tells me she is also a mom and whatnot. Ok lady..lets get this this started...I am with a 2 year old. I might be crazy 

In [6]:
from fastrepl.eval.metric import load_metric

accuracy = load_metric("accuracy")
mse = load_metric("mse")
mae = load_metric("mae")

In [7]:
from fastrepl.eval.metric import load_metric

accuracy = load_metric("accuracy")

print("First experiment's metric compared to ground truth:")
print(
    f"Accuracy: {accuracy.compute(predictions=result_1['prediction'], references=dataset['reference'])['accuracy']}"
)
print(
    f"MSE: {mse.compute(predictions=result_1['prediction'], references=dataset['reference'])['mse']}"
)
print(
    f"MAE: {mae.compute(predictions=result_1['prediction'], references=dataset['reference'])['mae']}"
)

print("\nSecond experiment's metric compared to ground truth:")
print(
    f"Accuracy: {accuracy.compute(predictions=result_2['prediction'], references=dataset['reference'])['accuracy']}"
)
print(
    f"MSE: {mse.compute(predictions=result_2['prediction'], references=dataset['reference'])['mse']}"
)
print(
    f"MAE: {mae.compute(predictions=result_2['prediction'], references=dataset['reference'])['mae']}"
)

First experiment's metric compared to ground truth:
Accuracy: 0.3333333333333333
MSE: 4.933333333333334
MAE: 1.6

Second experiment's metric compared to ground truth:
Accuracy: 0.5
MSE: 1.2666666666666666
MAE: 0.7333333333333333
