In [1]:
import fastrepl.repl as fastrepl

# Session-wide fastrepl configuration, applied once before any evaluator is built.
# NOTE(review): presumably disabling the LLM cache forces every run to issue fresh
# model calls instead of replaying stored completions — confirm against fastrepl docs.
fastrepl.LLMCache.disable()
# NOTE(review): DEBUG(0) looks like it sets the debug/verbosity level to 0 — confirm.
fastrepl.DEBUG(0)

In [2]:
from IPython.display import clear_output

In [3]:
from datasets import load_dataset

# Meta-eval dataset: a fixed 30-row sample of the Yelp test split.
# Each row ends up with two columns:
#   input     - the review text (renamed from "text")
#   reference - the original "label" plus one (presumably mapping 0-based labels
#               onto a 1-5 star scale; verify against the dataset card)
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)  # fixed seed so the same 30 rows are picked on every run
    .select(range(30))
    .rename_column("text", "input")
    .map(
        lambda row: {"reference": row["label"] + 1, "input": row["input"]},
        remove_columns=["label"],
    )
)

In [10]:
# Label name -> description handed to the fastrepl classifiers.
# Insertion order (five stars first, one star last) is kept as-is.
labels = dict(
    FIVE_STARS="given review is likely to be 5 stars",
    FOUR_STARS="given review is likely to be 4 stars",
    THREE_STARS="given review is likely to be 3 stars",
    TWO_STARS="given review is likely to be 2 stars",
    ONE_STAR="given review is likely to be 1 star",
)


def label_to_score(example):
    """Replace the label name in ``example["prediction"]`` with a star count.

    The mapping is ``"ONE_STAR"`` -> 1 through ``"FIVE_STARS"`` -> 5. A
    prediction of ``None`` becomes 0.

    The row is modified in place and the same object is returned.

    Raises:
        KeyError: if ``"prediction"`` is missing, or holds a label name that
            is not one of the five known labels.
    """
    stars_by_label = {
        "ONE_STAR": 1,
        "TWO_STARS": 2,
        "THREE_STARS": 3,
        "FOUR_STARS": 4,
        "FIVE_STARS": 5,
    }

    predicted = example["prediction"]
    # The lookup runs before the assignment, so an unknown label leaves the row untouched.
    example["prediction"] = 0 if predicted is None else stars_by_label[predicted]
    return example

In [5]:
def print_metric(metric_name, predictions, references):
    """Compute one fastrepl metric and print it as ``<metric_name>: <value>``.

    Args:
        metric_name: name passed to ``fastrepl.load_metric``; the same name is
            used as the key to read the value out of the computed result.
        predictions: predicted values, forwarded to ``compute``.
        references: ground-truth values, forwarded to ``compute``.
    """
    scores = fastrepl.load_metric(metric_name).compute(
        predictions=predictions, references=references
    )
    value = scores[metric_name]
    print(f"{metric_name}: {value}")

In [9]:
clear_output(wait=True)

# First evaluation: a pipeline with a single LLMChainOfThoughtClassifier step.
# The evaluator is bound to `evaluator` rather than `eval` so that the built-in
# eval() is not shadowed in the notebook namespace.
evaluator = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMChainOfThoughtClassifier(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
            labels=labels,
        )
    ]
)

result1 = fastrepl.LocalRunner(evaluator=evaluator, dataset=dataset).run()
# Turn label names into integer scores so they can be compared with `reference`.
result1 = result1.map(label_to_score)

# Report every metric against the same prediction/reference columns.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result1["prediction"], result1["reference"])

result1.to_pandas()

Output()

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

accuracy: 0.5666666666666667
mse: 0.43333333333333335
mae: 0.43333333333333335


Unnamed: 0,input,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,5
1,We tried out the lunch specials and found them...,4,4
2,Should have known better than to eat in a plac...,1,1
3,This place has sure changed...and not for the ...,2,2
4,I've been bringing my son to the owner Michael...,5,5
5,The trip to the location takes two busses and ...,1,2
6,This restaurant was suggested to me by a frien...,4,5
7,This place is unique because you are sitting o...,3,3
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,4
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,4


In [13]:
clear_output(wait=True)

# Second evaluation: the pipeline is split into an LLMChainOfThought step
# followed by a separate LLMClassifier step, to see whether results are more reliable.
# The evaluator is bound to `evaluator` rather than `eval` so that the built-in
# eval() is not shadowed in the notebook namespace.
evaluator = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMChainOfThought(
            model="gpt-3.5-turbo",
            labels=labels,
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
        ),
        fastrepl.LLMClassifier(
            model="gpt-3.5-turbo",
            labels=labels,
        ),
    ]
)

result2 = fastrepl.LocalRunner(evaluator=evaluator, dataset=dataset).run()
# Turn label names into integer scores so they can be compared with `reference`.
result2 = result2.map(label_to_score)

# Report every metric against the same prediction/reference columns.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result2["prediction"], result2["reference"])

result2.to_pandas()

Output()

2023-08-31 10:17:53,517 - 11390955520 - _common.py-_common:105 - INFO: Backing off completion(...) for 0.3s (fastrepl.llm.RetryConstantException)


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

accuracy: 0.4666666666666667
mse: 0.8
mae: 0.6


Unnamed: 0,input,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,5
1,We tried out the lunch specials and found them...,4,4
2,Should have known better than to eat in a plac...,1,1
3,This place has sure changed...and not for the ...,2,2
4,I've been bringing my son to the owner Michael...,5,5
5,The trip to the location takes two busses and ...,1,1
6,This restaurant was suggested to me by a frien...,4,5
7,This place is unique because you are sitting o...,3,3
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,3
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,5
