In [1]:
import fastrepl.repl as fastrepl

# Install a SQLite-backed cache as fastrepl's LLM cache.
# NOTE(review): the previous line assigned to a fresh, unused name
# (`fastrepllm_cache`), so the cache object was created but never attached.
# To turn caching off instead (presumably): fastrepl.llm_cache = None
fastrepl.llm_cache = fastrepl.cache.SQLiteCache()

In [2]:
from IPython.display import clear_output

In [3]:
from datasets import load_dataset

# Build the small dataset used for meta-evaluation: take the Yelp test split,
# shuffle it with a fixed seed, keep the first 30 rows, expose the review text
# as "input", and replace "label" with a "reference" column equal to label + 1
# (presumably shifting 0-based labels onto the 1-5 star scale — TODO confirm).
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)
    .select(range(30))
    .rename_column("text", "input")
    .map(
        lambda row: {"reference": row["label"] + 1, "input": row["input"]},
        remove_columns=["label"],
    )
)

In [4]:
# Label name -> human-readable description, passed as `labels=` to the
# classifier steps in the evaluation cells.
labels = dict(
    FIVE_STARS="given review is likely to be 5 stars",
    FOUR_STARS="given review is likely to be 4 stars",
    THREE_STARS="given review is likely to be 3 stars",
    TWO_STARS="given review is likely to be 2 stars",
    ONE_STAR="given review is likely to be 1 star",
)


def label_to_score(example):
    """Swap the label name stored under "prediction" for its numeric score.

    The mapping is UNKNOWN -> 0, ONE_STAR -> 1, ... FIVE_STARS -> 5.

    Args:
        example: mapping with a "prediction" entry holding one of the label
            names above. It is modified in place.

    Returns:
        The same `example` object, with "prediction" now an int.

    Raises:
        KeyError: if "prediction" is missing or is not a known label name.
    """
    predicted = example["prediction"]
    # Position in this tuple is the score assigned to the label.
    ordered_names = (
        "UNKNOWN",
        "ONE_STAR",
        "TWO_STARS",
        "THREE_STARS",
        "FOUR_STARS",
        "FIVE_STARS",
    )
    score_by_name = {name: score for score, name in enumerate(ordered_names)}
    example["prediction"] = score_by_name[predicted]
    return example

In [5]:
def print_metric(metric_name, predictions, references):
    """Compute one metric through fastrepl and print it as "<name>: <value>".

    Args:
        metric_name: name handed to `fastrepl.load_metric`; also the key read
            from the computed result.
        predictions: predicted values, forwarded to the metric's `compute`.
        references: reference values, forwarded to the metric's `compute`.

    Returns:
        None. The value is only printed.
    """
    result = fastrepl.load_metric(metric_name).compute(
        predictions=predictions,
        references=references,
    )
    print(f"{metric_name}: {result[metric_name]}")

In [8]:
# Reset this cell's output area before the run starts.
clear_output(wait=True)

# First evaluation: a pipeline with a single chain-of-thought classifier step.
# Bound to `evaluator` rather than `eval` so Python's built-in eval() is not
# shadowed for the rest of the kernel session.
evaluator = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMChainOfThoughtClassifier(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
            labels=labels,
        )
    ]
)

# Run the evaluator over the dataset, then turn predicted label names into
# numeric scores so they can be compared against the "reference" column.
result = fastrepl.LocalRunner(evaluator=evaluator, dataset=dataset).run()
result = result.map(label_to_score)

# Report each metric on the same prediction/reference pair.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result["prediction"], result["reference"])

Output()

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

accuracy: 0.26666666666666666
mse: 4.866666666666666
mae: 1.6666666666666667


In [9]:
# Reset this cell's output area before the run starts.
clear_output(wait=True)

# Second evaluation, aiming for more reliable results: the pipeline now has
# two steps, a chain-of-thought step followed by a separate classifier step.
# Bound to `evaluator` rather than `eval` so Python's built-in eval() is not
# shadowed for the rest of the kernel session.
evaluator = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMChainOfThought(
            model="gpt-3.5-turbo",
            labels=labels,
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
        ),
        fastrepl.LLMClassifier(
            model="gpt-3.5-turbo",
            labels=labels,
        ),
    ]
)

# Run the evaluator over the dataset, then turn predicted label names into
# numeric scores so they can be compared against the "reference" column.
result = fastrepl.LocalRunner(evaluator=evaluator, dataset=dataset).run()
result = result.map(label_to_score)

# Report each metric on the same prediction/reference pair.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result["prediction"], result["reference"])

Output()

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

accuracy: 0.5333333333333333
mse: 0.5666666666666667
mae: 0.5
