In [1]:
%pip install -qq "rich[jupyter]"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import fastrepl
from fastrepl.run.cache import SQLiteCache

# Leave fastrepl's cache unset for this run; the commented-out line below is
# the alternative that installs a SQLiteCache instead.
# NOTE(review): presumably `None` means results are not cached, so every
# evaluation calls the model again — confirm against fastrepl.
fastrepl.cache = None
# fastrepl.cache = SQLiteCache()

In [3]:
from IPython.display import clear_output

In [4]:
from datasets import load_dataset

def _with_star_reference(row):
    """Build the evaluation columns for one row: ``input`` and ``reference``.

    ``input`` is passed through unchanged and ``reference`` is the row's
    ``label`` plus one.
    NOTE(review): presumably ``label`` is a 0-based class index, which would
    make ``reference`` a 1-5 star rating — confirm against the dataset card.
    """
    return {"reference": row["label"] + 1, "input": row["input"]}


# Meta-evaluation set: shuffle the Yelp test split with a fixed seed, keep the
# first 30 rows, expose the review text as "input", and replace the "label"
# column with a "reference" column.
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)
    .select(range(30))
    .rename_column("text", "input")
    .map(_with_star_reference, remove_columns=["label"])
)

In [5]:
# Label name -> human-readable description of what the label means.
# Insertion order runs from five stars down to one star.
labels = dict(
    FIVE_STARS="given review is likely to be 5 stars",
    FOUR_STARS="given review is likely to be 4 stars",
    THREE_STARS="given review is likely to be 3 stars",
    TWO_STARS="given review is likely to be 2 stars",
    ONE_STAR="given review is likely to be 1 star",
)


def label_to_score(label):
    """Convert a classification label name into its numeric star score.

    ``"ONE_STAR"`` through ``"FIVE_STARS"`` map to 1-5 and ``"UNKNOWN"`` maps
    to 0. Any other label raises ``KeyError``.
    """
    # A label's position in this tuple is its score, so "UNKNOWN" is index 0.
    ordered_labels = (
        "UNKNOWN",
        "ONE_STAR",
        "TWO_STARS",
        "THREE_STARS",
        "FOUR_STARS",
        "FIVE_STARS",
    )
    score_by_label = {name: score for score, name in enumerate(ordered_labels)}
    return score_by_label[label]

In [6]:
from fastrepl.eval.metric import load_metric


def print_metric(metric_name, predictions, references):
    """Compute one metric over predictions/references and print it.

    Args:
        metric_name: Name handed to ``load_metric``; the same name is used to
            index the computed result.
        predictions: Predicted values, forwarded to the metric's ``compute``.
        references: Reference values, forwarded to the metric's ``compute``.

    Prints a single line of the form ``<metric_name>: <value>``; returns
    nothing.
    """
    result = load_metric(metric_name).compute(
        references=references,
        predictions=predictions,
    )
    print(f"{metric_name}: {result[metric_name]}")

In [7]:
clear_output(wait=True)

from fastrepl.eval.model import LLMChainOfThoughtClassifier
from fastrepl.eval import Evaluator
from fastrepl.loop import REPL

# Let's define our first eval: a single chain-of-thought classifier step that
# picks one of `labels` for each review.
# Bound to `evaluator` rather than `eval` so the builtin `eval()` is not
# shadowed for the rest of the kernel session.
evaluator = Evaluator(
    pipeline=[
        LLMChainOfThoughtClassifier(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
            labels=labels,
        )
    ]
)

with REPL() as repl:
    repl.set_evaluator(evaluator)

    # The evaluator yields label names; convert them to numeric scores so they
    # can be compared with the numeric references.
    _predictions = repl.eval(dataset["input"])
    predictions = [label_to_score(label) for label in _predictions]

    # Read the reference column once and reuse it for every metric.
    references = dataset["reference"]
    for metric_name in ("accuracy", "mse", "mae"):
        print_metric(metric_name, predictions, references)

Output()

accuracy: 0.3333333333333333
mse: 5.633333333333334
mae: 1.7


In [8]:
clear_output(wait=True)

# Let's see if we can get more reliable results
from fastrepl.eval.model import LLMChainOfThought, LLMClassifier

# Two-step pipeline: a chain-of-thought step (which receives the task context)
# followed by a separate classifier step over the same `labels`.
# Bound to `evaluator` rather than `eval` so the builtin `eval()` is not
# shadowed for the rest of the kernel session.
evaluator = Evaluator(
    pipeline=[
        LLMChainOfThought(
            model="gpt-3.5-turbo",
            labels=labels,
            context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
        ),
        LLMClassifier(
            model="gpt-3.5-turbo",
            labels=labels,
        ),
    ]
)

with REPL() as repl:
    repl.set_evaluator(evaluator)

    # The evaluator yields label names; convert them to numeric scores so they
    # can be compared with the numeric references.
    _predictions = repl.eval(dataset["input"])
    predictions = [label_to_score(label) for label in _predictions]

    # Read the reference column once and reuse it for every metric.
    references = dataset["reference"]
    for metric_name in ("accuracy", "mse", "mae"):
        print_metric(metric_name, predictions, references)

Output()

accuracy: 0.5666666666666667
mse: 1.4
mae: 0.7333333333333333
