In [10]:
%pip install -qq "rich[jupyter]"

Note: you may need to restart the kernel to use updated packages.


In [11]:
from IPython.display import clear_output

In [12]:
from datasets import load_dataset

# Evaluation set for the meta-eval: a fixed-seed sample of 100 Yelp test reviews.
def _to_eval_row(row):
    """Return the row's `input` unchanged plus `reference`, set to `label + 1`."""
    return {"reference": row["label"] + 1, "input": row["input"]}


dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)
    .select(range(100))
    .rename_column("text", "input")  # fastrepl needs an `input` column
    .map(_to_eval_row, remove_columns=["label"])
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
# Label names handed to the LLM evaluators, each paired with its description.
labels = dict(
    FIVE_STARS="given review is likely to be 5 stars",
    FOUR_STARS="given review is likely to be 4 stars",
    THREE_STARS="given review is likely to be 3 stars",
    TWO_STARS="given review is likely to be 2 stars",
    ONE_STAR="given review is likely to be 1 star",
)

In [14]:
import fastrepl
from fastrepl.run.cache import SQLiteCache

# Install an SQLite-backed cache on fastrepl (presumably so repeated runs can
# reuse earlier LLM responses instead of re-querying — confirm in fastrepl docs).
# To turn caching off instead, assign: fastrepl.cache = None
fastrepl.cache = SQLiteCache()

In [15]:
clear_output(wait=True)

from fastrepl.eval.model import LLMChainOfThoughtClassifier
from fastrepl.loop import Evaluator

# Experiment 1: a one-step pipeline made of a single LLMChainOfThoughtClassifier.
cot_classifier = LLMChainOfThoughtClassifier(
    labels=labels,
    context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
    model="gpt-3.5-turbo",
)

# The evaluator writes each prediction into the `output` column of the result.
result_1 = Evaluator(
    pipeline=[cot_classifier],
    dataset=dataset,
    prediction_feature="output",
).run()

result_1

Output()

Dataset({
    features: ['input', 'reference', 'output'],
    num_rows: 100
})

In [16]:
clear_output(wait=True)

# Experiment 2: check whether results get more reliable when the pipeline is
# split into an LLMChainOfThought step followed by a separate LLMClassifier step.
from fastrepl.eval.model import LLMChainOfThought, LLMClassifier

reasoning_step = LLMChainOfThought(
    context="You will get a input text from Yelp review. Please rate it from 1 to 5 stars.",
    labels=labels,
    model="gpt-3.5-turbo",
)
classification_step = LLMClassifier(labels=labels, model="gpt-3.5-turbo")

# Same dataset and output column as the first experiment, so the two runs are comparable.
result_2 = Evaluator(
    pipeline=[reasoning_step, classification_step],
    dataset=dataset,
    prediction_feature="output",
).run()

result_2

Output()

Dataset({
    features: ['input', 'reference', 'output'],
    num_rows: 100
})

In [17]:
# Numeric star value for each label string found in the `output` column.
# "UNKNOWN" maps to 0, which lies outside the 1-5 values of the real labels, so
# every unknown prediction counts as a miss and adds to the error metrics.
# NOTE(review): "UNKNOWN" is presumably what fastrepl emits when no label could
# be chosen — confirm against the fastrepl docs.
_LABEL_TO_STARS = {
    "FIVE_STARS": 5,
    "FOUR_STARS": 4,
    "THREE_STARS": 3,
    "TWO_STARS": 2,
    "ONE_STAR": 1,
    "UNKNOWN": 0,
}


def mapper(row):
    """Add a numeric ``prediction`` to ``row`` based on its ``output`` label.

    Args:
        row: Mapping with an ``output`` key holding a label name.

    Returns:
        The same ``row`` object, with ``row["prediction"]`` set to the label's
        star value. Any output that is not a known label (including ``None``)
        gets the ``"UNKNOWN"`` value, 0, instead of raising ``KeyError`` and
        aborting the whole ``Dataset.map`` call.
    """
    row["prediction"] = _LABEL_TO_STARS.get(row["output"], _LABEL_TO_STARS["UNKNOWN"])
    return row


# Attach the numeric `prediction` column to both experiment results.
result_1, result_2 = [experiment.map(mapper) for experiment in (result_1, result_2)]

result_1, result_2

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

(Dataset({
     features: ['input', 'reference', 'output', 'prediction'],
     num_rows: 100
 }),
 Dataset({
     features: ['input', 'reference', 'output', 'prediction'],
     num_rows: 100
 }))

In [18]:
from fastrepl.eval.metric import load_metric


def print_metric(ds, metric_name):
    """Compute one metric for ``ds`` and print it as ``"<metric_name>: <value>"``.

    Args:
        ds: Dataset-like object indexable by column name; must provide the
            ``prediction`` and ``reference`` columns.
        metric_name: Name passed to ``load_metric``; also the key under which
            the value is read from the computed result.
    """
    metric = load_metric(metric_name)
    # Take the references from `ds` itself, not from the notebook-level
    # `dataset`, so predictions and references always come from the same rows.
    result = metric.compute(
        predictions=ds["prediction"], references=ds["reference"]
    )
    print(f"{metric_name}: {result[metric_name]}")


# Print accuracy, MSE and MAE for each experiment, in the same order for both.
for heading, experiment in (
    ("First experiment's metric compared to ground truth:", result_1),
    ("\nSecond experiment's metric compared to ground truth:", result_2),
):
    print(heading)
    for metric_name in ("accuracy", "mse", "mae"):
        print_metric(experiment, metric_name)

First experiment's metric compared to ground truth:
accuracy: 0.33
mse: 5.55
mae: 1.69

Second experiment's metric compared to ground truth:
accuracy: 0.39
mse: 0.91
mae: 0.71
