In [1]:
import fastrepl.repl as fastrepl

# Disable fastrepl's LLM cache for this session (presumably so every run below
# issues fresh model calls instead of reusing stored responses — TODO confirm).
fastrepl.LLMCache.disable()
# Call fastrepl's DEBUG switch with 0 (presumably the quietest debug level —
# TODO confirm against the fastrepl docs).
fastrepl.DEBUG(0)

In [2]:
from IPython.display import clear_output

In [3]:
from datasets import load_dataset

# Meta-evaluation set: a fixed-seed sample of 30 rows from the Yelp test split.
# "text" is renamed to "input", and "label" is replaced by "reference" = label + 1
# (presumably shifting 0-based class ids onto the 1-5 scale the graders use —
# verify against the dataset card).
dataset = (
    load_dataset("yelp_review_full", split="test")
    .shuffle(seed=8)
    .select(range(30))
    .rename_column("text", "input")
    .map(
        lambda row: {"reference": row["label"] + 1, "input": row["input"]},
        remove_columns=["label"],
    )
)

In [4]:
def to_number(example):
    """Coerce ``example["prediction"]`` to a float, in place.

    A ``None`` prediction is reported on stdout and replaced by 0 before the
    conversion. The same (mutated) mapping that was passed in is returned,
    which is the shape ``Dataset.map`` expects from a row function.
    """
    raw = example["prediction"]
    if raw is None:
        print("None detected, setting to 0")
        raw = 0
    example["prediction"] = float(raw)
    return example


def print_metric(metric_name, predictions, references):
    """Compute one fastrepl metric and print it as ``<metric_name>: <value>``.

    The metric is loaded by name via ``fastrepl.load_metric``; the printed
    value is read from the compute result under the key ``metric_name``.
    Nothing is returned.
    """
    result = fastrepl.load_metric(metric_name).compute(
        predictions=predictions,
        references=references,
    )
    print(f"{metric_name}: {result[metric_name]}")

In [5]:
clear_output(wait=True)

# Baseline: a single LLM grading head scoring each review from 1 to 5,
# with no reference examples supplied.
eval1 = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMGradingHead(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Grade user's satisfaction from 1 to 5.",
            number_from=1,
            number_to=5,
            position_debias_strategy="shuffle",
        )
    ]
)

# Grade the sampled reviews, then coerce every prediction to a float.
result1 = fastrepl.LocalRunner(dataset=dataset, evaluator=eval1).run()
result1 = result1.map(to_number)

# Compare predictions against the dataset's "reference" column.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result1["prediction"], result1["reference"])

result1.to_pandas()

Output()

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

accuracy: 0.36666666666666664
mse: 1.6333333333333333
mae: 0.9


Unnamed: 0,input,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,5.0
1,We tried out the lunch specials and found them...,4,5.0
2,Should have known better than to eat in a plac...,1,1.0
3,This place has sure changed...and not for the ...,2,1.0
4,I've been bringing my son to the owner Michael...,5,5.0
5,The trip to the location takes two busses and ...,1,2.0
6,This restaurant was suggested to me by a frien...,4,1.0
7,This place is unique because you are sitting o...,3,4.0
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,3.0
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,1.0


In [6]:
clear_output(wait=True)

# Same LLMGradingHead configuration as the baseline, plus two reference
# examples passed via `references`. (This cell does not use the COT head;
# that variant is evaluated separately.)
eval2 = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMGradingHead(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Grade user's satisfaction from 1 to 5.",
            number_from=1,
            number_to=5,
            position_debias_strategy="shuffle",
            references=[("this is the best!", "5"), ("this is the worst!", "1")],
        )
    ]
)

# Grade the sampled reviews, then coerce every prediction to a float.
result2 = fastrepl.LocalRunner(dataset=dataset, evaluator=eval2).run()
result2 = result2.map(to_number)

# Compare predictions against the dataset's "reference" column.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result2["prediction"], result2["reference"])

result2.to_pandas()

Output()

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

accuracy: 0.6333333333333333
mse: 0.36666666666666664
mae: 0.36666666666666664


Unnamed: 0,input,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,4.0
1,We tried out the lunch specials and found them...,4,4.0
2,Should have known better than to eat in a plac...,1,2.0
3,This place has sure changed...and not for the ...,2,2.0
4,I've been bringing my son to the owner Michael...,5,5.0
5,The trip to the location takes two busses and ...,1,1.0
6,This restaurant was suggested to me by a frien...,4,4.0
7,This place is unique because you are sitting o...,3,3.0
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,3.0
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,4.0


In [7]:
clear_output(wait=True)

# COT variant: LLMGradingHeadCOT (presumably chain-of-thought grading) with the
# same prompt and 1-5 range; no references and no position_debias_strategy
# are passed here.
eval3 = fastrepl.Evaluator(
    pipeline=[
        fastrepl.LLMGradingHeadCOT(
            model="gpt-3.5-turbo",
            context="You will get a input text from Yelp review. Grade user's satisfaction from 1 to 5.",
            number_from=1,
            number_to=5,
        )
    ]
)

# Grade the sampled reviews, then coerce every prediction to a float.
# NOTE: to_number turns a None prediction into 0, and that 0 still enters the
# metrics below, so ungraded samples pull the scores down.
result3 = fastrepl.LocalRunner(dataset=dataset, evaluator=eval3).run()
result3 = result3.map(to_number)

# Compare predictions against the dataset's "reference" column.
for metric_name in ("accuracy", "mse", "mae"):
    print_metric(metric_name, result3["prediction"], result3["reference"])

result3.to_pandas()

Output()

2023-09-04 11:12:05,987 - 11417366528 - _common.py-_common:105 - INFO: Backing off completion(...) for 1.5s (fastrepl.llm.RetryConstantException)


2023-09-04 11:12:14,681 - 11299581952 - _common.py-_common:105 - INFO: Backing off completion(...) for 2.0s (fastrepl.llm.RetryConstantException)


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

None detected, setting to 0
None detected, setting to 0
accuracy: 0.6
mse: 0.5
mae: 0.43333333333333335


Unnamed: 0,input,reference,prediction
0,Stayed at the Wm Penn down the street for a we...,5,4.0
1,We tried out the lunch specials and found them...,4,4.0
2,Should have known better than to eat in a plac...,1,1.0
3,This place has sure changed...and not for the ...,2,2.0
4,I've been bringing my son to the owner Michael...,5,5.0
5,The trip to the location takes two busses and ...,1,1.0
6,This restaurant was suggested to me by a frien...,4,5.0
7,This place is unique because you are sitting o...,3,3.0
8,"Whew, $10 for a whiskey ginger ale and $18 for...",3,3.0
9,"Loove me some Pei Wei! Boyfriend isn't a fan, ...",4,5.0
