# Big Bench

In [None]:
!pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz" -q

In [None]:
!pip install dspy datasets python-Levenshtein -q

In [98]:
import os
import pandas as pd
from datasets import load_dataset
import dspy
import Levenshtein
import random
from openai import OpenAI
# NOTE(review): `client` is not referenced by any other cell visible in this notebook
# (the DSPy calls go through `lm`). Confirm it is unused before removing it.
client = OpenAI()

In [262]:
# Load the BIG-bench `language_games` task and wrap every row as a dspy.Example.
# `question` and `options` are marked as inputs; `answer` keeps the row's `targets`.
ds = load_dataset("google/bigbench", "language_games")["default"]
examples = [
    dspy.Example(
        {"question": r["inputs"], "options": r["multiple_choice_targets"], "answer": r["targets"]}
    ).with_inputs("question", "options")
    for r in ds
]
print(f"There are {len(examples)} examples.")

# Draw both splits from a dedicated, seeded RNG so that Restart & Run All always
# selects the same train/validation examples; unseeded sampling made the
# reported score irreproducible.
SPLIT_SEED = 0
N_TRAIN = 3
N_VAL = 200
split_rng = random.Random(SPLIT_SEED)
trainset = split_rng.sample(examples, N_TRAIN)
# `not in` compares by equality, so anything equal to a training example is
# kept out of the validation pool.
valset = split_rng.sample([ex for ex in examples if ex not in trainset], N_VAL)

There are 2128 examples.


In [276]:
# Peek at a single example (index 74) to see the field layout.
examples[74]

Example({'question': 'Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice -> iceyay); if it starts with consonants, you move them to end of the word and add -ay (prank -> ankpray); if it is capitalized, the Pig Latin word is capitalized as well (Maggie -> Aggiemay). Translate Pig Latin into English.\n (Pig Latin) Ethay illbay atyay astlay entway oughthray.\n (English)', 'options': [], 'answer': ['The bill at last went through.']}) (input_keys={'question', 'options'})

In [277]:
# Print only the first field of example 74 (name and value), followed by a separator.
sample_fields = list(dict(examples[74]).items())
for field_name, field_value in sample_fields[:1]:
    print(field_name, field_value)
    print('--'*3)

question Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice -> iceyay); if it starts with consonants, you move them to end of the word and add -ay (prank -> ankpray); if it is capitalized, the Pig Latin word is capitalized as well (Maggie -> Aggiemay). Translate Pig Latin into English.
 (Pig Latin) Ethay illbay atyay astlay entway oughthray.
 (English)
------


In [278]:
# Build the OpenAI-backed LM and point DSPy's global settings at it.
lm_model_name = "gpt-4o-2024-05-13"
lm = dspy.OpenAI(model=lm_model_name)
dspy.settings.configure(lm=lm)

In [279]:
def custom_format_handler(value):
    """Format hook for signature fields.

    A list is flattened into one string: each item is converted with ``str``
    and the items are joined with ``' | '``. Any other value is returned
    unchanged.
    """
    if not isinstance(value, list):
        return value
    return ' | '.join(str(item) for item in value)

In [294]:
class CoTSignature(dspy.Signature):
    """Play a language game. Only produce the answer."""
    # The docstring above is not just documentation: it appears verbatim as the first
    # line of the prompt in the `lm.inspect_history` output, so edit it with care.

    # Single input field; the task instructions and the sentence to translate arrive in it.
    question = dspy.InputField()
    # The `options` input is currently disabled, so the signature declares no such field.
    #options = dspy.InputField(format=custom_format_handler)
    # Output field; `custom_format_handler` joins list values with ' | '.
    answer = dspy.OutputField(format=custom_format_handler)

In [295]:
# Plain `dspy.Predict` over the same signature, used later for a side-by-side check.
simple_predict = dspy.Predict(CoTSignature)

In [296]:
class CoT(dspy.Module):
    """Chain-of-thought program for the language-game task.

    Wraps ``dspy.ChainOfThought`` around ``CoTSignature`` and returns a
    prediction that carries only the final ``answer`` field.
    """

    def __init__(self):
        super().__init__()
        self.signature = CoTSignature
        self.predictor = dspy.ChainOfThought(self.signature)

    def forward(self, question, options=None):
        """Answer ``question`` and return a ``dspy.Prediction`` with an ``answer`` field.

        Args:
            question: Full task prompt (instructions plus the sentence to translate).
            options: Accepted so the module can be called with every input key of
                the dataset examples, but not forwarded to the predictor because
                ``CoTSignature`` declares no ``options`` field.
        """
        # Use the LM configured through dspy.settings rather than capturing the
        # notebook-global `lm`; the module then has no hidden dependency on a
        # variable defined in another cell.
        result = self.predictor(question=question)
        return dspy.Prediction(answer=result.answer)

In [297]:
# Smoke-test the un-optimized program on one example.
cot_module = CoT()
test_q = examples[74]
# Invoke the module itself instead of `.forward(...)`: calling the module is
# DSPy's supported entry point and dispatches to `forward`.
prediction = cot_module(question=test_q.question, options=test_q.options)
# NOTE(review): the "Sense:" label looks like a carry-over from another task;
# the value printed is the predicted answer.
print(f"Sense: {prediction.answer}")

Sense: The bill at last went through.


In [298]:
# Run the plain predictor on the same example; the cell displays the returned Prediction.
simple_inputs = {"question": test_q.question, "options": test_q.options}
simple_predict(**simple_inputs)

Prediction(
    answer='The bill at last went through.'
)

In [299]:
def wsd_metric(example, pred, trace=None, threshold=0.90):
    """Score a prediction by its Levenshtein similarity to the gold answer(s).

    Despite the ``wsd`` name, in this notebook the metric scores the
    language-game translations.

    Args:
        example: Gold example; ``example.answer`` is either a list of accepted
            target strings or a single string.
        pred: Prediction whose ``answer`` attribute is the model output.
        trace: Unused; accepted so the function matches the metric call signature.
        threshold: Minimum case-insensitive ``Levenshtein.ratio`` that counts as a match.

    Returns:
        1 if the prediction is at least ``threshold``-similar to any gold
        target, otherwise 0 (also 0 when the prediction or the gold answer is missing).
    """
    gold = example.answer
    # A bare string must not be indexed: `answer[0]` would be its first character only.
    gold_targets = [gold] if isinstance(gold, str) else list(gold or [])
    predicted = pred.answer
    # A missing prediction or an empty gold list can never match; return 0
    # instead of raising on `None.lower()` or `[][0]`.
    if predicted is None or not gold_targets:
        return 0

    predicted_norm = str(predicted).lower()
    # Any accepted target may match, so keep the best similarity across all of them.
    best_similarity = max(
        Levenshtein.ratio(str(target).lower(), predicted_norm)
        for target in gold_targets
    )
    return int(best_similarity >= threshold)


In [300]:
from dspy.teleprompt import BootstrapFewShot

# Optimizer settings: no bootstrapped (self-generated) demos and at most 3
# labeled demos, i.e. plain 3-shot prompting.
config = {"max_bootstrapped_demos": 0, "max_labeled_demos": 3}

# Compile the CoT program; `wsd_metric` is the metric handed to the optimizer.
teleprompter = BootstrapFewShot(metric=wsd_metric, **config)
student_program = CoT()
optimized_cot = teleprompter.compile(student_program, trainset=trainset)

  0%|          | 0/3 [00:00<?, ?it/s]

Bootstrapped 0 full traces after 1 examples in round 0.





In [301]:
# Run the compiled program on the same test example. Calling the module (rather
# than `.forward(...)` directly) is DSPy's supported entry point.
optimized_cot(question=test_q.question, options=test_q.options)

Prediction(
    answer='The bill at last went through.'
)

In [302]:
# Print the last prompt sent to the LM to check what the compiled program actually asks.
lm.inspect_history(1)





Play a language game. Only produce the answer.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice -> iceyay); if it starts with consonants, you move them to end of the word and add -ay (prank -> ankpray); if it is capitalized, the Pig Latin word is capitalized as well (Maggie -> Aggiemay). Translate Pig Latin into English. (Pig Latin) Omtay entway erserkbay. (English)
Answer: Tom went berserk.

Question: Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice -> iceyay); if it starts with consonants, you move them to end of the word and add -ay (prank -> ankpray); if it is capitalized, the Pig Latin word is capitalized as well (Maggie -> Aggiemay). Translate Pig Latin into English. (Pig

In [303]:
# Gold answer of the same example, for comparison with the prediction.
examples[74]['answer']

['The bill at last went through.']

In [306]:
from dspy.evaluate import Evaluate

# Reusable evaluator over the validation set. With return_outputs=True the call
# below returns the per-example outputs in addition to the aggregate score.
evaluator = Evaluate(
    devset=valset,
    num_threads=2,
    display_progress=True,
    display_table=20,
    return_outputs=True,
)

# Score the compiled program with the similarity metric.
evaluation_score, outputs = evaluator(optimized_cot, metric=wsd_metric)

Average Metric: 191 / 200  (95.5): 100%|██████████| 200/200 [02:29<00:00,  1.34it/s]

Average Metric: 191 / 200  (95.5%)





Unnamed: 0,question,options,example_answer,pred_answer,wsd_metric
0,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['He visited his granny.'],He visited his granny.,1
1,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Aylalay ivedlay ayay oodgay ifelay.'],Aylalay ivedlay ayay oodgay ifelay.,1
2,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Omtay esignedray isthay eekway.'],Omtay esignedray isthay eekway.,1
3,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Ouyay ooklay eengray.'],Ouyay ooklay eengray.,1
4,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Everyoneyay entway otay Australiayay exceptyay Omtay.'],Everyoneyay entway otay Australiayay exceptyay Omtay.,1
5,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Eway avehay ayay ittlelay imetay eforebay ethay extnay aintray.'],Eway avehay ayay ittlelay imetay eforebay ethay extnay aintray.,1
6,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Tom is trying to figure that out right now.'],Tom is trying to figure that out right now.,1
7,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Tom might be in trouble.'],Tom might be in trouble.,1
8,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Eshay advisedyay imhay otnay otay endspay allyay ishay oneymay onyay ishay irlfriendgay.'],Eshay advisedyay imhay otnay otay endspay allyay ishay oneymay onyay ishay irlfriendgay.,1
9,"Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice...",[],['Eshay ikeslay otay umiliatehay eoplepay.'],Eshay ikeslay otay umiliatehay eoplepay.,1


In [307]:
# Aggregate score returned by the evaluator (the recorded run shows it as a percentage).
evaluation_score

95.5

In [308]:
# Turn the per-example evaluator outputs into a DataFrame.
# NOTE(review): this rebinds `outputs` from the evaluator's return value to a DataFrame.
outputs = pd.DataFrame(outputs)

In [309]:
# Name the three columns. Column 0 ('problem') holds the dspy Example; presumably
# 'response' is the prediction and 'result' the metric value (TODO confirm).
outputs.columns = ['problem','response','result']

In [310]:
# Inspect the value stored in row 95, column 0 of the results table.
outputs.iloc[95,0]

Example({'question': 'Pig Latin looks just like English, but if an English word starts with a vowel you add -yay at the end of the word (ice -> iceyay); if it starts with consonants, you move them to end of the word and add -ay (prank -> ankpray); if it is capitalized, the Pig Latin word is capitalized as well (Maggie -> Aggiemay). Translate English into Pig Latin.\n (English) Do you really want to wait for Tom?\n (Pig Latin)', 'options': [], 'answer': ['Oday ouyay eallyray antway otay aitway orfay Omtay?']}) (input_keys={'question', 'options'})

In [311]:
# Persist the evaluation table as JSON, one record per row.
results_path = 'language_games_GPT_4o_3_shot_results.json'
outputs.to_json(results_path, orient='records')