# Big Bench

In [None]:
!pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz" -q

In [None]:
!pip install dspy datasets python-Levenshtein -q

In [98]:
import os
import pandas as pd
from datasets import load_dataset
import dspy
import Levenshtein
import random
from openai import OpenAI
# Instantiate a client from the `openai` package. No other cell visible in this
# notebook references `client`; the DSPy calls below go through `dspy.OpenAI`.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()

In [358]:
# Load the BIG-bench "conceptual_combinations" task and wrap every row in a
# dspy.Example whose inputs are the question text and the candidate options.
SPLIT_SEED = 0  # fixed so the train/validation split is identical on every run

ds = load_dataset("google/bigbench", "conceptual_combinations")["default"]
examples = [
    dspy.Example(
        {
            "question": row["inputs"],
            "options": row["multiple_choice_targets"],
            "answer": row["targets"],
        }
    ).with_inputs("question", "options")
    for row in ds
]
print(f"There are {len(examples)} examples.")

# Draw the 3 few-shot examples from a dedicated, seeded generator: the unseeded
# global `random` state would pick a different trainset after each kernel
# restart, making the reported score impossible to reproduce.
trainset = random.Random(SPLIT_SEED).sample(examples, 3)
# Everything that is not a training example is held out for evaluation.
valset = [example for example in examples if example not in trainset]

There are 103 examples.


In [359]:
# Peek at one wrapped example to check the question / options / answer fields.
examples[1]

Example({'question': "The word 'mimsy' is an adjective for things that are very fragile. Question: Which of the following sentences best characterizes mimsy bulls?\n option: Mimsy bulls are very popular in rodeos.\n option: Mimsy bulls make great pets.\n option: Mimsy bulls are typically very young or very ill.\n option: Mimsy bulls break into a thousand pieces when dropped.\nAnswer:", 'options': ['Mimsy bulls are typically very young or very ill.', 'Mimsy bulls are very popular in rodeos.', 'Mimsy bulls make great pets.', 'Mimsy bulls break into a thousand pieces when dropped.'], 'answer': ['Mimsy bulls are typically very young or very ill.']}) (input_keys={'question', 'options'})

In [360]:
# Model identifier used for every DSPy call in this notebook.
MODEL_NAME = "gpt-4o-2024-05-13"

# Build the language-model client and register it as the DSPy default.
lm = dspy.OpenAI(model=MODEL_NAME)
dspy.settings.configure(lm=lm)

In [361]:
def custom_format_handler(value):
    """Render a signature field value for the prompt.

    A list is flattened into one string with its items (converted via ``str``)
    separated by ``' | '``. Any other value is returned unchanged.
    """
    if not isinstance(value, list):
        return value
    return ' | '.join(str(item) for item in value)

In [364]:
class CoTSignature(dspy.Signature):
    """Only produce the answer."""
    # The docstring above is sent to the model as the task instruction (it shows
    # up verbatim at the top of the prompt printed by lm.inspect_history), so
    # edit it only when the instruction itself should change.
    # Inputs: the question text and the list of candidate answers. The list is
    # flattened to one ' | '-separated string by custom_format_handler.
    question = dspy.InputField()
    options = dspy.InputField(format=custom_format_handler)
    # Output: the chosen answer; the same formatter is attached to this field.
    answer = dspy.OutputField(format=custom_format_handler)

In [365]:
# Plain predictor over the same signature, used below to compare against the
# ChainOfThought-based CoT module.
simple_predict = dspy.Predict(CoTSignature)

In [366]:
class CoT(dspy.Module):
    """DSPy module that answers a question by running ChainOfThought over CoTSignature."""

    def __init__(self):
        super().__init__()
        self.signature = CoTSignature
        self.predictor = dspy.ChainOfThought(self.signature)

    def forward(self, question, options):
        """Run the predictor and return a Prediction holding only its ``answer``."""
        # `lm` is the notebook-level language model object, passed through explicitly.
        completion = self.predictor(lm=lm, question=question, options=options)
        return dspy.Prediction(answer=completion.answer)

In [367]:
# Smoke-test the un-optimized module on a single example before compiling it.
cot_module = CoT()
test_q = examples[74]
prediction = cot_module.forward(question=test_q.question, options=test_q.options)
# Print the answer text the module produced for that example.
print(f"Sense: {prediction.answer}")

Sense: Flags hang on suphe poles.


In [368]:
# Same example through the plain predictor, for comparison with the CoT module.
simple_predict(question=test_q.question, options=test_q.options)

Prediction(
    answer='Flags hang on suphe poles.'
)

In [369]:
def wsd_metric(example, pred, trace=None, threshold=0.90):
    """Score a prediction by fuzzy string match against the first gold answer.

    Args:
        example: Object whose ``answer`` attribute is a sequence of gold answer
            strings; only the first entry is compared.
        pred: Object whose ``answer`` attribute is the predicted answer string.
        trace: Unused; accepted so the function can be passed as a DSPy metric.
        threshold: Minimum case-insensitive Levenshtein ratio that counts as
            a match.

    Returns:
        1 if the similarity is at least ``threshold``, otherwise 0. A missing or
        non-string prediction, or an empty gold answer list, scores 0.
    """
    gold_answers = example.answer
    predicted_answer = getattr(pred, "answer", None)

    # A malformed or missing field should count as a miss, not raise and
    # interrupt the optimizer or the evaluation run.
    if not gold_answers or not isinstance(predicted_answer, str):
        return 0

    gold_answer = gold_answers[0]

    # Case-insensitive similarity in [0, 1]; tolerates small formatting drift.
    similarity = Levenshtein.ratio(gold_answer.lower(), predicted_answer.lower())

    return int(similarity >= threshold)


In [370]:
from dspy.teleprompt import BootstrapFewShot

# Set up the optimizer. With max_bootstrapped_demos=0 no demonstrations are
# self-generated ("bootstrapped"); up to 3 labeled examples are used as few-shot demos instead.
config = dict(max_bootstrapped_demos=0, max_labeled_demos=3)

# Optimize! `wsd_metric` is the metric here. In general, the metric is going to tell the optimizer how well it's doing.
teleprompter = BootstrapFewShot(metric=wsd_metric, **config)
optimized_cot = teleprompter.compile(CoT(), trainset=trainset)

  0%|          | 0/3 [00:00<?, ?it/s]

Bootstrapped 0 full traces after 1 examples in round 0.





In [371]:
# Run the compiled (few-shot) module on the same example used for the smoke test.
optimized_cot.forward(question=test_q.question, options=test_q.options)

Prediction(
    answer='Flags hang on suphe poles.'
)

In [372]:
# Show the most recent prompt sent through `lm`, to check the instruction and
# the few-shot demos that the compiled module adds.
lm.inspect_history(1)





Only produce the answer.

---

Question: Concept: boulder-sized diamond. Question: Which of the following sentences best characterizes boulder-sized diamonds? option: Boulder-sized diamonds are diamond-shaped rocks. option: Boulder-sized diamonds would be extremely valuable. option: Boulder-sized diamonds are desirable in engagement rings. option: Boulder-sized diamonds are found on ski hills. Answer:
Options: Boulder-sized diamonds would be extremely valuable. | Boulder-sized diamonds are desirable in engagement rings. | Boulder-sized diamonds are diamond-shaped rocks. | Boulder-sized diamonds are found on ski hills.
Answer: Boulder-sized diamonds would be extremely valuable.

Question: Concept: dirty bowls. Question: Which of the following sentences best characterizes dirty bowls? option: Dirty bowls are delicious. option: Dirty bowls are sticky. option: Dirty bowls are classy. option: Dirty bowls are rusty. Answer:
Options: Dirty bowls are sticky. | Dirty bowls are rusty. | Dirt

In [373]:
# Gold answer for the example used above, to compare with the predictions.
examples[74]['answer']

['Flags hang on suphe poles.']

In [374]:
from dspy.evaluate import Evaluate

# Set up the evaluator over the held-out validation set; it can be re-used in your code.
# return_outputs=True makes the call below return the per-example outputs alongside the score.
evaluator = Evaluate(devset=valset, num_threads=2, display_progress=True, display_table=20,return_outputs=True)

# Launch evaluation, scoring each prediction with wsd_metric.
evaluation_score, outputs = evaluator(optimized_cot, metric=wsd_metric)

Average Metric: 97 / 100  (97.0): 100%|██████████| 100/100 [01:09<00:00,  1.45it/s]

Average Metric: 97 / 100  (97.0%)





Unnamed: 0,question,options,example_answer,pred_answer,wsd_metric
0,Concept: Colourless red surfaces. Question: Which of the following sentences best characterizes colourless red surfaces? option: Colourless red surfaces are good tabletops for kitchens. option:...,"['Colourless red surfaces cannot exist.', 'Colourless red surfaces are both red and green.', 'Colourless red surfaces are good tabletops for kitchens.', 'Colourless red surfaces are...",['Colourless red surfaces cannot exist.'],Colourless red surfaces cannot exist.,1
1,Concept: Sinless sinner. Question: Which of the following sentences best characterizes sinless sinners? option: Sinless sinners are letters of the alphabet. option: Sinless sinners have...,"['No one can be a sinless sinner.', 'Sinless sinners have no clue.', 'Sinless sinners are letters of the alphabet.', 'Sinless sinners can be venial or...",['No one can be a sinless sinner.'],No one can be a sinless sinner.,1
2,Concept: Resounding silence. Question: Which of the following sentences best characterizes resounding silences? option: Resounding silences are literally deafening. option: Resounding silences are quiet. option:...,"['Resounding silences are quiet.', 'Resounding silences are literally deafening.', 'Resounding silences are produced by musical instruments.', 'Resounding silences are a bad omen.']",['Resounding silences are quiet.'],Resounding silences are quiet.,1
3,Concept: Extroverted introvert. Question: Which of the following sentences best characterizes extroverted introverts? option: Extroverted introverts are friendly but like to be alone. option: Extroverted...,"['Extroverted introverts are friendly but like to be alone.', 'Extroverted introverts should never meet strangers in public places.', 'Extroverted introverts are B-list celebrities.', 'Extroverted introverts...",['Extroverted introverts are friendly but like to be alone.'],Extroverted introverts are friendly but like to be alone.,1
4,Concept: Pretty ugly. Question: Which of the following sentences best characterizes pretty ugly people? option: Pretty ugly people are homely. option: Pretty ugly people are...,"['Pretty ugly people are homely.', 'Pretty ugly people are pretty.', 'Pretty ugly people are dangerous.', 'Pretty ugly people are morally corrupt.']",['Pretty ugly people are homely.'],Pretty ugly people are homely.,1
5,Concept: Old news. Question: Which of the following sentences best characterizes old news? option: Old news is a liberal media conspiracy. option: Old news is...,"['Old news is established knowledge.', 'Old news is reporting about old people.', 'Old news is a liberal media conspiracy.', 'New discoveries about the past are...",['Old news is established knowledge.'],Old news is established knowledge.,1
6,Concept: Long shorts. Question: Which of the following sentences best characterizes long shorts? option: Long shorts are risky. option: Long shorts end at the knee....,"['Long shorts end at the knee.', 'Long shorts are made with extra water.', 'Long shorts are too revealing.', 'Long shorts are risky.']",['Long shorts end at the knee.'],Long shorts end at the knee.,1
7,Concept: turbulent peace. Question: Which of the following sentences best characterizes turbulent peaces? option: Turbulent peace is full of harmony. option: Turbulent peace is windy....,"['Turbulent peace is short-lived.', 'Turbulent peace is dangerous for planes.', 'Turbulent peace is windy.', 'Turbulent peace is full of harmony.']",['Turbulent peace is short-lived.'],Turbulent peace is short-lived.,1
8,Concept: confident doubter. Question: Which of the following sentences best characterizes confident doubters? option: Confident doubters are easy to persuade. option: Confident doubters are secure...,"['Confident doubters are secure in their mistrust.', 'Confident doubters are easy to persuade.', 'Confident doubters are good confidants.', 'Confident doubters are debatable.']",['Confident doubters are secure in their mistrust.'],Confident doubters are secure in their mistrust.,1
9,Concept: timeless times. Question: Which of the following sentences best characterizes timeless times? option: The 60's were timeless times. option: Timeless times go by quickly....,"[""The 60's were timeless times."", 'People often run out of timeless times.', 'Timeless times go by quickly.', 'Timeless times are quickly forgotten.']","[""The 60's were timeless times.""]",The 60's were timeless times.,1


In [375]:
# Aggregate score returned by the evaluator.
evaluation_score

97.0

In [376]:
# Convert the per-example evaluator outputs into a DataFrame.
# Note: this rebinds `outputs` from the evaluator's return value to a DataFrame.
outputs = pd.DataFrame(outputs)

In [377]:
# Name the three columns of the results frame (assigned in place).
# NOTE(review): presumably (input example, model prediction, metric value) — confirm the order.
outputs.columns = ['problem','response','result']

In [378]:
# Inspect the first column (the input Example) of row 95.
outputs.iloc[95,0]

Example({'question': "The word 'lopso' is an adjective for people who cannot hear. Question: Which of the following sentences best characterizes lopso composers?\n option: Lopso composers listen to Mozart.\n option: Lopso composers only do blind auditions.\n option: Lopso composers have to imagine the sound of their music.\n option: Lopso composers make electronic dance music.\nAnswer:", 'options': ['Lopso composers have to imagine the sound of their music.', 'Lopso composers only do blind auditions.', 'Lopso composers make electronic dance music.', 'Lopso composers listen to Mozart.'], 'answer': ['Lopso composers have to imagine the sound of their music.']}) (input_keys={'question', 'options'})

In [379]:
# Write the results frame to a JSON file, one record per row (orient='records').
# NOTE(review): the cells hold DSPy objects — verify the file contents serialize as expected.
outputs.to_json('conceptual_combinations_GPT_4o_3_shot_results.json',orient='records')