# Big Bench

In [None]:
!pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz" -q

In [None]:
!pip install dspy datasets python-Levenshtein -q

In [3]:
import os
import pandas as pd
from datasets import load_dataset
import dspy
import Levenshtein

In [4]:
# Fetch the BIG-bench "crash_blossom" task and take its "default" split.
ds = load_dataset("google/bigbench", "crash_blossom")["default"]

# Wrap every record as a DSPy example: question and options are inputs,
# while the answer (the record's list of targets) is the label.
examples = [
    dspy.Example(
        {
            "question": row["inputs"],
            "options": row["multiple_choice_targets"],
            "answer": row["targets"],
        }
    ).with_inputs("question", "options")
    for row in ds
]
print(f"There are {len(examples)} examples.")

# The first three examples become the few-shot pool; the remainder is held out for evaluation.
trainset = examples[:3]
valset = examples[3:]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/20.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/99.8k [00:00<?, ?B/s]

Generating default split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

There are 38 examples.


In [5]:
# Display the first example to check its fields and input keys.
examples[0]

Example({'question': 'Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines.\n\nIn the following sentence, what part of speech is stab? Sentence: St Jude Ambulance to teach youth to help stab victims\nA:', 'options': ['verb', 'adjective', 'noun', 'preposition'], 'answer': ['adjective']}) (input_keys={'options', 'question'})

In [6]:
# Create the GPT-4o client and register it as the language model in DSPy's settings.
# NOTE(review): presumably the OpenAI API key is read from the environment — confirm.
lm = dspy.OpenAI(model="gpt-4o")
dspy.settings.configure(lm=lm)

In [7]:
# Zero-shot sanity check: pass the raw question text of the first example directly to the model.
lm(examples[0].question)

['In the sentence "St Jude Ambulance to teach youth to help stab victims," the word "stab" is used as an adjective. It describes the type of victims that the youth will be taught to help, specifically those who have been stabbed.']

In [8]:
def custom_format_handler(value):
    """Format a field value for the prompt.

    Lists are flattened into one string with their items separated by ' | ';
    any other value is returned unchanged.
    """
    if not isinstance(value, list):
        return value
    return ' | '.join(str(item) for item in value)

In [9]:
# Signature for the multiple-choice task: `question` and `options` go in, `answer` comes out.
# NOTE(review): documented with comments rather than a docstring because DSPy appears to use a
# Signature's docstring as prompt instructions — confirm before adding one.
class CoTSignature(dspy.Signature):
    question = dspy.InputField()
    # Both fields below pass custom_format_handler as their `format` argument.
    options = dspy.InputField(format=custom_format_handler)
    answer = dspy.OutputField(format=custom_format_handler)

In [10]:
class CoT(dspy.Module):
    """Chain-of-thought program that picks an answer for a multiple-choice question."""

    def __init__(self):
        """Build a ChainOfThought predictor over CoTSignature."""
        super().__init__()
        self.signature = CoTSignature
        self.predictor = dspy.ChainOfThought(self.signature)

    def forward(self, question, options):
        """Run the predictor and return only its answer.

        Args:
            question: The question text shown to the model.
            options: The candidate answers for the question.

        Returns:
            A dspy.Prediction whose `answer` field holds the predictor's answer.
        """
        # The language model is resolved from dspy.settings instead of being read from a
        # notebook-level global, so the module has no hidden dependency on a variable
        # named `lm` and honours whichever model is currently configured.
        result = self.predictor(question=question, options=options)
        return dspy.Prediction(answer=result.answer)

In [11]:
# Smoke-test the unoptimized program on the first example.
cot_module = CoT()
test_q = examples[0]
# Invoke the module itself rather than calling .forward() directly, so the call goes
# through the module's __call__ entry point.
prediction = cot_module(question=test_q.question, options=test_q.options)
print(f"Sense: {prediction.answer}")

Sense: adjective


In [12]:
def wsd_metric(example, pred, trace=None, threshold=0.90):
    """Score a predicted label against the gold label using Levenshtein similarity.

    Args:
        example: Object whose `answer` is the gold label, either a string or a
            list/tuple whose first item is the gold label.
        pred: Object whose `answer` is the predicted label.
        trace: Unused; accepted so the function matches the metric call signature.
        threshold: Minimum similarity ratio (inclusive) that counts as a match.

    Returns:
        1 if the case-insensitive similarity reaches `threshold`, otherwise 0.
        A missing or non-string gold or predicted label scores 0 instead of raising.
    """
    gold_sense = example.answer
    if isinstance(gold_sense, (list, tuple)):
        # An empty target list has no gold label to compare against.
        gold_sense = gold_sense[0] if gold_sense else None
    predicted_sense = getattr(pred, "answer", None)

    # A missing or malformed label cannot match; score it as wrong rather than crash the run.
    if not isinstance(gold_sense, str) or not isinstance(predicted_sense, str):
        return 0

    # Surrounding whitespace is not part of the label; left in place it can push a
    # correct short answer (e.g. "verb\n") below the threshold.
    similarity = Levenshtein.ratio(
        gold_sense.strip().lower(), predicted_sense.strip().lower()
    )

    return int(similarity >= threshold)


In [13]:
from dspy.teleprompt import BootstrapFewShot

# Set up the optimizer: max_bootstrapped_demos=0 requests no self-generated demos, and max_labeled_demos=3 allows up to three labeled ones.
config = dict(max_bootstrapped_demos=0, max_labeled_demos=3)

# Optimize! `wsd_metric` is passed as the metric here. In general, the metric is going to tell the optimizer how well it's doing.
teleprompter = BootstrapFewShot(metric=wsd_metric, **config)
optimized_cot = teleprompter.compile(CoT(), trainset=trainset)

  0%|          | 0/3 [00:00<?, ?it/s]

Bootstrapped 0 full traces after 1 examples in round 0.





In [14]:
# Run the compiled program on the same example, invoking the module itself rather than .forward().
optimized_cot(question=test_q.question, options=test_q.options)

Prediction(
    answer='adjective'
)

In [20]:
from dspy.evaluate import Evaluate

# Reusable evaluator over the held-out examples: two threads, a progress bar,
# a five-row preview table, and the per-example outputs returned with the score.
evaluator = Evaluate(
    devset=valset,
    num_threads=2,
    display_progress=True,
    display_table=5,
    return_outputs=True,
)

# Score the compiled program with the similarity metric.
evaluation_score, outputs = evaluator(optimized_cot, metric=wsd_metric)

Average Metric: 23 / 35  (65.7): 100%|██████████| 35/35 [00:00<00:00, 703.56it/s]

Average Metric: 23 / 35  (65.7%)





Unnamed: 0,question,options,example_answer,pred_answer,wsd_metric
0,"Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines. In the following sentence, what part of...","['verb', 'adjective', 'noun', 'preposition']",['adjective'],adjective,1
1,"Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines. In the following sentence, what part of...","['verb', 'adjective', 'noun', 'preposition']",['noun'],verb,0
2,"Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines. In the following sentence, what part of...","['verb', 'adjective', 'noun', 'preposition']",['noun'],verb,0
3,"Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines. In the following sentence, what part of...","['verb', 'adjective', 'noun', 'preposition']",['verb'],verb,1
4,"Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines. In the following sentence, what part of...","['verb', 'adjective', 'noun', 'preposition']",['noun'],adjective,0


In [21]:
# Tabulate the per-example evaluation outputs.
# NOTE(review): this rebinds `outputs` from the evaluator's return value to a DataFrame.
outputs = pd.DataFrame(outputs)

In [22]:
# Look at the first column of the first row.
outputs.iloc[0,0]

Example({'question': 'Identify the part of speech (verb, adjective, noun, or preposition) of the specified word in the following headlines.\n\nIn the following sentence, what part of speech is stocks? Sentence: Fish stocks surge\nA:', 'options': ['verb', 'adjective', 'noun', 'preposition'], 'answer': ['noun']}) (input_keys={'options', 'question'})

In [23]:
# Give the three columns readable names (this mutates the frame in place).
# NOTE(review): assumes each row is (example, prediction, metric score) — confirm.
outputs.columns = ['problem','response','result']

In [24]:
# Write the results to a JSON file with one record per row.
outputs.to_json('crash_blossom_GPT_4o_3_shot_results.json',orient='records')