# Big Bench

In [None]:
!pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz" -q

In [None]:
!pip install dspy datasets python-Levenshtein -q

In [98]:
import os
import pandas as pd
from datasets import load_dataset
import dspy
import Levenshtein
import random
from openai import OpenAI
client = OpenAI()

In [314]:
ds = load_dataset("google/bigbench", "cryptonite")["default"]
examples = [dspy.Example({"question": r["inputs"], "options": r["multiple_choice_targets"], "answer":r['targets']}).with_inputs("question","options") for r in ds]
print(f"There are {len(examples)} examples.")
trainset = random.sample(examples, 3)
valset = random.sample([i for i in examples if i not in trainset],200)

Generating default split:   0%|          | 0/26157 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20926 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5231 [00:00<?, ? examples/s]

There are 26157 examples.


In [315]:
examples[74]

Example({'question': 'Cryptic crossword clue: overact, twirling new sari, greeting hindu sage (9)\nAnswer:', 'options': [], 'answer': ['maharishi']}) (input_keys={'question', 'options'})

In [316]:
lm = dspy.OpenAI(model="gpt-4o-2024-05-13")
dspy.settings.configure(lm=lm)

In [317]:
def custom_format_handler(value):
    if isinstance(value, list):
        return ' | '.join(map(str, value))
        #return value[-1]
    return value

In [318]:
class CoTSignature(dspy.Signature):
    """Play a language game. Only produce the answer."""
    question = dspy.InputField()
    #options = dspy.InputField(format=custom_format_handler)
    answer = dspy.OutputField(format=custom_format_handler)

In [319]:
simple_predict = dspy.Predict(CoTSignature)

In [320]:
class CoT(dspy.Module):
    def __init__(self):
        super().__init__()
        self.signature = CoTSignature
        self.predictor = dspy.ChainOfThought(self.signature)
    def forward(self, question, options):
      result = self.predictor(lm=lm,question=question, options=options)
      return dspy.Prediction(answer=result.answer)

In [321]:
cot_module = CoT()
test_q = examples[74]
prediction = cot_module.forward(question=test_q.question, options=test_q.options)
print(f"Sense: {prediction.answer}")

Sense: RISHIS


In [322]:
simple_predict(question=test_q.question, options=test_q.options)

Prediction(
    answer='Question: Cryptic crossword clue: overact, twirling new sari, greeting hindu sage (9) Answer:\nAnswer: Avatarist'
)

In [323]:
def wsd_metric(example, pred, trace=None, threshold=0.90):
    """Metric function for word sense disambiguation with Levenshtein distance."""
    gold_sense = example.answer[0]
    predicted_sense = pred.answer

    # Calculate Levenshtein similarity
    similarity = Levenshtein.ratio(gold_sense.lower(), predicted_sense.lower())

    # Check if similarity exceeds the threshold
    return int(similarity >= threshold)


In [324]:
from dspy.teleprompt import BootstrapFewShot

# Set up the optimizer: we want to "bootstrap" (i.e., self-generate) 4-shot examples of our CoT program.
config = dict(max_bootstrapped_demos=0, max_labeled_demos=3)

# Optimize! Use the `gsm8k_metric` here. In general, the metric is going to tell the optimizer how well it's doing.
teleprompter = BootstrapFewShot(metric=wsd_metric, **config)
optimized_cot = teleprompter.compile(CoT(), trainset=trainset)

  0%|          | 0/3 [00:00<?, ?it/s]

Bootstrapped 0 full traces after 1 examples in round 0.





In [325]:
optimized_cot.forward(question=test_q.question, options=test_q.options)

Prediction(
    answer='rishis'
)

In [326]:
lm.inspect_history(1)





Play a language game. Only produce the answer.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: Cryptic crossword clue: raise belting bottom of bad waif? not in liberal establishment (5,6) Answer:
Answer: broad church

---

Question: Cryptic crossword clue: trumps higher, with amazing fervour, and rising with sudden emphasis (9) Answer:
Answer: overruffs

---

Question: Cryptic crossword clue: does view oddly cut off blossom? (7) Answer:
Answer: develop

---

Question: Cryptic crossword clue: overact, twirling new sari, greeting hindu sage (9) Answer:
Reasoning: Let's think step by step in order to[32m produce the answer. We need to identify the definition and wordplay in the clue. The definition is likely "Hindu sage." The wordplay involves an anagram (indicated by "twirling") of "new sari" plus a greeting. "New sari" anagrammed gives us "RISHI" and adding a co

In [327]:
examples[74]['answer']

['maharishi']

In [328]:
from dspy.evaluate import Evaluate

# Set up the evaluator, which can be re-used in your code.
evaluator = Evaluate(devset=valset, num_threads=2, display_progress=True, display_table=20,return_outputs=True)

# Launch evaluation.
evaluation_score, outputs = evaluator(optimized_cot, metric=wsd_metric)

Average Metric: 63 / 200  (31.5): 100%|██████████| 200/200 [03:37<00:00,  1.09s/it]

Average Metric: 63 / 200  (31.5%)





Unnamed: 0,question,options,example_answer,pred_answer,wsd_metric
0,Cryptic crossword clue: channel flowing over a gold coin once (5) Answer:,[],['ducat'],rival,0
1,"Cryptic crossword clue: smart, deleting line in permit? the opposite (4,5) Answer:",[],['vice versa'],Answer: play dumb,0
2,Cryptic crossword clue: urges simple reforms covering university -- and succeeded (8) Answer:,[],['impulses'],impulses,1
3,"Cryptic crossword clue: not a good place for harold, as things turned out (8) Answer:",[],['hastings'],Hastings,1
4,"Cryptic crossword clue: b & b abroad admits soak, making false claim (10) Answer:",[],['pretension'],braggadocio,0
5,Cryptic crossword clue: mother of lorraine or nancy lake (4) Answer:,[],['mere'],Anne,0
6,"Cryptic crossword clue: cover band is after new hit when on the road (5,5) Answer:",[],['third party'],cover version,0
7,"Cryptic crossword clue: heading for ditch, pal knocked from top of horse -- faller in effect? (6) Answer:",[],['domino'],delphi,0
8,Cryptic crossword clue: source of oil full of energy? (5) Answer:,[],['olive'],olive,1
9,Cryptic crossword clue: maintains a rising pressure (7) Answer:,[],['asserts'],asserts,1


In [329]:
evaluation_score

31.5

In [330]:
outputs = pd.DataFrame(outputs)

In [331]:
outputs.columns = ['problem','response','result']

In [332]:
outputs.iloc[95,0]

Example({'question': 'Cryptic crossword clue: gruff ace grabbed by hack maybe (6)\nAnswer:', 'options': [], 'answer': ['hoarse']}) (input_keys={'question', 'options'})

In [333]:
outputs.to_json('cryptonite_GPT_4o_3_shot_results.json',orient='records')