In [30]:
import dspy
import copy
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, MIPRO
from dspy.evaluate import Evaluate
from dspy.teleprompt.ensemble import Ensemble
from datasets import load_dataset
import random
import re
import os

# Seed Python's global RNG: later cells call random.random() to decide which slot the
# better completion is placed in, and random.shuffle() before the train/valid/test split.
random.seed(0)

In [31]:
# Thread count reused by the evaluator and the optimizers defined further down.
NUM_THREADS = 8

# Client for the `meta-llama/Meta-Llama-3-8B` checkpoint served at http://localhost:8080
# (presumably a locally running vLLM server -- confirm it is up before running).
lm = dspy.HFClientVLLM(
    model="meta-llama/Meta-Llama-3-8B",
    url="http://localhost",
    port=8080,
)
# Register the client as the default LM for every DSPy module in this notebook.
dspy.settings.configure(lm=lm)

# Dataset

* Create a dataset using the dspy.Example class
* We will use the UltraFeedback dataset (`openbmb/UltraFeedback`); each entry has:
    * 1 instruction
    * 4 possible completions
    * all of them are rated by gpt4
* Our goal is to find a prompt that gets the best possible reward model (RM) out of Llama 3
* We will create a train, valid, and test dataset
* We will evaluate our model using exact match

In [32]:
# Cap, in characters (not tokens), on the instruction and on each response, so that
# few-shot prompts stay inside Llama 3's small context window.
MAX_FIELD_CHARS = 1524
# Stop scanning the dataset once more than this many examples have been collected.
MAX_EXAMPLES = 2000
# Sizes of the held-out splits; whatever remains after them is the training set.
NUM_VALID = 100
NUM_TEST = 500

dataset = load_dataset("openbmb/UltraFeedback")
all_data = []
for row in dataset["train"]:
    # Rank this instruction's completions from highest to lowest overall score.
    ranked = sorted(
        row["completions"], key=lambda c: c["overall_score"], reverse=True
    )
    if not ranked:
        continue

    # Only the top- and bottom-ranked completions form the preference pair.
    chosen, rejected = ranked[0], ranked[-1]
    if chosen["overall_score"] == rejected["overall_score"]:
        # No preference signal (this also covers rows with a single completion).
        continue

    # Randomise which slot holds the better answer so the label is not positional.
    # The RNG is consumed before the length filter on purpose: that is the order the
    # original cell used, so the seeded sequence (and the splits) stay the same.
    if random.random() < 0.5:
        text1, text2, preference = chosen["response"], rejected["response"], "1"
    else:
        text1, text2, preference = rejected["response"], chosen["response"], "2"

    if any(
        len(field) > MAX_FIELD_CHARS
        for field in (text1, text2, row["instruction"])
    ):
        continue

    all_data.append(
        dspy.Example(
            instruction=row["instruction"],
            text_1=text1,
            text_2=text2,
            preferred_text=preference,
        ).with_inputs("instruction", "text_1", "text_2")
    )
    if len(all_data) > MAX_EXAMPLES:
        break

random.shuffle(all_data)

valid = all_data[:NUM_VALID]
test = all_data[NUM_VALID:NUM_VALID + NUM_TEST]
train = all_data[NUM_VALID + NUM_TEST:]

  table = cls._concat_blocks(blocks, axis=0)
Average Metric: 46 / 72  (63.9):  72%|███████▏  | 72/100 [00:16<00:02, 13.00it/s]

In [33]:
def extract_pref(example, pred):
    """Pull the predicted preference label out of a model prediction.

    Args:
        example: The gold example. Unused; kept so the signature mirrors the metric's.
        pred: Mapping-like prediction whose ``"preferred_text"`` entry holds the raw
            completion text.

    Returns:
        The first run of digits found in the completion, as a string (e.g. ``"2"``),
        or ``None`` when the completion is not a string or contains no digit.
    """
    raw = pred["preferred_text"]
    # A failed or empty generation can leave the field as None; re.search would raise
    # TypeError on that, so any non-string value is treated as "no answer".
    if not isinstance(raw, str):
        return None
    match = re.search(r"\d+", raw)
    return match.group() if match else None


def em_metric(example, pred, trace=None, frac=1.0, verbose=False):
    """Exact-match metric: does the parsed prediction equal the gold label?

    Args:
        example: Gold example; its ``"preferred_text"`` entry is the label.
        pred: Model prediction, parsed with ``extract_pref``.
        trace: Unused here; accepted so the function can be called with a trace
            argument (presumably by DSPy optimizers -- confirm).
        frac: Unused; kept for backward compatibility with existing callers.
        verbose: When true, print the gold label and the parsed prediction.

    Returns:
        ``True`` when the parsed prediction equals the gold label, ``False``
        otherwise, including when no label could be parsed from the prediction.
    """
    gold = example["preferred_text"]
    pref_pred = extract_pref(example, pred)
    if verbose:
        print(f"Example: {gold}")
        print(f"Prediction: {pref_pred}")
    # `==` always yields a bool, so a missing prediction has to be detected on the
    # parsed value itself rather than on the comparison result.
    if pref_pred is None:
        return False
    return gold == pref_pred

    


# Shared evaluator: scores a program on the held-out `test` split with the
# exact-match metric, showing a progress bar while it runs.
eval_fn = Evaluate(
    devset=test,
    metric=em_metric,
    num_threads=NUM_THREADS,
    display_progress=True,
)

In [34]:
# DSPy signature for pairwise preference judging: given an instruction and two
# candidate responses, the LM has to name the better one.
# NOTE: documented with `#` comments on purpose. DSPy uses a Signature's docstring as
# the prompt instructions, so adding a docstring here would change the prompt that is
# sent to the model. Field order also determines the order in the rendered prompt.
class Preference(dspy.Signature):
    instruction = dspy.InputField()  # the task both responses are answering
    text_1 = dspy.InputField()  # candidate response rendered first in the prompt
    text_2 = dspy.InputField()  # candidate response rendered second in the prompt
    # Expected to come back as "1" or "2"; the explicit prefix is the label the
    # completion is generated after.
    preferred_text = dspy.OutputField(desc="Return the preferred text (1 or 2)", prefix="preferred_text:")

class PrefPredict(dspy.Module):
    """Single-call preference judge built on the ``Preference`` signature."""

    def __init__(self):
        super().__init__()
        # A bare "1"/"2" answer needs only a few tokens; the low temperature keeps
        # the choice close to deterministic.
        predictor_settings = {"max_tokens": 3, "temperature": 0.1}
        self.preference = dspy.Predict(Preference, **predictor_settings)

    def forward(self, instruction, text_1, text_2, *args, **kwargs):
        """Run the predictor on one (instruction, text_1, text_2) triple.

        Any extra positional or keyword arguments are accepted and ignored.
        Returns whatever the underlying predictor returns.
        """
        return self.preference(
            instruction=instruction, text_1=text_1, text_2=text_2
        )

In [35]:
# Smoke test on one training example. Only the declared input fields are unpacked, so
# the gold `preferred_text` label is never handed to the module under test.
pred = PrefPredict()(**train[0].inputs())
print(pred)
em_metric(train[0], pred, verbose=True)

Prediction(
    preferred_text='2'
)
Example: 2
Prediction: 2


True

In [36]:
# Show the most recent prompt/completion exchanges sent through `lm` (at most 3) to
# check how the signature is rendered into a prompt.
lm.inspect_history(n=3)





Given the fields `instruction`, `text_1`, `text_2`, produce the fields `preferred_text`.

---

Follow the following format.

Instruction: ${instruction}

Text 1: ${text_1}

Text 2: ${text_2}

preferred_text: Return the preferred text (1 or 2)

---

Instruction: Given a passage, rewrite it in the present tense. Yesterday I went to the grocery store to buy some vegetables.

Text 1: I am going to the grocery store to buy some vegetables.

Text 2: Today I go to the grocery store to buy some vegetables.

preferred_text:[32m 2

[0m





In [37]:
# Zero-shot baseline: the unoptimized predictor scored on the test split.
eval_fn(PrefPredict())



Average Metric: 225 / 500  (45.0): 100%|██████████| 500/500 [00:05<00:00, 90.73it/s]

Average Metric: 225 / 500  (45.0%)





45.0

# Bootstrapping Few-Shot Examples with Random Search

In [38]:
# Random search over few-shot demo sets: up to 4 bootstrapped and 4 labeled demos per
# candidate, 12 candidate programs requested, scored with the exact-match metric.
boot_fs = BootstrapFewShotWithRandomSearch(
    metric=em_metric,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    max_rounds=1,
    num_candidate_programs=12,
    num_threads=NUM_THREADS,
)

# Demos are drawn from `train`; each candidate is scored on `valid`.
preference_model = boot_fs.compile(
    PrefPredict(),
    trainset=train,
    valset=valid,
)

Going to sample between 1 and 4 traces per predictor.
Will attempt to train 12 candidate sets.




Average Metric: 42 / 100  (42.0): 100%|██████████| 100/100 [00:01<00:00, 97.18it/s]


Average Metric: 42 / 100  (42.0%)
Score: 42.0 for set: [0]
New best score: 42.0 for seed -3
Scores so far: [42.0]
Best score: 42.0


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:07<00:00, 12.63it/s]


Average Metric: 60 / 100  (60.0%)
Score: 60.0 for set: [4]
New best score: 60.0 for seed -2
Scores so far: [42.0, 60.0]
Best score: 60.0


  0%|          | 6/1401 [00:00<02:12, 10.54it/s]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:04<00:00, 20.31it/s]


Average Metric: 54 / 100  (54.0%)
Score: 54.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0]
Best score: 60.0
Average of max per entry across top 1 scores: 0.6
Average of max per entry across top 2 scores: 0.78
Average of max per entry across top 3 scores: 0.86
Average of max per entry across top 5 scores: 0.86
Average of max per entry across top 8 scores: 0.86
Average of max per entry across top 9999 scores: 0.86


  0%|          | 5/1401 [00:00<01:55, 12.11it/s]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:07<00:00, 14.10it/s]


Average Metric: 56 / 100  (56.0%)
Score: 56.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0]
Best score: 60.0
Average of max per entry across top 1 scores: 0.6
Average of max per entry across top 2 scores: 0.78
Average of max per entry across top 3 scores: 0.9
Average of max per entry across top 5 scores: 0.94
Average of max per entry across top 8 scores: 0.94
Average of max per entry across top 9999 scores: 0.94


  0%|          | 4/1401 [00:00<02:25,  9.61it/s]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:05<00:00, 19.98it/s]


Average Metric: 61 / 100  (61.0%)
Score: 61.0 for set: [4]
New best score: 61.0 for seed 1
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0]
Best score: 61.0
Average of max per entry across top 1 scores: 0.61
Average of max per entry across top 2 scores: 0.83
Average of max per entry across top 3 scores: 0.93
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:48, 12.93it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:06<00:00, 15.30it/s]


Average Metric: 56 / 100  (56.0%)
Score: 56.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0]
Best score: 61.0
Average of max per entry across top 1 scores: 0.61
Average of max per entry across top 2 scores: 0.83
Average of max per entry across top 3 scores: 0.93
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:38, 14.26it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:08<00:00, 11.82it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
New best score: 63.0 for seed 3
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.85
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:45, 13.23it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 57 / 100  (57.0): 100%|██████████| 100/100 [00:06<00:00, 14.37it/s]


Average Metric: 57 / 100  (57.0%)
Score: 57.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.85
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 7/1401 [00:00<01:39, 14.06it/s]


Bootstrapped 3 full traces after 8 examples in round 0.


Average Metric: 55 / 100  (55.0): 100%|██████████| 100/100 [00:03<00:00, 26.46it/s]


Average Metric: 55 / 100  (55.0%)
Score: 55.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.85
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<02:12, 10.54it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:04<00:00, 20.53it/s]


Average Metric: 59 / 100  (59.0%)
Score: 59.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0, 59.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.85
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:43, 13.51it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 62 / 100  (62.0): 100%|██████████| 100/100 [00:05<00:00, 17.07it/s]


Average Metric: 62 / 100  (62.0%)
Score: 62.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0, 59.0, 62.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.93
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<02:25,  9.62it/s]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:09<00:00, 10.98it/s]


Average Metric: 60 / 100  (60.0%)
Score: 60.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0, 59.0, 62.0, 60.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.93
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|          | 9/1401 [00:00<02:02, 11.37it/s]


Bootstrapped 4 full traces after 10 examples in round 0.


Average Metric: 62 / 100  (62.0): 100%|██████████| 100/100 [00:08<00:00, 11.59it/s]


Average Metric: 62 / 100  (62.0%)
Score: 62.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0, 59.0, 62.0, 60.0, 62.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.9
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<02:01, 11.53it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:06<00:00, 16.31it/s]


Average Metric: 56 / 100  (56.0%)
Score: 56.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0, 59.0, 62.0, 60.0, 62.0, 56.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.9
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 7/1401 [00:00<01:49, 12.70it/s]


Bootstrapped 4 full traces after 8 examples in round 0.


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:08<00:00, 11.56it/s]

Average Metric: 59 / 100  (59.0%)
Score: 59.0 for set: [4]
Scores so far: [42.0, 60.0, 54.0, 56.0, 61.0, 56.0, 63.0, 57.0, 55.0, 59.0, 62.0, 60.0, 62.0, 56.0, 59.0]
Best score: 63.0
Average of max per entry across top 1 scores: 0.63
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.9
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
15 candidate programs found.





In [39]:
# Persist the first three entries of `candidate_programs` (presumably ordered
# best-first -- confirm against the optimizer) so they can be reloaded later.
os.makedirs('prompts', exist_ok=True)
first_three = preference_model.candidate_programs[:3]
for idx in range(len(first_three)):
    # Each entry is a sequence whose last element is the compiled program.
    prog = first_three[idx][-1]
    prog.save(f'prompts/preference_model_{idx}.json')

In [40]:
# Score the program returned by the random-search optimizer on the test split.
eval_fn(preference_model)



Average Metric: 317 / 500  (63.4): 100%|██████████| 500/500 [01:36<00:00,  5.18it/s]

Average Metric: 317 / 500  (63.4%)





63.4

# Using an ensemble

In [41]:
# Unwrap the compiled program from every candidate entry (it is the last element).
programs = [candidate[-1] for candidate in preference_model.candidate_programs]

# Combine the first three candidates and reduce their outputs with dspy.majority.
ensemble_optimizer = Ensemble(reduce_fn=dspy.majority)
ensemble_preference_model = ensemble_optimizer.compile(programs[:3])
eval_fn(ensemble_preference_model)



Average Metric: 320 / 500  (64.0): 100%|██████████| 500/500 [00:58<00:00,  8.52it/s]

Average Metric: 320 / 500  (64.0%)





64.0

# Adding Chain of Thought

In [42]:
class CoTPrefPredict(dspy.Module):
    """Preference judge that wraps the ``Preference`` signature in ``dspy.ChainOfThought``."""

    def __init__(self):
        super().__init__()
        # Unlike PrefPredict, no max_tokens/temperature overrides are passed here.
        self.preference = dspy.ChainOfThought(Preference)

    def forward(self, instruction, text_1, text_2, *args, **kwargs):
        """Run the chain-of-thought predictor on one (instruction, text_1, text_2) triple.

        Any extra positional or keyword arguments are accepted and ignored.
        Returns whatever the underlying predictor returns.
        """
        model_inputs = {
            "instruction": instruction,
            "text_1": text_1,
            "text_2": text_2,
        }
        return self.preference(**model_inputs)

In [43]:
# Reuse the same random-search optimizer (`boot_fs`) for the chain-of-thought variant,
# with the same train/valid splits as before.
cot_preference_model = boot_fs.compile(CoTPrefPredict(), trainset=train, valset=valid)



Average Metric: 24 / 100  (24.0): 100%|██████████| 100/100 [00:23<00:00,  4.33it/s]


Average Metric: 24 / 100  (24.0%)
Score: 24.0 for set: [0]
New best score: 24.0 for seed -3
Scores so far: [24.0]
Best score: 24.0


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:39<00:00,  2.53it/s]


Average Metric: 58 / 100  (58.0%)
Score: 58.0 for set: [4]
New best score: 58.0 for seed -2
Scores so far: [24.0, 58.0]
Best score: 58.0


  1%|          | 8/1401 [00:12<37:07,  1.60s/it]


Bootstrapped 4 full traces after 9 examples in round 0.


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:33<00:00,  2.96it/s]


Average Metric: 54 / 100  (54.0%)
Score: 54.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0]
Best score: 58.0
Average of max per entry across top 1 scores: 0.58
Average of max per entry across top 2 scores: 0.83
Average of max per entry across top 3 scores: 0.88
Average of max per entry across top 5 scores: 0.88
Average of max per entry across top 8 scores: 0.88
Average of max per entry across top 9999 scores: 0.88


  0%|          | 5/1401 [00:09<43:28,  1.87s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 55 / 100  (55.0): 100%|██████████| 100/100 [00:44<00:00,  2.25it/s]


Average Metric: 55 / 100  (55.0%)
Score: 55.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0]
Best score: 58.0
Average of max per entry across top 1 scores: 0.58
Average of max per entry across top 2 scores: 0.82
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.93
Average of max per entry across top 8 scores: 0.93
Average of max per entry across top 9999 scores: 0.93


  0%|          | 4/1401 [00:07<43:43,  1.88s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:42<00:00,  2.33it/s]


Average Metric: 61 / 100  (61.0%)
Score: 61.0 for set: [4]
New best score: 61.0 for seed 1
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0]
Best score: 61.0
Average of max per entry across top 1 scores: 0.61
Average of max per entry across top 2 scores: 0.85
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.97
Average of max per entry across top 9999 scores: 0.97


  0%|          | 1/1401 [00:02<46:59,  2.01s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:44<00:00,  2.23it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
New best score: 68.0 for seed 2
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 2/1401 [00:02<30:21,  1.30s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 57 / 100  (57.0): 100%|██████████| 100/100 [00:43<00:00,  2.31it/s]


Average Metric: 57 / 100  (57.0%)
Score: 57.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.92
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 2/1401 [00:03<38:19,  1.64s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:40<00:00,  2.50it/s]


Average Metric: 60 / 100  (60.0%)
Score: 60.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.88
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 4/1401 [00:06<38:17,  1.64s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:34<00:00,  2.92it/s]


Average Metric: 58 / 100  (58.0%)
Score: 58.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.88
Average of max per entry across top 5 scores: 0.94
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 1/1401 [00:01<31:07,  1.33s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:34<00:00,  2.88it/s]


Average Metric: 61 / 100  (61.0%)
Score: 61.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0, 61.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.91
Average of max per entry across top 5 scores: 0.95
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.99


  0%|          | 4/1401 [00:08<46:58,  2.02s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


Average Metric: 60 / 100  (60.0%)
Score: 60.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0, 61.0, 60.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.91
Average of max per entry across top 5 scores: 0.95
Average of max per entry across top 8 scores: 0.97
Average of max per entry across top 9999 scores: 0.99


  0%|          | 3/1401 [00:04<36:43,  1.58s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 64 / 100  (64.0): 100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


Average Metric: 64 / 100  (64.0%)
Score: 64.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0, 61.0, 60.0, 64.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 5/1401 [00:09<43:56,  1.89s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:47<00:00,  2.08it/s]


Average Metric: 54 / 100  (54.0%)
Score: 54.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0, 61.0, 60.0, 64.0, 54.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 2/1401 [00:03<39:17,  1.69s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


Average Metric: 58 / 100  (58.0%)
Score: 58.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0, 61.0, 60.0, 64.0, 54.0, 58.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 7/1401 [00:11<37:47,  1.63s/it]


Bootstrapped 4 full traces after 8 examples in round 0.


Average Metric: 57 / 100  (57.0): 100%|██████████| 100/100 [00:40<00:00,  2.47it/s]

Average Metric: 57 / 100  (57.0%)
Score: 57.0 for set: [4]
Scores so far: [24.0, 58.0, 54.0, 55.0, 61.0, 68.0, 57.0, 60.0, 58.0, 61.0, 60.0, 64.0, 54.0, 58.0, 57.0]
Best score: 68.0
Average of max per entry across top 1 scores: 0.68
Average of max per entry across top 2 scores: 0.86
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99
15 candidate programs found.





In [44]:
# Score the optimized chain-of-thought program on the test split.
eval_fn(cot_preference_model)



Average Metric: 326 / 500  (65.2): 100%|██████████| 500/500 [03:33<00:00,  2.34it/s]

Average Metric: 326 / 500  (65.2%)





65.2

In [45]:
# Create the output directory here too, so this cell does not depend on an earlier
# cell having created it (exist_ok makes the call a no-op when it already exists).
os.makedirs('prompts', exist_ok=True)
# Save the compiled program (last element) of the first three candidate entries.
for idx, prog in enumerate([x[-1] for x in cot_preference_model.candidate_programs[:3]]):
    prog.save(f'prompts/cot_preference_model_{idx}.json')

In [46]:
# MIPRO optimizer applied to the plain (non-CoT) predictor.
mipro = MIPRO(metric=em_metric, init_temperature=1.0, num_candidates=4)

# Options forwarded to MIPRO through `eval_kwargs`.
kwargs = {
    "num_threads": NUM_THREADS,
    "display_progress": True,
    "display_table": 4,
}

# NOTE(review): no separate validation set is passed; the cost estimate printed by this
# cell reports 1401 dev examples, which matches the size of `train` -- confirm that
# scoring trials on the training set is intended.
mipro_preference_model = mipro.compile(
    student=PrefPredict(),
    trainset=train,
    num_trials=10,
    max_bootstrapped_demos=1,
    max_labeled_demos=0,
    eval_kwargs=kwargs,
    requires_permission_to_run=False,
)



Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Task Model: [94m[1m1401[0m[93m examples in dev set * [94m[1m10[0m[93m trials * [94m[1m# of LM calls in your program[0m[93m = ([94m[1m14010 * # of LM calls in your program[0m[93m) task model calls[0m
[93m- Prompt Model: # data summarizer calls (max [94m[1m10[0m[93m) + [94m[1m4[0m[93m * [94m[1m1[0m[93m lm calls in program = [94m[1m14[0m[93m prompt model calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of calls to prompt model * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).[0m

For a preliminary estimate of potential cos

  0%|          | 1/1401 [00:00<00:05, 274.91it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


  0%|          | 1/1401 [00:00<00:05, 262.83it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


  0%|          | 1/1401 [00:00<01:04, 21.81it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


In [None]:
# Score the MIPRO-optimized program on the test split (this cell has no recorded output).
eval_fn(mipro_preference_model)

# Additional resources

*