In [1]:
import dspy
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, MIPRO
from dspy.evaluate import Evaluate
from dspy.teleprompt.ensemble import Ensemble
from datasets import load_dataset
import random
import re
import os

# Seed Python's RNG up front so the random choices made further down
# (label position, dataset shuffle) are repeatable across runs.
random.seed(0)

# Switches for the optimisation runs. COMPILE_FS gates the few-shot
# bootstrap below: when False, previously saved prompts are loaded instead.
# NOTE(review): COMPILE_MIPRO / COMPILE_COT are presumably consumed by later
# cells in the same way -- confirm.
COMPILE_FS = False
COMPILE_MIPRO = False
COMPILE_COT = False

# Compiled programs are saved to (and reloaded from) this directory.
os.makedirs('prompts', exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Worker threads used by the evaluator and the optimiser.
NUM_THREADS = 8

# Client for a Llama-3-8B-Instruct model served at localhost:8080;
# register it as the default LM for every DSPy module in this notebook.
lm = dspy.HFClientVLLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    url="http://localhost",
    port=8080,
)
dspy.settings.configure(lm=lm)

# Dataset

* Create a dataset using the dspy.Example class
* We will use the Ultra Feedback dataset
    * 1 instruction
    * 4 possible completions
    * all of them are rated by GPT-4
* Our goal is to find a good prompt to get the best reward model (RM) out of Llama 3
* We will create a train, valid, and test dataset
* We will evaluate our model using exact match

In [3]:
# Per-field character budget: examples with a longer instruction or response
# are dropped (Llama 3 has a small context window).
MAX_FIELD_CHARS = 1524
# Stop scanning the dataset once more than this many examples are collected.
TARGET_EXAMPLES = 2000
# Split sizes; everything after valid + test becomes the training set.
N_VALID = 100
N_TEST = 500


def build_preference_example(row):
    """Turn one UltraFeedback row into a pairwise preference example.

    The highest- and lowest-scored completions are paired and placed in
    random order as ``text_1`` / ``text_2``; ``preferred_text`` records the
    position ("1" or "2") of the higher-scored one.

    Args:
        row: Dataset row with an ``"instruction"`` string and a
            ``"completions"`` list whose items carry ``"overall_score"``
            and ``"response"``.

    Returns:
        A ``dspy.Example`` with inputs ``instruction``, ``text_1`` and
        ``text_2``, or ``None`` when the row is unusable (no completions,
        tied scores, identical responses, or any field over
        ``MAX_FIELD_CHARS`` characters).
    """
    ranked = sorted(
        row["completions"], key=lambda c: c["overall_score"], reverse=True
    )
    if not ranked:
        return None

    # only take the top and bottom completions
    chosen_completion, rejected_completion = ranked[0], ranked[-1]
    if chosen_completion["overall_score"] == rejected_completion["overall_score"]:
        return None

    # Randomise which slot holds the better answer so position carries no signal.
    if random.random() < 0.5:
        text1 = chosen_completion["response"]
        text2 = rejected_completion["response"]
        preference = "1"
    else:
        text1 = rejected_completion["response"]
        text2 = chosen_completion["response"]
        preference = "2"

    if text1 == text2:
        return None
    if (
        len(text1) > MAX_FIELD_CHARS
        or len(text2) > MAX_FIELD_CHARS
        or len(row["instruction"]) > MAX_FIELD_CHARS
    ):
        return None

    return dspy.Example(
        **{
            "instruction": row["instruction"],
            "text_1": text1,
            "text_2": text2,
            "preferred_text": preference,
        }
    ).with_inputs("instruction", "text_1", "text_2")


dataset = load_dataset("openbmb/UltraFeedback")
all_data = []
for input_ in dataset["train"]:
    example = build_preference_example(input_)
    if example is not None:
        all_data.append(example)
    if len(all_data) > TARGET_EXAMPLES:
        break

random.shuffle(all_data)

valid = all_data[:N_VALID]
test = all_data[N_VALID:N_VALID + N_TEST]
train = all_data[N_VALID + N_TEST:]

  table = cls._concat_blocks(blocks, axis=0)


In [4]:
def extract_pref(example, pred):
    """Pull the predicted preference label out of a model prediction.

    Args:
        example: Gold example. Not used here; kept so existing callers that
            pass it keep working.
        pred: Mapping-like prediction exposing a ``"preferred_text"`` entry.

    Returns:
        The first run of digits found in ``pred["preferred_text"]`` as a
        string (e.g. ``"1"``), or ``None`` when the field is ``None`` or
        contains no digits.
    """
    raw = pred["preferred_text"]
    # A missing completion would otherwise make re.search raise TypeError.
    if raw is None:
        return None
    # str() tolerates predictions that arrive as an int rather than text.
    match = re.search(r"\d+", str(raw))
    return match.group() if match else None


def em_metric(example, pred, trace=None, frac=1.0, verbose=False):
    """Exact-match metric for the preference task.

    Args:
        example: Gold example; its ``"preferred_text"`` entry is the label.
        pred: Model prediction, passed to ``extract_pref``.
        trace: Not used here; accepted so the function can be called with a
            ``trace`` argument.
        frac: Not used here; kept for interface compatibility.
        verbose: When true, print the gold label and the extracted prediction.

    Returns:
        ``True`` when the extracted prediction equals the gold label,
        otherwise ``False`` (including when nothing could be extracted).
    """
    pref_pred = extract_pref(example, pred)
    if verbose:
        print(f"Example: {example['preferred_text']}")
        print(f"Prediction: {pref_pred}")
    # The comparison always yields a bool, so no separate None check is needed.
    return example["preferred_text"] == pref_pred

# Evaluator that scores a program on the test split with the exact-match metric.
eval_fn = Evaluate(
    devset=test,
    metric=em_metric,
    num_threads=NUM_THREADS,
    display_progress=True,
)

In [5]:
# Signature for the pairwise preference task: given an instruction and two
# candidate responses, the model outputs which response it prefers.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- DSPy appears to use a Signature's docstring as the prompt
# instructions, so adding one would change the prompt; confirm before adding.
class Preference(dspy.Signature):
    # The instruction both responses were written for.
    instruction = dspy.InputField()
    # First candidate response.
    text_1 = dspy.InputField()
    # Second candidate response.
    text_2 = dspy.InputField()
    # Position of the preferred response; gold labels are the strings "1" / "2".
    preferred_text = dspy.OutputField(desc="Only return the preferred text (1 or 2) as an int", prefix="preferred_text:")

class PrefPredict(dspy.Module):
    """Module wrapping a single ``dspy.Predict`` step over ``Preference``."""

    def __init__(self):
        super().__init__()
        # Low temperature and a 3-token budget: the answer is a single label.
        predictor = dspy.Predict(Preference, max_tokens=3, temperature=0.1)
        self.preference = predictor

    def forward(self, instruction, text_1, text_2, *args, **kwargs):
        """Ask the predictor which of ``text_1`` / ``text_2`` is preferred.

        Extra positional and keyword arguments are accepted and ignored.
        Returns whatever the underlying predictor returns.
        """
        fields = {
            "instruction": instruction,
            "text_1": text_1,
            "text_2": text_2,
        }
        return self.preference(**fields)

In [6]:
# Smoke test: run the un-optimised predictor on one training example and
# score it, printing the gold label next to the extracted prediction.
sample = train[0]
pred = PrefPredict()(**sample)
print(pred)
em_metric(sample, pred, verbose=True)

Prediction(
    preferred_text='1'
)
Example: 1
Prediction: 1


True

In [7]:
# Show the most recent prompt sent to the LM to check what the model actually sees.
lm.inspect_history(n=1)





Given the fields `instruction`, `text_1`, `text_2`, produce the fields `preferred_text`.

---

Follow the following format.

Instruction: ${instruction}

Text 1: ${text_1}

Text 2: ${text_2}

preferred_text: Only return the preferred text (1 or 2) as an int

---

Instruction: For a given set of 10 food-related words, provide the frequency of their usage in a corpus of reviews and the average number of syllables for each word. Furthermore, can you identify the top 3 food-related words with the highest frequency of usage and calculate the standard deviation of their syllable counts across the corpus?

Text 1: Here are the frequencies of the ten food-related words in a corpus of restaurant reviews, along with the average number of syllables: - Pasta: 188 occurrences, 1.5 syllables - Salad: 157 occurrences, 1.7 syllables - Bread: 128 occurrences, 1.5 syllables - Chicken: 154 occurrences, 2.1 syllables - Rice: 115 occurrences, 1.5 syllables - Fish: 121 occurrences, 2.0 syllables - Meat:

In [8]:
# Baseline: exact-match score of a freshly constructed (uncompiled) predictor on the test split.
eval_fn(PrefPredict())

Average Metric: 3 / 4  (75.0):   1%|          | 3/500 [00:00<00:01, 477.06it/s]  

Average Metric: 324 / 500  (64.8): 100%|██████████| 500/500 [00:05<00:00, 86.71it/s] 

Average Metric: 324 / 500  (64.8%)





64.8

# Bootstrapping Few-Shot Examples with Random Search

* Needs a train set and a validation set, a metric, and an LLM.
* Does not require hand-written demonstrations, e.g., no demo CoT is required.
* The LLM will create good traces that satisfy the metric.
* We will use random search to find the best prompt over generated traces and input/output pairs.

In [9]:
# How many candidate programs are kept (and saved / reloaded) for the ensemble.
NUM_ENSEMBLE_PROGRAMS = 3

boot_fs = BootstrapFewShotWithRandomSearch(
    metric=em_metric,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    num_threads=NUM_THREADS,
    max_rounds=1,
    num_candidate_programs=50,
)

if COMPILE_FS:
    preference_model = boot_fs.compile(PrefPredict(), trainset=train, valset=valid)
    # The compiled program is the last element of each candidate entry.
    # NOTE(review): candidate_programs is presumably ordered best-first -- confirm.
    ensemble_preference_model = [
        prog
        for *_, prog in preference_model.candidate_programs[:NUM_ENSEMBLE_PROGRAMS]
    ]
    # Iterate the programs directly: they were already unpacked above, so
    # indexing each one again with [-1] would fail.
    for idx, prog in enumerate(ensemble_preference_model):
        prog.save(f'prompts/preference_model_{idx}.json')
else:
    # Skip the expensive search and restore the programs saved by an earlier run.
    ensemble_preference_model = []
    for idx in range(NUM_ENSEMBLE_PROGRAMS):
        prog = PrefPredict()
        prog.load(f'prompts/preference_model_{idx}.json')
        ensemble_preference_model.append(prog)

Going to sample between 1 and 4 traces per predictor.
Will attempt to train 50 candidate sets.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:01<00:00, 84.74it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [0]
New best score: 68.0 for seed -3
Scores so far: [68.0]
Best score: 68.0


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:06<00:00, 15.81it/s]


Average Metric: 74 / 100  (74.0%)
Score: 74.0 for set: [4]
New best score: 74.0 for seed -2
Scores so far: [68.0, 74.0]
Best score: 74.0


  0%|          | 4/1401 [00:00<02:06, 11.08it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:07<00:00, 14.27it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.92
Average of max per entry across top 3 scores: 0.95
Average of max per entry across top 5 scores: 0.95
Average of max per entry across top 8 scores: 0.95
Average of max per entry across top 9999 scores: 0.95


  1%|          | 8/1401 [00:00<02:09, 10.77it/s]


Bootstrapped 4 full traces after 9 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:05<00:00, 17.24it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.9
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.97
Average of max per entry across top 9999 scores: 0.97


  0%|          | 6/1401 [00:00<01:44, 13.37it/s]


Bootstrapped 2 full traces after 7 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:06<00:00, 15.62it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.9
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.98
Average of max per entry across top 9999 scores: 0.98


  0%|          | 1/1401 [00:00<02:26,  9.57it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:06<00:00, 15.91it/s]


Average Metric: 74 / 100  (74.0%)
Score: 74.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.9
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<01:43, 13.44it/s]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:06<00:00, 14.80it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0]
Best score: 74.0
Average of max per entry across top 1 scores: 0.74
Average of max per entry across top 2 scores: 0.9
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:43, 13.51it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 82 / 100  (82.0): 100%|██████████| 100/100 [00:05<00:00, 19.84it/s]


Average Metric: 82 / 100  (82.0%)
Score: 82.0 for set: [4]
New best score: 82.0 for seed 4
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:54, 12.17it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:07<00:00, 13.51it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:42, 13.66it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:04<00:00, 24.80it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:00<01:54, 12.18it/s]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:08<00:00, 11.81it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:53, 12.33it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:07<00:00, 12.96it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<02:02, 11.41it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:06<00:00, 15.51it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:52, 12.41it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:06<00:00, 14.84it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.95
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|          | 9/1401 [00:00<01:52, 12.42it/s]


Bootstrapped 4 full traces after 10 examples in round 0.


Average Metric: 78 / 100  (78.0): 100%|██████████| 100/100 [00:04<00:00, 20.06it/s]


Average Metric: 78 / 100  (78.0%)
Score: 78.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 7/1401 [00:00<02:00, 11.60it/s]


Bootstrapped 4 full traces after 8 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:05<00:00, 18.72it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<02:19, 10.00it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:05<00:00, 19.37it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:57, 11.87it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 55 / 100  (55.0): 100%|██████████| 100/100 [00:06<00:00, 16.65it/s]


Average Metric: 55 / 100  (55.0%)
Score: 55.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:25, 16.41it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:03<00:00, 31.97it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<02:02, 11.42it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:03<00:00, 26.64it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<01:53, 12.27it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:03<00:00, 29.13it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:30, 15.39it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:06<00:00, 14.54it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<02:35,  9.02it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:09<00:00, 11.05it/s]


Average Metric: 74 / 100  (74.0%)
Score: 74.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:59, 11.65it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:08<00:00, 12.26it/s]


Average Metric: 65 / 100  (65.0%)
Score: 65.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<01:42, 13.67it/s]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:07<00:00, 13.88it/s]


Average Metric: 61 / 100  (61.0%)
Score: 61.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<02:06, 11.07it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 50 / 100  (50.0): 100%|██████████| 100/100 [00:08<00:00, 12.31it/s]


Average Metric: 50 / 100  (50.0%)
Score: 50.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<02:13, 10.45it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:04<00:00, 20.29it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 6/1401 [00:00<01:47, 13.02it/s]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:04<00:00, 23.60it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<01:58, 11.81it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 52 / 100  (52.0): 100%|██████████| 100/100 [00:07<00:00, 13.81it/s]


Average Metric: 52 / 100  (52.0%)
Score: 52.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<02:05, 11.13it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:07<00:00, 13.24it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:00<02:17, 10.15it/s]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:05<00:00, 17.19it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:48, 12.89it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:07<00:00, 12.93it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<01:33, 15.01it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:05<00:00, 18.33it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:00<02:42,  8.61it/s]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:04<00:00, 22.72it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<02:20,  9.94it/s]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:05<00:00, 17.30it/s]


Average Metric: 67 / 100  (67.0%)
Score: 67.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:43, 13.52it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 78 / 100  (78.0): 100%|██████████| 100/100 [00:07<00:00, 12.77it/s]


Average Metric: 78 / 100  (78.0%)
Score: 78.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 6/1401 [00:00<02:05, 11.10it/s]


Bootstrapped 2 full traces after 7 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:07<00:00, 14.22it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|          | 8/1401 [00:00<01:45, 13.22it/s]


Bootstrapped 3 full traces after 9 examples in round 0.


Average Metric: 64 / 100  (64.0): 100%|██████████| 100/100 [00:04<00:00, 20.55it/s]


Average Metric: 64 / 100  (64.0%)
Score: 64.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:00<04:38,  5.02it/s]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:08<00:00, 11.33it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<02:27,  9.48it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:10<00:00,  9.86it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:59, 11.75it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:07<00:00, 13.55it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<01:44, 13.42it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 50 / 100  (50.0): 100%|██████████| 100/100 [00:06<00:00, 14.50it/s]


Average Metric: 50 / 100  (50.0%)
Score: 50.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:00<02:14, 10.40it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:06<00:00, 15.38it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<01:49, 12.72it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:05<00:00, 18.09it/s]


Average Metric: 74 / 100  (74.0%)
Score: 74.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:00<02:11, 10.60it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:06<00:00, 15.85it/s]


Average Metric: 65 / 100  (65.0%)
Score: 65.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:35, 14.61it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:05<00:00, 19.75it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:55, 12.11it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 64 / 100  (64.0): 100%|██████████| 100/100 [00:03<00:00, 25.21it/s]


Average Metric: 64 / 100  (64.0%)
Score: 64.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|          | 8/1401 [00:00<01:39, 14.00it/s]


Bootstrapped 4 full traces after 9 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:04<00:00, 21.94it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0, 73.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:00<02:32,  9.17it/s]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:06<00:00, 15.00it/s]


Average Metric: 74 / 100  (74.0%)
Score: 74.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0, 73.0, 74.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:34, 14.79it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:03<00:00, 27.57it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0, 73.0, 74.0, 68.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:00<01:42, 13.61it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:05<00:00, 19.66it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0, 73.0, 74.0, 68.0, 73.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 7/1401 [00:00<02:01, 11.43it/s]


Bootstrapped 3 full traces after 8 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:09<00:00, 10.81it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0, 73.0, 74.0, 68.0, 73.0, 70.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:00<01:42, 13.65it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:04<00:00, 21.29it/s]

Average Metric: 53 / 100  (53.0%)
Score: 53.0 for set: [4]
Scores so far: [68.0, 74.0, 63.0, 73.0, 68.0, 74.0, 69.0, 82.0, 68.0, 69.0, 63.0, 69.0, 72.0, 70.0, 78.0, 69.0, 70.0, 55.0, 63.0, 70.0, 71.0, 70.0, 74.0, 65.0, 61.0, 50.0, 68.0, 72.0, 52.0, 70.0, 73.0, 72.0, 71.0, 71.0, 67.0, 78.0, 70.0, 64.0, 73.0, 72.0, 68.0, 50.0, 66.0, 74.0, 65.0, 70.0, 64.0, 73.0, 74.0, 68.0, 73.0, 70.0, 53.0]
Best score: 82.0
Average of max per entry across top 1 scores: 0.82
Average of max per entry across top 2 scores: 0.96
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
53 candidate programs found.





In [11]:
# Score a single program (the first entry of the few-shot program list) with
# eval_fn; this gives the single-program number that the ensemble section
# below is compared against.
# NOTE(review): index 0 is presumably the top-scoring candidate — confirm
# against the cell that builds `ensemble_preference_model`.
eval_fn(ensemble_preference_model[0])

  0%|          | 0/500 [00:00<?, ?it/s]

Average Metric: 342 / 500  (68.4): 100%|██████████| 500/500 [00:25<00:00, 19.47it/s]

Average Metric: 342 / 500  (68.4%)





68.4

In [18]:
# Debugging aid: display the single most recent call (n=1) recorded by `lm`.
# NOTE(review): the output depends on whichever LM call ran last, so it
# changes with cell execution order.
lm.inspect_history(n=1)





Given the fields `instruction`, `text_1`, `text_2`, produce the fields `preferred_text`.

---

Instruction: Recognize whether the following phrase is in passive voice and identify the subject and object of the sentence. Additionally, determine the tense of the verb and indicate if the action was completed or ongoing at a specific point in time. #The Given Prompt#: Create a function that takes a list of integers and returns the largest product that can be made by multiplying any three integers. ``` def largest_product(lst): pass ``` #Rewritten Prompt#: Create a function that takes a list of integers and returns the largest product that can be made by multiplying any three distinct integers. However, if any of the integers are negative, the function should only consider the two smallest negative integers and the largest positive integer in the list for the calculation of the product. If there are no negative integers, the function should consider the three largest integers in the lis

# Using an ensemble

In [12]:
# Build an Ensemble optimizer whose reduce function is dspy.majority
# (presumably a majority vote over the member programs' outputs — confirm).
ensemble_optimizer = Ensemble(reduce_fn=dspy.majority)
# Compile the list of few-shot programs into one object that is evaluated
# below in the same way as a single program.
ensemble_preference_model_fn = ensemble_optimizer.compile(ensemble_preference_model)
# Score the ensemble with the same eval_fn used for the single program above.
eval_fn(ensemble_preference_model_fn)

Average Metric: 345 / 500  (69.0): 100%|██████████| 500/500 [00:57<00:00,  8.68it/s]

Average Metric: 345 / 500  (69.0%)





69.0

# Adding Chain of Thought

In [13]:
class CoTPrefPredict(dspy.Module):
    """Chain-of-thought variant of the preference predictor.

    Wraps `Preference` (defined elsewhere in the notebook; presumably a dspy
    signature) in `dspy.ChainOfThought` and exposes it as the module's only
    sub-predictor, `self.preference`.
    """

    def __init__(self):
        super().__init__()
        self.preference = dspy.ChainOfThought(Preference)

    def forward(self, instruction, text_1, text_2, *args, **kwargs):
        """Run the chain-of-thought predictor on one pairwise comparison.

        Args:
            instruction: Passed to the predictor as its `instruction` field.
            text_1: Passed to the predictor as its `text_1` field.
            text_2: Passed to the predictor as its `text_2` field.
            *args: Accepted but ignored.
            **kwargs: Accepted but ignored; nothing extra reaches the predictor.

        Returns:
            Whatever `self.preference(...)` returns, unmodified.
        """
        fields = {
            "instruction": instruction,
            "text_1": text_1,
            "text_2": text_2,
        }
        return self.preference(**fields)

In [14]:
# Build `ensemble_cot_preference_model`, the list of chain-of-thought programs.
# COMPILE_COT chooses between re-running the optimizer (and saving the result)
# and reloading previously saved prompts from the prompts/ directory.
if COMPILE_COT:
    # Optimize a fresh CoTPrefPredict on the train/valid splits.
    # NOTE(review): `boot_fs` is defined outside this cell — presumably the
    # BootstrapFewShotWithRandomSearch optimizer; confirm.
    cot_preference_model = boot_fs.compile(CoTPrefPredict(), trainset=train, valset=valid)
    ensemble_cot_preference_model = []
    # Keep the first three candidate entries; the last element of each entry is
    # the program object that gets saved and collected.
    # Assumes candidate_programs is ordered best-first — TODO confirm.
    for idx, prog in enumerate([x[-1] for x in cot_preference_model.candidate_programs[:3]]):
        # Persist each program so the else-branch can reload it on later runs.
        prog.save(f'prompts/cot_preference_model_{idx}.json')
        ensemble_cot_preference_model.append(prog)
else:
    ensemble_cot_preference_model = []
    # The count here (3) must stay in sync with the slice in the compile branch,
    # otherwise this loop asks for files that were never written.
    for idx in range(3):
        # Load the saved state into a fresh module instance.
        prog = CoTPrefPredict()
        prog.load(f'prompts/cot_preference_model_{idx}.json')
        ensemble_cot_preference_model.append(prog)
    

Average Metric: 1 / 1  (100.0):   0%|          | 0/100 [00:00<?, ?it/s]

Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:18<00:00,  5.37it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [0]
New best score: 63.0 for seed -3
Scores so far: [63.0]
Best score: 63.0


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
New best score: 66.0 for seed -2
Scores so far: [63.0, 66.0]
Best score: 66.0


  0%|          | 4/1401 [00:05<30:27,  1.31s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0]
Best score: 66.0
Average of max per entry across top 1 scores: 0.66
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.91
Average of max per entry across top 5 scores: 0.91
Average of max per entry across top 8 scores: 0.91
Average of max per entry across top 9999 scores: 0.91


  1%|          | 8/1401 [00:11<32:22,  1.39s/it]


Bootstrapped 4 full traces after 9 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:31<00:00,  3.20it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
New best score: 71.0 for seed 0
Scores so far: [63.0, 66.0, 66.0, 71.0]
Best score: 71.0
Average of max per entry across top 1 scores: 0.71
Average of max per entry across top 2 scores: 0.9
Average of max per entry across top 3 scores: 0.95
Average of max per entry across top 5 scores: 0.97
Average of max per entry across top 8 scores: 0.97
Average of max per entry across top 9999 scores: 0.97


  0%|          | 6/1401 [00:07<29:49,  1.28s/it]


Bootstrapped 2 full traces after 7 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:31<00:00,  3.17it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0]
Best score: 71.0
Average of max per entry across top 1 scores: 0.71
Average of max per entry across top 2 scores: 0.91
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 0.99


  0%|          | 1/1401 [00:02<47:31,  2.04s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0]
Best score: 71.0
Average of max per entry across top 1 scores: 0.71
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.99
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:04<35:47,  1.54s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]


Average Metric: 75 / 100  (75.0%)
Score: 75.0 for set: [4]
New best score: 75.0 for seed 3
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0]
Best score: 75.0
Average of max per entry across top 1 scores: 0.75
Average of max per entry across top 2 scores: 0.91
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:02<29:55,  1.28s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 77 / 100  (77.0): 100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


Average Metric: 77 / 100  (77.0%)
Score: 77.0 for set: [4]
New best score: 77.0 for seed 4
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:04<35:27,  1.52s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:36<00:00,  2.70it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<29:46,  1.28s/it]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:27<00:00,  3.69it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<34:01,  1.46s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:35<00:00,  2.79it/s]


Average Metric: 51 / 100  (51.0%)
Score: 51.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:04<35:20,  1.52s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:36<00:00,  2.76it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.97
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 6/1401 [00:10<40:42,  1.75s/it]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:32<00:00,  3.05it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<29:58,  1.28s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|          | 9/1401 [00:12<31:40,  1.37s/it]


Bootstrapped 4 full traces after 10 examples in round 0.


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:28<00:00,  3.49it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 6/1401 [00:08<32:52,  1.41s/it]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:35<00:00,  2.83it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:07<33:54,  1.46s/it]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:29<00:00,  3.38it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:02<48:41,  2.09s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 55 / 100  (55.0): 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Average Metric: 55 / 100  (55.0%)
Score: 55.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<33:27,  1.44s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:27<00:00,  3.65it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<30:11,  1.30s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:26<00:00,  3.72it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<29:59,  1.29s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:29<00:00,  3.38it/s]


Average Metric: 73 / 100  (73.0%)
Score: 73.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:03<45:03,  1.93s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.93
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:02<30:51,  1.32s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]


Average Metric: 76 / 100  (76.0%)
Score: 76.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<34:13,  1.47s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Average Metric: 76 / 100  (76.0%)
Score: 76.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<29:53,  1.28s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


Average Metric: 68 / 100  (68.0%)
Score: 68.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:03<38:21,  1.64s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 49 / 100  (49.0): 100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


Average Metric: 49 / 100  (49.0%)
Score: 49.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:03<30:32,  1.31s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Average Metric: 65 / 100  (65.0%)
Score: 65.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:07<33:15,  1.43s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:05<34:10,  1.47s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:37<00:00,  2.70it/s]


Average Metric: 59 / 100  (59.0%)
Score: 59.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:02<30:13,  1.30s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:29<00:00,  3.39it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:08<40:19,  1.73s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:35<00:00,  2.83it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:02<29:57,  1.28s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:27<00:00,  3.67it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<29:43,  1.27s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:27<00:00,  3.66it/s]


Average Metric: 72 / 100  (72.0%)
Score: 72.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:06<35:26,  1.52s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:29<00:00,  3.43it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:03<39:18,  1.69s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


Average Metric: 61 / 100  (61.0%)
Score: 61.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.98
Average of max per entry across top 8 scores: 0.99
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<29:55,  1.28s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:28<00:00,  3.50it/s]


Average Metric: 76 / 100  (76.0%)
Score: 76.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 2/1401 [00:02<30:16,  1.30s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:29<00:00,  3.43it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 7/1401 [00:09<32:08,  1.38s/it]


Bootstrapped 3 full traces after 8 examples in round 0.


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:33<00:00,  2.95it/s]


Average Metric: 65 / 100  (65.0%)
Score: 65.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:08<39:44,  1.71s/it]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:05<41:55,  1.80s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:37<00:00,  2.67it/s]


Average Metric: 74 / 100  (74.0%)
Score: 74.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<30:23,  1.30s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:28<00:00,  3.47it/s]


Average Metric: 70 / 100  (70.0%)
Score: 70.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 4/1401 [00:06<37:55,  1.63s/it]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Average Metric: 65 / 100  (65.0%)
Score: 65.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:03<30:29,  1.31s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:07<36:29,  1.57s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:07<33:43,  1.45s/it]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:33<00:00,  2.98it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<29:51,  1.28s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Average Metric: 66 / 100  (66.0%)
Score: 66.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<30:19,  1.30s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:26<00:00,  3.81it/s]


Average Metric: 65 / 100  (65.0%)
Score: 65.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 7/1401 [00:10<34:18,  1.48s/it]


Bootstrapped 4 full traces after 8 examples in round 0.


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:33<00:00,  3.03it/s]


Average Metric: 67 / 100  (67.0%)
Score: 67.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0, 67.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:04<36:32,  1.57s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:34<00:00,  2.88it/s]


Average Metric: 67 / 100  (67.0%)
Score: 67.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0, 67.0, 67.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<29:54,  1.28s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:25<00:00,  3.93it/s]


Average Metric: 63 / 100  (63.0%)
Score: 63.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0, 67.0, 67.0, 63.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 3/1401 [00:03<29:58,  1.29s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:28<00:00,  3.54it/s]


Average Metric: 69 / 100  (69.0%)
Score: 69.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0, 67.0, 67.0, 63.0, 69.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 5/1401 [00:07<33:46,  1.45s/it]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:40<00:00,  2.45it/s]


Average Metric: 71 / 100  (71.0%)
Score: 71.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0, 67.0, 67.0, 63.0, 69.0, 71.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 1/1401 [00:01<45:43,  1.96s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]

Average Metric: 56 / 100  (56.0%)
Score: 56.0 for set: [4]
Scores so far: [63.0, 66.0, 66.0, 71.0, 69.0, 71.0, 75.0, 77.0, 68.0, 69.0, 51.0, 71.0, 72.0, 72.0, 66.0, 72.0, 70.0, 55.0, 71.0, 73.0, 73.0, 70.0, 76.0, 76.0, 68.0, 49.0, 65.0, 66.0, 59.0, 71.0, 69.0, 71.0, 72.0, 63.0, 61.0, 76.0, 63.0, 65.0, 71.0, 74.0, 70.0, 65.0, 71.0, 66.0, 71.0, 66.0, 65.0, 67.0, 67.0, 63.0, 69.0, 71.0, 56.0]
Best score: 77.0
Average of max per entry across top 1 scores: 0.77
Average of max per entry across top 2 scores: 0.97
Average of max per entry across top 3 scores: 0.98
Average of max per entry across top 5 scores: 0.99
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
53 candidate programs found.





In [17]:
# Evaluate only the first entry of `ensemble_cot_preference_model` with eval_fn,
# as a single-program reference point.
# NOTE(review): index 0 is presumably the best-scoring candidate from the
# random search above -- confirm how the candidate list is ordered.
eval_fn(ensemble_cot_preference_model[0])

Average Metric: 352 / 500  (70.4): 100%|██████████| 500/500 [02:17<00:00,  3.63it/s]

Average Metric: 352 / 500  (70.4%)





70.4

In [19]:
# Print the most recent prompt/completion exchange(s) recorded by `lm` so the
# generated prompt can be inspected by hand.
# NOTE(review): `n` is not defined in this cell; it must come from an earlier
# cell. Confirm it survives Restart & Run All, or pass a literal (e.g. n=1).
lm.inspect_history(n)





Given the fields `instruction`, `text_1`, `text_2`, produce the fields `preferred_text`.

---

Instruction: Recognize whether the following phrase is in passive voice and identify the subject and object of the sentence. Additionally, determine the tense of the verb and indicate if the action was completed or ongoing at a specific point in time. #The Given Prompt#: Create a function that takes a list of integers and returns the largest product that can be made by multiplying any three integers. ``` def largest_product(lst): pass ``` #Rewritten Prompt#: Create a function that takes a list of integers and returns the largest product that can be made by multiplying any three distinct integers. However, if any of the integers are negative, the function should only consider the two smallest negative integers and the largest positive integer in the list for the calculation of the product. If there are no negative integers, the function should consider the three largest integers in the lis

In [22]:
# Build a single ensemble program from the candidate programs via
# `ensemble_optimizer` (configured in an earlier cell), then evaluate it with
# the same eval_fn used for the individual candidate.
# NOTE(review): the recorded score for this cell matches the single-candidate
# score exactly (352 / 500) -- verify the ensemble really aggregates more than
# one program's output.
ensemble_cot_preference_model_fn = ensemble_optimizer.compile(ensemble_cot_preference_model)
eval_fn(ensemble_cot_preference_model_fn)

Average Metric: 352 / 500  (70.4): 100%|██████████| 500/500 [05:00<00:00,  1.67it/s]

Average Metric: 352 / 500  (70.4%)





70.4

# MIPRO (Multi-prompt Instruction Proposal Optimizer)

https://twitter.com/kristahopsalong/status/1766166198079889737

* Takes a teacher LLM and a student LLM, a dataset, and a metric.
* Multi-stage optimization
    * The teacher looks at inputs and outputs and summarizes the data
    * The student generates good traces for a few inputs on which the metric passes
    * The teacher creates instructions given the good traces and the summary of the data
    * Uses a form of Bayesian optimization to search over instructions and examples


In [23]:
# The same LM client fills both MIPRO roles: it proposes instructions
# (prompt model) and runs the program being optimized (task model).
teacher = student = lm

# Options forwarded to the evaluation runs MIPRO performs during the search.
kwargs = {
    "num_threads": NUM_THREADS,
    "display_progress": True,
    "display_table": 4,
}

mipro = MIPRO(
    metric=em_metric,
    prompt_model=teacher,
    task_model=student,
    num_candidates=8,
    init_temperature=1.0,
)

# requires_permission_to_run=False skips the interactive cost confirmation,
# so the cell can run unattended.
mipro_preference_model = mipro.compile(
    student=PrefPredict(),
    trainset=train,
    num_trials=20,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    eval_kwargs=kwargs,
    requires_permission_to_run=False,
)



Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Task Model: [94m[1m1401[0m[93m examples in dev set * [94m[1m20[0m[93m trials * [94m[1m# of LM calls in your program[0m[93m = ([94m[1m28020 * # of LM calls in your program[0m[93m) task model calls[0m
[93m- Prompt Model: # data summarizer calls (max [94m[1m10[0m[93m) + [94m[1m8[0m[93m * [94m[1m1[0m[93m lm calls in program = [94m[1m18[0m[93m prompt model calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of calls to prompt model * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).[0m

For a preliminary estimate of potential cos

  1%|          | 10/1401 [00:00<00:03, 395.40it/s]


Bootstrapped 4 full traces after 11 examples in round 0.


  0%|          | 4/1401 [00:00<00:04, 290.78it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


  0%|          | 6/1401 [00:00<00:03, 401.64it/s]


Bootstrapped 4 full traces after 7 examples in round 0.


  0%|          | 4/1401 [00:00<00:03, 376.70it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


  0%|          | 5/1401 [00:00<00:03, 382.45it/s]


Bootstrapped 4 full traces after 6 examples in round 0.


  0%|          | 7/1401 [00:00<00:04, 344.58it/s]


Bootstrapped 4 full traces after 8 examples in round 0.


  1%|          | 8/1401 [00:00<00:03, 401.93it/s]

Bootstrapped 4 full traces after 9 examples in round 0.





In [None]:
# Score the MIPRO-optimized program with the same eval_fn used for the other
# models in this notebook, so the numbers can be compared directly.
eval_fn(mipro_preference_model)



Average Metric: 333 / 500  (66.6): 100%|██████████| 500/500 [00:44<00:00, 11.24it/s]

Average Metric: 333 / 500  (66.6%)





66.6