In [1]:
import dspy
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, MIPRO
from dspy.evaluate import Evaluate
from dspy.teleprompt.ensemble import Ensemble
from datasets import load_dataset
import random
import re
import os

# save all our hard work in the prompts directory
os.makedirs('prompts', exist_ok=True)

# Seed Python's `random` module; the dataset cell uses it for the answer-order
# coin flip and for shuffling before the train/valid/test split.
random.seed(0)

# Set a flag to True to re-run the corresponding optimizer and save its
# prompts under prompts/. When False, the few-shot and CoT cells load the
# previously saved prompts instead; the MIPRO cell does nothing.
COMPILE_FS = False
COMPILE_MIPRO = False
COMPILE_COT = False

  from .autonotebook import tqdm as notebook_tqdm


# Why or why not dspy?

The goal of dspy is to extract better performance out of LLMs using prompting.

* Pros:
    * Write code instead of prompts.
    * Data and metric (reward) driven.
    * Write the code once and run with multiple LLMs.
* Cons: 
    * You need a big context size for all this prompting.
    * Nothing is free: you need a lot of LM calls to evaluate all these prompts (ideally you would use a batch job, not a notebook).
    * Still in early development, there is refactoring and sometimes the interface is inconsistent.


In [2]:
# Client for Llama 3 8B Instruct; assumes a vLLM-compatible server is already
# listening at http://localhost:8080.
lm = dspy.HFClientVLLM(model="meta-llama/Meta-Llama-3-8B-Instruct",
                       port=8080,
                       url="http://localhost")
# Register this client as the LM in dspy's global settings.
dspy.settings.configure(lm=lm)
# Thread count passed to the evaluator and the optimizers below.
NUM_THREADS = 8

# Dataset

* Create a dataset using the dspy.Example class
* We will use the Ultra Feedback dataset
    * 1 instruction
    * 4 possible completions
    * all of them are rated by gpt4
* Our goal is to find a prompt that turns Llama 3 into the best possible reward model (RM)
* We will create a train, valid, and test dataset
* We will evaluate our model using exact match

In [3]:
# Maximum length, in characters, allowed for the instruction and for each
# candidate response.
MAX_FIELD_CHARS = 1524
# Collection stops as soon as more than this many examples have been gathered.
MAX_EXAMPLES = 2000

dataset = load_dataset("openbmb/UltraFeedback")
all_data = []
for input_ in dataset["train"]:
    completions = sorted(
        input_["completions"], key=lambda x: x["overall_score"], reverse=True
    )
    if not completions:
        continue
    # only take the top and bottom completions
    chosen_completion = completions[0]
    rejected_completion = completions[-1]
    # A tie (including a single completion) carries no preference signal.
    if chosen_completion["overall_score"] == rejected_completion["overall_score"]:
        continue
    # Coin flip for which slot holds the preferred response, so the label is
    # not tied to a fixed position.
    if random.random() < 0.5:
        text1 = chosen_completion["response"]
        text2 = rejected_completion["response"]
        preference = "1"
    else:
        text1 = rejected_completion["response"]
        text2 = chosen_completion["response"]
        preference = "2"

    if text1 == text2:
        continue
    # Llama 3 has a small context window: drop examples where any field is
    # too long (length is measured in characters, not tokens).
    if max(len(text1), len(text2), len(input_["instruction"])) > MAX_FIELD_CHARS:
        continue

    all_data.append(
        dspy.Example(
            instruction=input_["instruction"],
            text_1=text1,
            text_2=text2,
            preferred_text=preference,
        ).with_inputs("instruction", "text_1", "text_2")
    )
    if len(all_data) > MAX_EXAMPLES:
        break

random.shuffle(all_data)

# First 100 shuffled examples for validation, next 500 for test, rest for training.
valid = all_data[:100]
test = all_data[100:600]
train = all_data[600:]

  table = cls._concat_blocks(blocks, axis=0)


In [4]:
def extract_pref(example, pred):
    """Parse the predicted preference label out of a prediction.

    Args:
        example: The gold example. Unused; accepted so the call mirrors the
            ``(example, pred)`` arguments of the metric.
        pred: Object indexable by ``"preferred_text"`` (a prediction or dict).

    Returns:
        The first run of digits in ``pred["preferred_text"]`` as a string
        (e.g. ``"2"``), or ``None`` if the field is ``None`` or has no digits.
    """
    raw = pred["preferred_text"]
    if raw is None:
        return None
    # str() lets a non-string output (e.g. the int 2) go through the regex
    # instead of raising TypeError.
    match = re.search(r"\d+", str(raw))
    return match.group() if match else None


def em_metric(example, pred, trace=None, frac=1.0, verbose=False):
    """Exact-match metric between the gold and the predicted preference.

    Args:
        example: Gold example; ``example["preferred_text"]`` is the label.
        pred: Model prediction, parsed with ``extract_pref``.
        trace: Unused here; presumably part of the metric calling convention
            the dspy optimizers expect (confirm).
        frac: Unused.
        verbose: When true, print the gold label and the parsed prediction.

    Returns:
        The result of comparing the gold label with the parsed prediction.
        A prediction with no digits parses to ``None`` and compares unequal.
    """
    pref_pred = extract_pref(example, pred)
    if verbose:
        print(f"Example: {example['preferred_text']}")
        print(f"Prediction: {pref_pred}")
    # For the string labels built in this notebook `==` always yields a bool,
    # so no separate None check on the result is needed.
    return example["preferred_text"] == pref_pred

# Shared evaluator: scores a program on the `test` split with the exact-match
# metric, using NUM_THREADS threads and showing a progress bar.
eval_fn = Evaluate(devset=test, metric=em_metric, num_threads=NUM_THREADS, display_progress=True)

In [5]:
# Signature for the pairwise preference task: an instruction plus two
# candidate responses in, the index of the preferred response out.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — presumably dspy would use a docstring as the prompt instructions,
# which would change the generated prompts (confirm).
class Preference(dspy.Signature):
    instruction = dspy.InputField()
    text_1 = dspy.InputField()
    text_2 = dspy.InputField()
    # `prefix` and `desc` show up verbatim in the prompt's format section
    # (see the inspect_history output further down).
    preferred_text = dspy.OutputField(desc="Only return the preferred text (1 or 2) as an int", prefix="preferred_text:")

class PrefPredict(dspy.Module):
    """Preference predictor backed by a single `dspy.Predict` step."""

    def __init__(self):
        super().__init__()
        # The answer is a single digit, so generation is capped at 3 tokens
        # and run at a low temperature.
        self.preference = dspy.Predict(Preference, max_tokens=3, temperature=0.1)

    def forward(self, instruction, text_1, text_2, *args, **kwargs):
        """Run the predictor on one (instruction, text_1, text_2) triple.

        Extra positional/keyword arguments are ignored; presumably this lets
        the module be called with an unpacked example that also carries the
        label, as in ``PrefPredict()(**train[0])``.

        Returns:
            Whatever the underlying `dspy.Predict` call returns.
        """
        return self.preference(instruction=instruction, text_1=text_1, text_2=text_2)

In [6]:
# Smoke test: run the uncompiled predictor on one training example and score
# it, printing the gold label and the parsed prediction.
pred = PrefPredict()(**train[0])
print(pred)
em_metric(train[0], pred, verbose=True)

Prediction(
    preferred_text='1'
)
Example: 1
Prediction: 1


True

In [7]:
# Show the most recent prompt/completion exchanged with the LM.
lm.inspect_history(n=1)





Given the fields `instruction`, `text_1`, `text_2`, produce the fields `preferred_text`.

---

Follow the following format.

Instruction: ${instruction}

Text 1: ${text_1}

Text 2: ${text_2}

preferred_text: Only return the preferred text (1 or 2) as an int

---

Instruction: For a given set of 10 food-related words, provide the frequency of their usage in a corpus of reviews and the average number of syllables for each word. Furthermore, can you identify the top 3 food-related words with the highest frequency of usage and calculate the standard deviation of their syllable counts across the corpus?

Text 1: Here are the frequencies of the ten food-related words in a corpus of restaurant reviews, along with the average number of syllables: - Pasta: 188 occurrences, 1.5 syllables - Salad: 157 occurrences, 1.7 syllables - Bread: 128 occurrences, 1.5 syllables - Chicken: 154 occurrences, 2.1 syllables - Rice: 115 occurrences, 1.5 syllables - Fish: 121 occurrences, 2.0 syllables - Meat:

In [8]:
# Baseline: score the uncompiled predictor on the test split.
eval_fn(PrefPredict())

Average Metric: 324 / 500  (64.8): 100%|██████████| 500/500 [00:00<00:00, 867.75it/s]


Average Metric: 324 / 500  (64.8%)


64.8

# Bootstrapping Few Shot Example with Random Search

* Needs a train set and a validation set, a metric, and an LLM.
* Does not require hand-written demonstrations, e.g., no demo CoT is required.
* The LLM will create good traces that satisfy the metric.
* We will use random search to find the best prompt over generated traces and input/output pairs.

In [9]:
# Optimizer reused for both the Predict program here and the ChainOfThought
# program further down.
boot_fs = BootstrapFewShotWithRandomSearch(metric=em_metric, max_bootstrapped_demos=4, max_labeled_demos=4,
                                           num_threads=NUM_THREADS, max_rounds=1, num_candidate_programs=50)

if COMPILE_FS:
    preference_model = boot_fs.compile(PrefPredict(), trainset=train, valset=valid)
    # Each candidate entry ends with the compiled program; keep the first
    # three (presumably ordered best-first — confirm against dspy).
    ensemble_preference_model = [prog for *_, prog in preference_model.candidate_programs[:3]]
    # The list already holds the programs themselves, so save them directly
    # (indexing each one again with [-1] would fail).
    for idx, prog in enumerate(ensemble_preference_model):
        prog.save(f'prompts/preference_model_{idx}.json')
else:
    # Reload the three programs saved by an earlier compile run.
    ensemble_preference_model = []
    for idx in range(3):
        prog = PrefPredict()
        prog.load(f'prompts/preference_model_{idx}.json')
        ensemble_preference_model.append(prog)

Going to sample between 1 and 4 traces per predictor.
Will attempt to train 50 candidate sets.


In [10]:
# Score the first of the three few-shot programs on the test split.
eval_fn(ensemble_preference_model[0])

Average Metric: 342 / 500  (68.4): 100%|██████████| 500/500 [00:00<00:00, 774.64it/s]


Average Metric: 342 / 500  (68.4%)


68.4

In [11]:
# Show the most recent prompt/completion exchanged with the LM (now issued by
# the few-shot program evaluated above).
lm.inspect_history(n=1)





Given the fields `instruction`, `text_1`, `text_2`, produce the fields `preferred_text`.

---

Follow the following format.

Instruction: ${instruction}

Text 1: ${text_1}

Text 2: ${text_2}

preferred_text: Only return the preferred text (1 or 2) as an int

---

Instruction: Take the given text and modify it to make it more interesting. It was a sunny day.

Text 1: The day was radiant with a golden hue, the sun beaming down upon the world as if it was a precious gift.

Text 2: On a bright and radiant afternoon, the sun's rays cast a warm glow over the land, bringing life and joy to all in its path. The birds sang melodies in the trees, the flowers swayed in the gentle breeze, and the grass glistened as it basked in the sun's embrace. It was a day like no other, filled with promise and potential, ready to be savored and enjoyed to its fullest.

preferred_text: 2

---

Instruction: Rewrite the following sentences with more formal and polite language. Hey, what's up?

Text 1: Greetin

# Using an ensemble

In [12]:
# Wrap the three few-shot programs in one ensemble program whose outputs are
# reduced with dspy.majority, then score it on the test split.
ensemble_optimizer = Ensemble(reduce_fn=dspy.majority)
ensemble_preference_model_fn = ensemble_optimizer.compile(ensemble_preference_model)
eval_fn(ensemble_preference_model_fn)

Average Metric: 345 / 500  (69.0): 100%|██████████| 500/500 [00:01<00:00, 314.01it/s]


Average Metric: 345 / 500  (69.0%)


69.0

# Adding Chain of Thought

In [13]:
class CoTPrefPredict(dspy.Module):
    """Preference predictor backed by a `dspy.ChainOfThought` step."""

    def __init__(self):
        super().__init__()
        # Same signature as PrefPredict, without that class's max_tokens and
        # temperature overrides.
        self.preference = dspy.ChainOfThought(Preference)

    def forward(self, instruction, text_1, text_2, *args, **kwargs):
        """Run the chain-of-thought predictor on one example.

        Extra positional/keyword arguments are accepted and ignored.

        Returns:
            Whatever the underlying `dspy.ChainOfThought` call returns.
        """
        return self.preference(instruction=instruction, text_1=text_1, text_2=text_2)

In [14]:
if not COMPILE_COT:
    # Reload the three CoT programs saved by an earlier compile run.
    ensemble_cot_preference_model = []
    for idx in range(3):
        prog = CoTPrefPredict()
        prog.load(f'prompts/cot_preference_model_{idx}.json')
        ensemble_cot_preference_model.append(prog)
else:
    # Re-run the bootstrap optimizer on the CoT program, then save and keep
    # the first three candidates.
    cot_preference_model = boot_fs.compile(CoTPrefPredict(), trainset=train, valset=valid)
    ensemble_cot_preference_model = []
    for idx, candidate in enumerate(cot_preference_model.candidate_programs[:3]):
        # The compiled program is the last element of each candidate entry.
        prog = candidate[-1]
        prog.save(f'prompts/cot_preference_model_{idx}.json')
        ensemble_cot_preference_model.append(prog)
    

In [15]:
# Score the first of the three CoT programs on the test split.
eval_fn(ensemble_cot_preference_model[0])

Average Metric: 352 / 500  (70.4): 100%|██████████| 500/500 [00:00<00:00, 633.95it/s]

Average Metric: 352 / 500  (70.4%)





70.4

# MIPRO (Multi-prompt Instruction Proposal Optimizer)

https://twitter.com/kristahopsalong/status/1766166198079889737

* Takes a teacher LLM and a student LLM, a dataset, and a metric.
* Multi-stage optimization
    * The teacher looks at inputs and outputs and summarizes the data
    * The student generates good traces for a few inputs where the metric is satisfied
    * The teacher creates instructions given the good traces and the summary of the data
    * Uses some form of Bayesian optimization to search over instructions and examples


In [17]:
# Only runs when COMPILE_MIPRO is set; otherwise this cell does nothing and
# `mipro_preference_model` is not defined here.
if COMPILE_MIPRO:
    mipro = MIPRO(metric=em_metric, init_temperature=1.0, num_candidates=8)
    # Options passed through to the evaluation step via `eval_kwargs`.
    kwargs = {"num_threads": NUM_THREADS, "display_progress": True, "display_table": 4}
    mipro_preference_model = mipro.compile(
        student=PrefPredict(),
        trainset=train,
        num_trials=20,
        max_bootstrapped_demos=4,
        max_labeled_demos=4,
        eval_kwargs=kwargs,
        requires_permission_to_run=False,
    )
    mipro_preference_model.save("prompts/mipro_preference_model.json")



Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Task Model: [94m[1m1401[0m[93m examples in dev set * [94m[1m20[0m[93m trials * [94m[1m# of LM calls in your program[0m[93m = ([94m[1m28020 * # of LM calls in your program[0m[93m) task model calls[0m
[93m- Prompt Model: # data summarizer calls (max [94m[1m10[0m[93m) + [94m[1m8[0m[93m * [94m[1m1[0m[93m lm calls in program = [94m[1m18[0m[93m prompt model calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of calls to prompt model * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).[0m

For a preliminary estimate of potential cos

  1%|          | 10/1401 [00:00<00:01, 722.42it/s]


Bootstrapped 4 full traces after 11 examples in round 0.


  0%|          | 4/1401 [00:00<00:01, 703.62it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


  0%|          | 6/1401 [00:00<00:01, 756.82it/s]


Bootstrapped 4 full traces after 7 examples in round 0.


  0%|          | 4/1401 [00:00<00:01, 766.01it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


  0%|          | 5/1401 [00:00<00:01, 766.95it/s]


Bootstrapped 4 full traces after 6 examples in round 0.


  0%|          | 7/1401 [00:00<00:01, 730.68it/s]


Bootstrapped 4 full traces after 8 examples in round 0.


  1%|          | 8/1401 [00:00<00:01, 739.51it/s]


Bootstrapped 4 full traces after 9 examples in round 0.


[I 2024-05-02 16:36:14,415] A new study created in memory with name: no-name-cdfbd658-b54f-4702-945d-1fd524677c41


Starting trial #0


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:07<00:00, 13.50it/s]


Average Metric: 65 / 100  (65.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:07<00:00, 13.91it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:07<00:00, 13.79it/s]


Average Metric: 71 / 100  (71.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:07<00:00, 13.85it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:07<00:00, 14.11it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 62 / 100  (62.0): 100%|██████████| 100/100 [00:07<00:00, 13.99it/s]


Average Metric: 62 / 100  (62.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:07<00:00, 13.89it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:07<00:00, 13.75it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:07<00:00, 14.15it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:07<00:00, 14.01it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 64 / 100  (64.0): 100%|██████████| 100/100 [00:07<00:00, 13.74it/s]


Average Metric: 64 / 100  (64.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:07<00:00, 13.92it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:07<00:00, 13.88it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:08<00:00, 11.21it/s]


Average Metric: 63 / 100  (63.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 11.70it/s]
[I 2024-05-02 16:37:57,350] Trial 0 finished with value: 67.38044254104211 and parameters: {'140734164980576_predictor_instruction': 1, '140734164980576_predictor_demos': 3}. Best is trial 0 with value: 67.38044254104211.


Average Metric: 1 / 1  (100.0%)
Starting trial #1


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:07<00:00, 12.78it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:07<00:00, 13.03it/s]


Average Metric: 56 / 100  (56.0%)


Average Metric: 48 / 100  (48.0): 100%|██████████| 100/100 [00:06<00:00, 14.44it/s]


Average Metric: 48 / 100  (48.0%)


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:06<00:00, 14.38it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 45 / 100  (45.0): 100%|██████████| 100/100 [00:07<00:00, 13.94it/s]


Average Metric: 45 / 100  (45.0%)


Average Metric: 50 / 100  (50.0): 100%|██████████| 100/100 [00:07<00:00, 14.17it/s]


Average Metric: 50 / 100  (50.0%)


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:07<00:00, 13.80it/s]


Average Metric: 51 / 100  (51.0%)


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:07<00:00, 13.94it/s]


Average Metric: 53 / 100  (53.0%)


Average Metric: 52 / 100  (52.0): 100%|██████████| 100/100 [00:08<00:00, 12.01it/s]


Average Metric: 52 / 100  (52.0%)


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:08<00:00, 12.07it/s]


Average Metric: 58 / 100  (58.0%)


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:07<00:00, 12.88it/s]


Average Metric: 53 / 100  (53.0%)


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:06<00:00, 14.88it/s]


Average Metric: 56 / 100  (56.0%)


Average Metric: 44 / 100  (44.0): 100%|██████████| 100/100 [00:07<00:00, 12.88it/s]


Average Metric: 44 / 100  (44.0%)


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:07<00:00, 13.42it/s]


Average Metric: 53 / 100  (53.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 11.67it/s]
[I 2024-05-02 16:39:42,061] Trial 1 finished with value: 51.962883654532476 and parameters: {'140734164980576_predictor_instruction': 4, '140734164980576_predictor_demos': 1}. Best is trial 0 with value: 67.38044254104211.


Average Metric: 1 / 1  (100.0%)
Starting trial #2


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:06<00:00, 15.14it/s]


Average Metric: 53 / 100  (53.0%)


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:06<00:00, 15.23it/s]


Average Metric: 51 / 100  (51.0%)


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:06<00:00, 15.56it/s]


Average Metric: 61 / 100  (61.0%)


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:07<00:00, 13.25it/s]


Average Metric: 58 / 100  (58.0%)


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:07<00:00, 12.81it/s]


Average Metric: 61 / 100  (61.0%)


Average Metric: 50 / 100  (50.0): 100%|██████████| 100/100 [00:07<00:00, 13.90it/s]


Average Metric: 50 / 100  (50.0%)


Average Metric: 57 / 100  (57.0): 100%|██████████| 100/100 [00:08<00:00, 12.02it/s]


Average Metric: 57 / 100  (57.0%)


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:07<00:00, 14.03it/s]


Average Metric: 58 / 100  (58.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:07<00:00, 12.86it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:06<00:00, 14.62it/s]


Average Metric: 51 / 100  (51.0%)


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:06<00:00, 15.24it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:06<00:00, 15.10it/s]


Average Metric: 56 / 100  (56.0%)


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:07<00:00, 14.17it/s]


Average Metric: 58 / 100  (58.0%)


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:07<00:00, 12.72it/s]


Average Metric: 56 / 100  (56.0%)


Average Metric: 0 / 1  (0.0): 100%|██████████| 1/1 [00:00<00:00, 12.34it/s]
[I 2024-05-02 16:41:22,732] Trial 2 finished with value: 55.96002855103497 and parameters: {'140734164980576_predictor_instruction': 2, '140734164980576_predictor_demos': 3}. Best is trial 0 with value: 67.38044254104211.


Average Metric: 0 / 1  (0.0%)
Starting trial #3


Average Metric: 47 / 100  (47.0): 100%|██████████| 100/100 [00:06<00:00, 16.63it/s]


Average Metric: 47 / 100  (47.0%)


Average Metric: 45 / 100  (45.0): 100%|██████████| 100/100 [00:06<00:00, 16.18it/s]


Average Metric: 45 / 100  (45.0%)


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:07<00:00, 14.17it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 52 / 100  (52.0): 100%|██████████| 100/100 [00:06<00:00, 15.77it/s]


Average Metric: 52 / 100  (52.0%)


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:06<00:00, 16.12it/s]


Average Metric: 56 / 100  (56.0%)


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:06<00:00, 16.20it/s]


Average Metric: 51 / 100  (51.0%)


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:07<00:00, 13.99it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 56 / 100  (56.0): 100%|██████████| 100/100 [00:05<00:00, 19.61it/s]


Average Metric: 56 / 100  (56.0%)


Average Metric: 55 / 100  (55.0): 100%|██████████| 100/100 [00:05<00:00, 17.50it/s]


Average Metric: 55 / 100  (55.0%)


Average Metric: 49 / 100  (49.0): 100%|██████████| 100/100 [00:05<00:00, 18.92it/s]


Average Metric: 49 / 100  (49.0%)


Average Metric: 57 / 100  (57.0): 100%|██████████| 100/100 [00:06<00:00, 16.48it/s]


Average Metric: 57 / 100  (57.0%)


Average Metric: 52 / 100  (52.0): 100%|██████████| 100/100 [00:06<00:00, 14.69it/s]


Average Metric: 52 / 100  (52.0%)


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:05<00:00, 17.42it/s]


Average Metric: 59 / 100  (59.0%)


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:05<00:00, 17.08it/s]


Average Metric: 59 / 100  (59.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 13.96it/s]
[I 2024-05-02 16:42:48,757] Trial 3 finished with value: 53.319057815845824 and parameters: {'140734164980576_predictor_instruction': 2, '140734164980576_predictor_demos': 6}. Best is trial 0 with value: 67.38044254104211.


Average Metric: 1 / 1  (100.0%)
Starting trial #4


Average Metric: 43 / 100  (43.0): 100%|██████████| 100/100 [00:09<00:00, 10.97it/s]


Average Metric: 43 / 100  (43.0%)


Average Metric: 38 / 100  (38.0): 100%|██████████| 100/100 [00:08<00:00, 12.32it/s]


Average Metric: 38 / 100  (38.0%)


Average Metric: 57 / 100  (57.0): 100%|██████████| 100/100 [00:08<00:00, 11.60it/s]


Average Metric: 57 / 100  (57.0%)


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:08<00:00, 11.93it/s]


Average Metric: 53 / 100  (53.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:07<00:00, 12.92it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 46 / 100  (46.0): 100%|██████████| 100/100 [00:07<00:00, 13.18it/s]


Average Metric: 46 / 100  (46.0%)


Average Metric: 52 / 100  (52.0): 100%|██████████| 100/100 [01:25<00:00,  1.17it/s]


Average Metric: 52 / 100  (52.0%)


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:07<00:00, 12.89it/s]


Average Metric: 51 / 100  (51.0%)


Average Metric: 50 / 100  (50.0): 100%|██████████| 100/100 [00:07<00:00, 13.28it/s]


Average Metric: 50 / 100  (50.0%)


Average Metric: 40 / 100  (40.0): 100%|██████████| 100/100 [00:07<00:00, 13.28it/s]


Average Metric: 40 / 100  (40.0%)


Average Metric: 45 / 100  (45.0): 100%|██████████| 100/100 [00:07<00:00, 13.34it/s]


Average Metric: 45 / 100  (45.0%)


Average Metric: 51 / 100  (51.0): 100%|██████████| 100/100 [00:07<00:00, 13.31it/s]


Average Metric: 51 / 100  (51.0%)


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:07<00:00, 13.29it/s]


Average Metric: 53 / 100  (53.0%)


Average Metric: 49 / 100  (49.0): 100%|██████████| 100/100 [00:07<00:00, 13.04it/s]


Average Metric: 49 / 100  (49.0%)


Average Metric: 0 / 1  (0.0): 100%|██████████| 1/1 [00:00<00:00, 10.77it/s]
[I 2024-05-02 16:45:57,136] Trial 4 finished with value: 49.10778015703069 and parameters: {'140734164980576_predictor_instruction': 5, '140734164980576_predictor_demos': 1}. Best is trial 0 with value: 67.38044254104211.


Average Metric: 0 / 1  (0.0%)
Starting trial #5


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:01<00:00, 75.14it/s]


Average Metric: 63 / 100  (63.0%)


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:01<00:00, 76.35it/s]


Average Metric: 61 / 100  (61.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:01<00:00, 72.05it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:01<00:00, 76.54it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:01<00:00, 77.10it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:01<00:00, 71.33it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:01<00:00, 59.86it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:01<00:00, 77.32it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:01<00:00, 75.82it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:01<00:00, 81.25it/s]


Average Metric: 59 / 100  (59.0%)


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:01<00:00, 66.54it/s]


Average Metric: 63 / 100  (63.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:01<00:00, 76.57it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:01<00:00, 73.06it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:01<00:00, 72.92it/s]


Average Metric: 65 / 100  (65.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 25.13it/s]
[I 2024-05-02 16:46:16,524] Trial 5 finished with value: 64.09707351891507 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 0}. Best is trial 0 with value: 67.38044254104211.


Average Metric: 1 / 1  (100.0%)
Starting trial #6


Average Metric: 0 / 100  (0.0): 100%|██████████| 100/100 [00:02<00:00, 41.84it/s]
[I 2024-05-02 16:46:18,935] Trial 6 pruned. 


Average Metric: 0 / 100  (0.0%)
Trial pruned.
Starting trial #7


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:06<00:00, 15.88it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 78 / 100  (78.0): 100%|██████████| 100/100 [00:06<00:00, 16.51it/s]


Average Metric: 78 / 100  (78.0%)


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:06<00:00, 16.27it/s]


Average Metric: 71 / 100  (71.0%)


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:06<00:00, 16.26it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:06<00:00, 15.55it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:05<00:00, 16.75it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:06<00:00, 15.74it/s]


Average Metric: 71 / 100  (71.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:06<00:00, 16.54it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:06<00:00, 16.30it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:06<00:00, 16.22it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:06<00:00, 16.55it/s]


Average Metric: 71 / 100  (71.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:06<00:00, 16.21it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:06<00:00, 16.29it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 62 / 100  (62.0): 100%|██████████| 100/100 [00:05<00:00, 16.69it/s]


Average Metric: 62 / 100  (62.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 13.44it/s]
[I 2024-05-02 16:47:45,451] Trial 7 finished with value: 70.378301213419 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 1}. Best is trial 7 with value: 70.378301213419.


Average Metric: 1 / 1  (100.0%)
Starting trial #8


Average Metric: 46 / 100  (46.0): 100%|██████████| 100/100 [00:06<00:00, 14.43it/s]
[I 2024-05-02 16:47:52,403] Trial 8 pruned. 


Average Metric: 46 / 100  (46.0%)
Trial pruned.
Starting trial #9


Average Metric: 48 / 100  (48.0): 100%|██████████| 100/100 [00:11<00:00,  8.93it/s]
[I 2024-05-02 16:48:03,629] Trial 9 pruned. 


Average Metric: 48 / 100  (48.0%)
Trial pruned.
Starting trial #10


Average Metric: 46 / 100  (46.0): 100%|██████████| 100/100 [00:07<00:00, 13.59it/s]
[I 2024-05-02 16:48:11,016] Trial 10 pruned. 


Average Metric: 46 / 100  (46.0%)
Trial pruned.
Starting trial #11


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:06<00:00, 15.04it/s]


Average Metric: 63 / 100  (63.0%)


Average Metric: 71 / 100  (71.0): 100%|██████████| 100/100 [00:06<00:00, 14.97it/s]


Average Metric: 71 / 100  (71.0%)


Average Metric: 62 / 100  (62.0): 100%|██████████| 100/100 [00:06<00:00, 15.31it/s]


Average Metric: 62 / 100  (62.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:06<00:00, 15.81it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:06<00:00, 15.64it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 54 / 100  (54.0): 100%|██████████| 100/100 [00:06<00:00, 14.80it/s]


Average Metric: 54 / 100  (54.0%)


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:07<00:00, 13.21it/s]


Average Metric: 65 / 100  (65.0%)


Average Metric: 65 / 100  (65.0): 100%|██████████| 100/100 [00:07<00:00, 13.83it/s]


Average Metric: 65 / 100  (65.0%)


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:07<00:00, 12.59it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:07<00:00, 14.28it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 63 / 100  (63.0): 100%|██████████| 100/100 [00:07<00:00, 13.54it/s]


Average Metric: 63 / 100  (63.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:06<00:00, 14.48it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 59 / 100  (59.0): 100%|██████████| 100/100 [00:06<00:00, 15.62it/s]


Average Metric: 59 / 100  (59.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:06<00:00, 15.62it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 12.71it/s]
[I 2024-05-02 16:49:47,549] Trial 11 finished with value: 64.66809421841542 and parameters: {'140734164980576_predictor_instruction': 1, '140734164980576_predictor_demos': 4}. Best is trial 7 with value: 70.378301213419.


Average Metric: 1 / 1  (100.0%)
Starting trial #12


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:07<00:00, 13.55it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 84 / 100  (84.0): 100%|██████████| 100/100 [00:07<00:00, 14.04it/s]


Average Metric: 84 / 100  (84.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:07<00:00, 14.06it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:07<00:00, 14.07it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:07<00:00, 14.11it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:06<00:00, 14.30it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:09<00:00, 10.93it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:07<00:00, 13.64it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:08<00:00, 12.37it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:08<00:00, 12.20it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:07<00:00, 12.69it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:06<00:00, 14.74it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:07<00:00, 13.95it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:06<00:00, 14.57it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 11.83it/s]
[I 2024-05-02 16:51:32,184] Trial 12 finished with value: 71.59172019985725 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 7}. Best is trial 12 with value: 71.59172019985725.


Average Metric: 1 / 1  (100.0%)
Starting trial #13


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:00<00:00, 728.24it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 84 / 100  (84.0): 100%|██████████| 100/100 [00:00<00:00, 726.07it/s]


Average Metric: 84 / 100  (84.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 724.95it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 736.14it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:00<00:00, 710.33it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 748.62it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 727.32it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 745.45it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 737.70it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 742.24it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 728.09it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:00<00:00, 736.05it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:00<00:00, 699.08it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 752.01it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 349.15it/s]
[I 2024-05-02 16:51:34,298] Trial 13 finished with value: 71.59172019985725 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 7}. Best is trial 12 with value: 71.59172019985725.


Average Metric: 1 / 1  (100.0%)
Starting trial #14


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:00<00:00, 767.09it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 84 / 100  (84.0): 100%|██████████| 100/100 [00:00<00:00, 783.90it/s]


Average Metric: 84 / 100  (84.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 808.50it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 783.50it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:00<00:00, 793.97it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 789.16it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 811.85it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 805.12it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 809.14it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 790.47it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 811.06it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:00<00:00, 800.62it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:00<00:00, 793.97it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 784.56it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 636.37it/s]
[I 2024-05-02 16:51:36,289] Trial 14 finished with value: 71.59172019985725 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 7}. Best is trial 12 with value: 71.59172019985725.


Average Metric: 1 / 1  (100.0%)
Starting trial #15


Average Metric: 58 / 100  (58.0): 100%|██████████| 100/100 [00:08<00:00, 11.46it/s]
[I 2024-05-02 16:51:45,039] Trial 15 pruned. 


Average Metric: 58 / 100  (58.0%)
Trial pruned.
Starting trial #16


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:00<00:00, 799.09it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 84 / 100  (84.0): 100%|██████████| 100/100 [00:00<00:00, 787.51it/s]


Average Metric: 84 / 100  (84.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 785.14it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 765.94it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:00<00:00, 785.84it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 777.27it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 732.71it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 793.11it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 780.22it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 756.78it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 806.43it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:00<00:00, 803.87it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:00<00:00, 787.38it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 806.33it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 427.64it/s]
[I 2024-05-02 16:51:47,168] Trial 16 finished with value: 71.59172019985725 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 7}. Best is trial 12 with value: 71.59172019985725.


Average Metric: 1 / 1  (100.0%)
Starting trial #17


Average Metric: 70 / 100  (70.0): 100%|██████████| 100/100 [00:00<00:00, 780.70it/s]


Average Metric: 70 / 100  (70.0%)


Average Metric: 84 / 100  (84.0): 100%|██████████| 100/100 [00:00<00:00, 806.87it/s]


Average Metric: 84 / 100  (84.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 826.89it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 825.68it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 69 / 100  (69.0): 100%|██████████| 100/100 [00:00<00:00, 825.39it/s]


Average Metric: 69 / 100  (69.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 823.95it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 802.24it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:00<00:00, 774.87it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 805.61it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 74 / 100  (74.0): 100%|██████████| 100/100 [00:00<00:00, 806.12it/s]


Average Metric: 74 / 100  (74.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:00<00:00, 803.54it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:00<00:00, 784.16it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 60 / 100  (60.0): 100%|██████████| 100/100 [00:00<00:00, 782.90it/s]


Average Metric: 60 / 100  (60.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:00<00:00, 728.75it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 710.42it/s]
[I 2024-05-02 16:51:49,142] Trial 17 finished with value: 71.59172019985725 and parameters: {'140734164980576_predictor_instruction': 0, '140734164980576_predictor_demos': 7}. Best is trial 12 with value: 71.59172019985725.


Average Metric: 1 / 1  (100.0%)
Starting trial #18


Average Metric: 53 / 100  (53.0): 100%|██████████| 100/100 [00:08<00:00, 11.97it/s]
[I 2024-05-02 16:51:57,517] Trial 18 pruned. 


Average Metric: 53 / 100  (53.0%)
Trial pruned.
Starting trial #19


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:06<00:00, 15.56it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:06<00:00, 16.64it/s]


Average Metric: 76 / 100  (76.0%)


Average Metric: 73 / 100  (73.0): 100%|██████████| 100/100 [00:06<00:00, 16.57it/s]


Average Metric: 73 / 100  (73.0%)


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:06<00:00, 16.58it/s]


Average Metric: 76 / 100  (76.0%)


Average Metric: 67 / 100  (67.0): 100%|██████████| 100/100 [00:06<00:00, 15.90it/s]


Average Metric: 67 / 100  (67.0%)


Average Metric: 61 / 100  (61.0): 100%|██████████| 100/100 [00:05<00:00, 16.86it/s]


Average Metric: 61 / 100  (61.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:06<00:00, 16.04it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 76 / 100  (76.0): 100%|██████████| 100/100 [00:06<00:00, 16.00it/s]


Average Metric: 76 / 100  (76.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:06<00:00, 15.77it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:06<00:00, 16.38it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 66 / 100  (66.0): 100%|██████████| 100/100 [00:06<00:00, 16.55it/s]


Average Metric: 66 / 100  (66.0%)


Average Metric: 72 / 100  (72.0): 100%|██████████| 100/100 [00:06<00:00, 16.35it/s]


Average Metric: 72 / 100  (72.0%)


Average Metric: 68 / 100  (68.0): 100%|██████████| 100/100 [00:06<00:00, 16.57it/s]


Average Metric: 68 / 100  (68.0%)


Average Metric: 75 / 100  (75.0): 100%|██████████| 100/100 [00:06<00:00, 15.77it/s]


Average Metric: 75 / 100  (75.0%)


Average Metric: 1 / 1  (100.0): 100%|██████████| 1/1 [00:00<00:00, 12.96it/s]
[I 2024-05-02 16:53:24,067] Trial 19 finished with value: 71.94860813704497 and parameters: {'140734164980576_predictor_instruction': 3, '140734164980576_predictor_demos': 6}. Best is trial 19 with value: 71.94860813704497.


Average Metric: 1 / 1  (100.0%)
Returning preference = Predict(StringSignature(instruction, text_1, text_2 -> preferred_text
    instructions='Produce a coherent and well-structured response by combining the provided text fragments, using the necessary punctuation and grammar to ensure clarity and readability, and tailoring the tone and language to the target audience. Use any relevant data formats or specifications to support your answer.'
    instruction = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Instruction:', 'desc': '${instruction}'})
    text_1 = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Text 1:', 'desc': '${text_1}'})
    text_2 = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Text 2:', 'desc': '${text_2}'})
    preferred_text = Field(annotation=str required=True json_schema_extra={'desc': 'Only return the preferred text (1 or

In [29]:
# Evaluate the MIPRO-optimised preference model with `eval_fn` (defined in an
# earlier cell). The recorded output for this cell reports the average metric
# and the final score.
eval_fn(mipro_preference_model)

Average Metric: 346 / 500  (69.2): 100%|██████████| 500/500 [00:00<00:00, 556.51it/s]


Average Metric: 346 / 500  (69.2%)


69.2

In [27]:
# Reload the saved MIPRO prompt from disk into a fresh PrefPredict module.
# `load(path)` restores the saved state into the instance *in place* and does
# not return the module, so binding its return value would leave
# `mipro_preference_model2` set to None. Load first, then bind the name to the
# module object itself.
prog = PrefPredict()
prog.load('prompts/mipro_preference_model.json')
# Both names refer to the same loaded module; `prog` is kept so any later cell
# that uses it keeps working.
mipro_preference_model2 = prog

In [25]:
# NOTE(review): this evaluates `mipro_preference_model` (the in-memory model),
# not `mipro_preference_model2`, which the previous cell loads from
# 'prompts/mipro_preference_model.json'. Presumably the reloaded model was the
# intended argument -- TODO confirm. The output recorded for this cell is a
# TypeError from `BaseModule.load_state()`, and the execution counts of these
# cells are out of order, so re-run from a fresh kernel before trusting it.
eval_fn(mipro_preference_model)

TypeError: BaseModule.load_state() missing 1 required positional argument: 'state'