В этом ноутбуке представлен эксперимент, цель которого — проверить значимость байесовской оптимизации (БО) в задаче автоматической оптимизации промптов (APO).

In [None]:
!pip install --root-user-action=ignore --upgrade pip
!pip install --root-user-action=ignore scikit-optimize transformers datasets accelerate
!pip install --root-user-action=ignore ipywidgets

In [4]:
import time
import torch
import random
import numpy as np
from sklearn.metrics import f1_score
from skopt import gp_minimize
from skopt.space import Integer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from accelerate import Accelerator
from itertools import chain

In [5]:
# Create the Accelerate helper and take the compute device it reports; the
# device is printed so the run records which hardware was used.
accelerator = Accelerator()
device = accelerator.device
print(device)

cuda


In [None]:
# Checkpoint under test; the commented-out names are alternatives that can be swapped in.
model_name = "Qwen/Qwen3-0.6B"
# model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # 66M parameters
# model_name = "facebook/opt-6.7b"  # 6.7B parameters
# model_name = "EleutherAI/gpt-j-6B"  # 6B parameters

# Load the matching tokenizer and a sequence-classification model placed on `device`.
# NOTE(review): presumably this checkpoint ships without a trained classification
# head, in which case the head is freshly initialised on load — confirm, since it
# affects how the F1 scores below should be interpreted.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Put the model into evaluation mode; as the last expression of the cell, the
# returned module is also what the notebook displays.
model.eval()

In [34]:
# Search configuration.
# Upper bound for candidate token ids: each prompt position is searched over [0, VOCAB_SIZE - 1].
VOCAB_SIZE = tokenizer.vocab_size
# Prompt lengths (number of optimized token ids) to run the experiment for.
ARR_PROMPT_LENGTH = [5, 10, 15, 20]
# Iteration checkpoints at which results are recorded; the largest one is the optimizer budget.
ARR_NUM_ITERATIONS = [20, 50, 70, 100]
# Number of repeated optimization runs per prompt length (time and score are averaged over them).
NUM_POINTS = 5
# Reduced configuration for quick trial runs:
# ARR_PROMPT_LENGTH = [5, 10]
# ARR_NUM_ITERATIONS = [5, 10]
# NUM_POINTS = 1

In [None]:
# Load IMDB and keep a 200-example evaluation subset of the test split:
# rows 0-99 plus rows 24900-24999.
# NOTE(review): presumably the test split has 25,000 rows ordered by label, so this
# picks 100 examples of each class — verify against the dataset.
dataset = load_dataset("imdb")
test_dataset = dataset["test"].select(chain(range(100), range(24900, 25000)))

In [11]:
# Module-level trackers; evaluate_prompt appends one entry to each list per call.
ARR_TIME = []  # time.time() stamp taken at the end of each evaluation
ARR_BEST_PROMPT = []  # best full prompt seen so far, recorded after each evaluation
ARR_BEST_SCORE = []  # best F1 seen so far, recorded after each evaluation

best_prompt = ""  # full prompt text that achieved max_score
max_score = -1  # best F1 so far; starts below any possible F1 so the first score is accepted

base_prompt = "Review:"  # fixed text placed after the optimized prefix and before the review

def evaluate_prompt(prompt_indices):
    """Score a candidate prompt prefix by F1 on ``test_dataset``.

    The token ids in ``prompt_indices`` are converted to tokens and then to a
    string, ``base_prompt`` is appended after a space, and the result is put in
    front of every review text. Each example is classified on its own (inputs
    truncated to 512 tokens) by taking the argmax over the model logits.

    Side effects: updates the module-level ``best_prompt`` / ``max_score`` when
    the new score is strictly better, and appends the current best prompt, the
    current best score and a ``time.time()`` stamp to ``ARR_BEST_PROMPT``,
    ``ARR_BEST_SCORE`` and ``ARR_TIME``.

    Args:
        prompt_indices: sequence of token ids forming the prompt prefix.

    Returns:
        The F1 score of the predictions against the dataset labels.
    """
    global ARR_TIME, ARR_BEST_PROMPT, ARR_BEST_SCORE, best_prompt, max_score, base_prompt

    # ids -> tokens -> text, followed by the fixed base prompt.
    decoded_prefix = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(prompt_indices)
    )
    full_prompt = f"{decoded_prefix} {base_prompt}"

    predictions, targets = [], []

    # NOTE: examples are scored one at a time; no batching/padding is used here.
    with torch.no_grad():
        for sample in test_dataset:
            encoded = tokenizer(
                f"{full_prompt} {sample['text']}",
                return_tensors="pt",
                truncation=True,
                max_length=512,
            ).to(device)
            logits = model(**encoded).logits
            predictions.append(logits.argmax(-1).item())
            targets.append(sample["label"])

    score = f1_score(targets, predictions)

    # Keep the running best; ties do not replace the earlier prompt.
    if score > max_score:
        best_prompt, max_score = full_prompt, score

    # One history entry per call, so index k holds the state after k + 1 evaluations.
    ARR_BEST_PROMPT.append(best_prompt)
    ARR_BEST_SCORE.append(max_score)
    ARR_TIME.append(time.time())

    return score

In [12]:
# Experiment 1: no optimization — evaluate a fixed prefix of five tokens with
# id 0 once and time that single evaluation as the baseline.
start_time = time.time()
base_f1 = evaluate_prompt([0] * 5)
base_time = time.time() - start_time
print(f"Результат: F1 = {base_f1:.4f}, Время: {base_time:.2f} сек")

Результат: F1 = 0.3625, Время: 12.47 сек


In [13]:
# Objective function for the optimizer
def objective(prompt_indices):
    """Return the negated F1 of ``prompt_indices`` so that minimizing it maximizes F1."""
    return -evaluate_prompt(prompt_indices)

In [14]:
# Experiment with optimization: for every prompt length, run gp_minimize
# NUM_POINTS times and aggregate the results at each iteration checkpoint.
# Result layout: RES_X[prompt_length][n_iterations].
RES_TIME = dict()    # mean seconds elapsed until the checkpoint was reached
RES_PROMPT = dict()  # best-so-far prompt at the checkpoint (key 0: first evaluated prompt)
RES_SCORE = dict()   # mean best-so-far F1 at the checkpoint

# A single optimizer run with the largest budget serves all smaller checkpoints.
MAX_NUM_ITERATIONS = max(ARR_NUM_ITERATIONS)

for PROMPT_LENGTH in ARR_PROMPT_LENGTH:
    RES_TIME[PROMPT_LENGTH] = dict()
    RES_PROMPT[PROMPT_LENGTH] = dict()
    RES_SCORE[PROMPT_LENGTH] = dict()

    # Initialise every cell so the accumulation below can use "+=".
    RES_PROMPT[PROMPT_LENGTH][0] = ""
    for NUM_ITERATIONS in ARR_NUM_ITERATIONS:
        RES_PROMPT[PROMPT_LENGTH][NUM_ITERATIONS] = ""

        RES_TIME[PROMPT_LENGTH][NUM_ITERATIONS] = 0
        RES_SCORE[PROMPT_LENGTH][NUM_ITERATIONS] = 0

    # Search space: one integer token id per prompt position
    space = [Integer(0, VOCAB_SIZE-1) for _ in range(PROMPT_LENGTH)]

    for i in range(NUM_POINTS):
        # Reset the module-level trackers that evaluate_prompt appends to, so
        # list index k corresponds to evaluation k + 1 of this repeat only.
        ARR_TIME = []
        ARR_BEST_PROMPT = []
        ARR_BEST_SCORE = []

        best_prompt = ""
        max_score = -1

        start_time = time.time()

        # NOTE(review): the random state is drawn from the unseeded `random`
        # module (no seed is set in the visible cells), so runs are not
        # reproducible.
        res = gp_minimize(
            func=objective,
            dimensions=space,
            n_calls=MAX_NUM_ITERATIONS,
            # gp_minimize rejects n_calls < n_initial_points, so the number of
            # random start points is capped by the budget; this keeps reduced
            # configurations (fewer than 10 iterations) runnable.
            n_initial_points=min(10, MAX_NUM_ITERATIONS),
            random_state=random.randint(0, 1000),
            verbose=True
        )

        # NOTE(review): RES_PROMPT is overwritten on every repeat, so after the
        # loop it holds the prompts of the last repeat only, whereas RES_TIME
        # and RES_SCORE are averaged over all repeats.
        RES_PROMPT[PROMPT_LENGTH][0] = ARR_BEST_PROMPT[0]
        for NUM_ITERATIONS in ARR_NUM_ITERATIONS:
            RES_PROMPT[PROMPT_LENGTH][NUM_ITERATIONS] = ARR_BEST_PROMPT[NUM_ITERATIONS - 1]

            # Accumulate elapsed time and best score at this checkpoint.
            RES_TIME[PROMPT_LENGTH][NUM_ITERATIONS] += ARR_TIME[NUM_ITERATIONS - 1] - start_time
            RES_SCORE[PROMPT_LENGTH][NUM_ITERATIONS] += ARR_BEST_SCORE[NUM_ITERATIONS - 1]

    # Turn the accumulated sums into means over the repeats.
    for NUM_ITERATIONS in ARR_NUM_ITERATIONS:
        RES_TIME[PROMPT_LENGTH][NUM_ITERATIONS] *= 1.0 / NUM_POINTS
        RES_SCORE[PROMPT_LENGTH][NUM_ITERATIONS] *= 1.0 / NUM_POINTS

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 12.0733
Function value obtained: -0.3077
Current minimum: -0.3077
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 12.0003
Function value obtained: -0.3448
Current minimum: -0.3448
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 11.9703
Function value obtained: -0.2609
Current minimum: -0.3448
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 11.9238
Function value obtained: -0.3750
Current minimum: -0.3750
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 11.9368
Function value obtained: -0.2556
Current minimum: -0.3750
Iteration No: 6 star

In [15]:
# Table of mean optimization time in seconds: one row per prompt length,
# one column per iteration checkpoint.
print("Зависимость времени (с) оптимизации от количества итераций с определенной длиной промпта:")

# Header row followed by a separator line.
print(
    f"{'Prompt len / Iterations':<23}"
    + "".join(f" | {n_iter:<10}" for n_iter in ARR_NUM_ITERATIONS)
)
print("-" * 80)

# One left-aligned row per prompt length, values with two decimals.
for PROMPT_LENGTH in ARR_PROMPT_LENGTH:
    print(
        f"{PROMPT_LENGTH:<23}"
        + "".join(
            f" | {RES_TIME[PROMPT_LENGTH][n_iter]:<10.2f}"
            for n_iter in ARR_NUM_ITERATIONS
        )
    )

Зависимость времени (с) оптимизации от количества итераций с определенной длиной промпта:
Prompt len / Iterations | 20         | 50         | 70         | 100       
--------------------------------------------------------------------------------
5                       | 244.21     | 624.23     | 877.89     | 1265.93   
10                      | 246.89     | 635.29     | 902.71     | 1319.72   
15                      | 250.44     | 640.91     | 907.97     | 1354.13   
20                      | 254.62     | 654.50     | 927.58     | 1407.92   


In [16]:
# Table of mean best F1-score: one row per prompt length, one column per
# iteration checkpoint.
print("Зависимость метрики F1-score от количества итераций с определенной длиной промпта:")

# Header row followed by a separator line.
print(
    f"{'Prompt len / Iterations':<23}"
    + "".join(f" | {n_iter:<10}" for n_iter in ARR_NUM_ITERATIONS)
)
print("-" * 80)

# One left-aligned row per prompt length, values with four decimals.
for PROMPT_LENGTH in ARR_PROMPT_LENGTH:
    print(
        f"{PROMPT_LENGTH:<23}"
        + "".join(
            f" | {RES_SCORE[PROMPT_LENGTH][n_iter]:<10.4f}"
            for n_iter in ARR_NUM_ITERATIONS
        )
    )

Зависимость метрики F1-score от количества итераций с определенной длиной промпта:
Prompt len / Iterations | 20         | 50         | 70         | 100       
--------------------------------------------------------------------------------
5                       | 0.4396     | 0.4900     | 0.4900     | 0.4900    
10                      | 0.3879     | 0.3879     | 0.3879     | 0.4242    
15                      | 0.4481     | 0.4481     | 0.4481     | 0.4481    
20                      | 0.4455     | 0.4455     | 0.4455     | 0.4505    


In [17]:
# List, for every prompt length, the first evaluated prompt (key 0) and the
# prompt stored for each iteration checkpoint.
print("Зависимость итогового промпта, с определенной длиной, от количества итераций:")

for PROMPT_LENGTH in ARR_PROMPT_LENGTH:
    # All lines of one section are emitted by a single print, joined by newlines;
    # the last item carries an extra "\n" to leave a blank line between sections.
    print(
        f"Prompt with length {PROMPT_LENGTH}:",
        f"Base prompt - {RES_PROMPT[PROMPT_LENGTH][0]}",
        "-" * 80,
        *(
            f"Best prompt after {n_iter} iterations - {RES_PROMPT[PROMPT_LENGTH][n_iter]}"
            for n_iter in ARR_NUM_ITERATIONS
        ),
        "-" * 80 + "\n",
        sep="\n",
    )

Зависимость итогового промпта, с определенной длиной, от количества итераций:
Prompt with length 5:
Base prompt - xiNaN占据 же Mast Review:
--------------------------------------------------------------------------------
Best prompt after 20 iterations - емые_execution///////////////////////////////////////////////////////////////////////////////
 behaviours solicitud Review:
Best prompt after 50 iterations -  suicides Noble[from SKF烷 Review:
Best prompt after 70 iterations -  suicides Noble[from SKF烷 Review:
Best prompt after 100 iterations -  suicides Noble[from SKF烷 Review:
--------------------------------------------------------------------------------

Prompt with length 10:
Base prompt - ansk╝basenameStation(bp inser.GetInstance yetiştir🆖 nh Review:
--------------------------------------------------------------------------------
Best prompt after 20 iterations - 変IRCLE.SECONDS戴上 Satellite⽗长大了!	Element! Review:
Best prompt after 50 iterations - 変IRCLE.SECONDS戴上 Satellite⽗长大了!	Elemen