# Arguments

In [1]:
import pickle
import os
import random
import math
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from PIL import Image
import requests
import torch
import time

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
DEVICE_NUM = 0
# SAMPLE_SIZE = 24
# NUM_TRIALS = 1
DATASET_NAME = 'idoll_woman'
SORT_KEY = 'choice_rate'
MODEL_SIZE = '9B'

# Prepare datasets

In [3]:
root_dir = './'
datasets = {}

with open(os.path.join(root_dir, 'Dataset00.pkl'), 'rb') as f:
    idoll_man = pickle.load(f)
datasets['idoll_man'] = idoll_man

with open(os.path.join(root_dir, 'Dataset01.pkl'), 'rb') as f:
    idoll_woman = pickle.load(f)
datasets['idoll_woman'] = idoll_woman

with open(os.path.join(root_dir, 'Dataset02.pkl'), 'rb') as f:
    paintings = pickle.load(f)
datasets['paintings'] = paintings

# Sampling

In [4]:


def create_bins(sorted_list, sample_size):
    # Create bins from the sorted list
    bin_size = max(1, math.ceil(len(sorted_list) / sample_size))
    bins = [sorted_list[i:i + bin_size] for i in range(0, len(sorted_list), bin_size)]
    return bins

def sample_from_bins(bins):
    # Randomly select one element from each bin
    return [random.choice(bin) for bin in bins if bin]

def shuffle_samples_with_indices(samples):
    indexed_samples = list(enumerate(samples))
    random.shuffle(indexed_samples)
    shuffled_samples, indices = zip(*indexed_samples)
    return list(shuffled_samples), list(indices)

def sort_once_sample_shuffle_multiple_trials(tuple_list, sort_key, reverse, sample_size, trials):
    """
    Example usage
    
    tuple_list = [
        (1, "http://example.com", "classA", '10.00%', '15.00%'),
        (2, "http://example.org", "classB", '5.50%', '20.00%'),
        (3, "http://example.net", "classC", '8.75%', '12.00%'),
    Outputs = sort_once_sample_shuffle_multiple_trials(tuple_list, 'win_rate', True, 2, 3)

    Outputs: 
        [
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (1, 'http://example.com', 'classA', '10.00%', '15.00%')]),
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (1, 'http://example.com', 'classA', '10.00%', '15.00%')]),
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (3, 'http://example.net', 'classC', '8.75%', '12.00%')
        ]
    """
    if sort_key not in {'win_rate', 'choice_rate'}:
        raise ValueError("sort_key must be 'win_rate' or 'choice_rate'")

    if sample_size < 1 or sample_size > len(tuple_list):
        raise ValueError("sample_size must be between 1 and the length of tuple_list")

    # Function to convert percentage string to float
    def convert_to_float(percentage_str):
        return float(percentage_str.rstrip('%'))

    # Determine the index for win_rate or choice_rate in the tuple
    index = 3 if sort_key == 'win_rate' else 4

    # Sort the list of tuples based on the specified index
    sorted_list = sorted(tuple_list, key=lambda x: convert_to_float(x[index]), reverse=reverse)

    # Create bins from the sorted list
    bins = create_bins(sorted_list, sample_size)

    results = []
    for _ in range(trials):
        # Sample from the bins bfor each trial
        sample = sample_from_bins(bins)
        shuffled_sample, original_indices = shuffle_samples_with_indices(sample)
        results.append((shuffled_sample, original_indices))

    return results

# Prepare model

In [5]:


if MODEL_SIZE == '3B':
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
        tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
        cross_attn_every_n_layers=1
    )

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)
    model = model.to(DEVICE_NUM)
elif MODEL_SIZE == '9B':
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-7b",
        tokenizer_path="anas-awadalla/mpt-7b",
        cross_attn_every_n_layers=4
    )

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)
    # model = model.to(DEVICE_NUM)

Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:20<00:00,  6.77s/it]


Flamingo model initialized with 1384781840 trainable parameters


# Prepare inputs

In [14]:
NUM_CONTEXT = 35 # (2 * 24)
NUM_QUERY = 7 # ( 2 * 4)

sampled_results = sort_once_sample_shuffle_multiple_trials(datasets[DATASET_NAME], sort_key=SORT_KEY, reverse=True, 
                                                       sample_size=(NUM_CONTEXT + NUM_QUERY), trials=1)
print(sampled_results[0][0])
print(len(sampled_results[0][0]))
print(sampled_results[0])

[0, 33, 9, 2, 4, 11, 14, 30, 17, 18, 6, 3, 7, 25, 24, 16, 8, 32, 26, 23, 27, 29, 19, 1, 12, 5, 13, 22, 21, 10, 31, 15, 28, 20]
34
([0, 33, 9, 2, 4, 11, 14, 30, 17, 18, 6, 3, 7, 25, 24, 16, 8, 32, 26, 23, 27, 29, 19, 1, 12, 5, 13, 22, 21, 10, 31, 15, 28, 20], [('4', 'https://img.piku.co.kr/w/uploads/98Kx6M/3f5b7254b6bddd2707e427b4cf1721a6.jpg', '윈터(에스파)', '9.61%', '75.79%'), ('112', 'https://img.piku.co.kr/w/uploads/98Kx6M/2181c157e9c95bf4312f1e2563cb84da.jpg', '린지(픽시)', '0.12%', '13.53%'), ('43', 'https://img.piku.co.kr/w/uploads/98Kx6M/225761684daa13dd02cfa4cc409b72c1.jpg', '지우(엔믹스)', '0.78%', '54.73%'), ('7', 'https://img.piku.co.kr/w/uploads/98Kx6M/20448f9895913595923d65c152eb0f4b.jpg', '이서(아이브)', '7.08%', '71.71%'), ('11', 'https://img.piku.co.kr/w/uploads/98Kx6M/e2a8e01c4950c7784c5ec0556f18a158.jpg', '리즈(아이브)', '5.80%', '67.34%'), ('53', 'https://img.piku.co.kr/w/uploads/98Kx6M/572d6481f924143c4065a96ebda6a88b.jpg', '신비(비비지)', '0.57%', '49.16%'), ('57', 'https://img.piku.co.kr/w/u

In [20]:
# sampled_results Length: NUM_TRIALS, SAMPLES: NUM_SAMPLES
NUM_CONTEXT = 30 
NUM_QUERY = 4

context_scores = {}
query_scores = {}

vision_context = []
for trial in sampled_results:
    samples = []
    for i, sample in enumerate(trial[1][0:NUM_CONTEXT]):
        print(i)
        x = image_processor(Image.open(
            f'./Dataset01/Data01_{sample[0]}.jpg'
        ))
        samples.append(x)
        context_scores[i] = sample[4]
        
    samples = torch.stack(samples, dim=0)
    print(samples.shape)
    vision_context.append(samples)

vision_context = torch.stack(vision_context, dim=0)
print("Context", vision_context.shape, "\n")


vision_query = []
for trial in sampled_results:
    samples = []
    for i, sample in enumerate(trial[1][NUM_CONTEXT:NUM_CONTEXT + NUM_QUERY + 1]):
        print(i)
        x = image_processor(Image.open(
            f'./Dataset01/Data01_{sample[0]}.jpg'
        ))
        query_scores[i] = sample[4]
        samples.append(x)
    samples = torch.stack(samples, dim=0)
    print(samples.shape)
    vision_query.append(samples)
vision_query = torch.stack(vision_query, dim=0)
print("Query", vision_query.shape)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
torch.Size([30, 3, 224, 224])
Context torch.Size([1, 30, 3, 224, 224]) 

0
1
2
3
torch.Size([4, 3, 224, 224])
Query torch.Size([1, 4, 3, 224, 224])


In [21]:
context_scores

{0: '75.79%',
 1: '13.53%',
 2: '54.73%',
 3: '71.71%',
 4: '67.34%',
 5: '49.16%',
 6: '44.82%',
 7: '23.63%',
 8: '41.35%',
 9: '41.27%',
 10: '59.55%',
 11: '67.65%',
 12: '59.22%',
 13: '31.92%',
 14: '35.04%',
 15: '43.53%',
 16: '55.87%',
 17: '18.75%',
 18: '31.41%',
 19: '35.43%',
 20: '29.94%',
 21: '26.73%',
 22: '40.25%',
 23: '72.32%',
 24: '47.45%',
 25: '62.43%',
 26: '46.82%',
 27: '37.62%',
 28: '38.58%',
 29: '52.89%'}

In [22]:
query_scores

{0: '23.28%', 1: '43.64%', 2: '28.49%', 3: '38.98%'}

In [23]:
context_answer = []
len_c = len(context_scores)

for i in range(len_c // 2):
    curr_ = list(context_scores.values())[i * 2: i * 2 + 2]
    answer = '(B)'
    if curr_[0] > curr_[1]:
        answer = '(A)'
    context_answer.append(answer)
        
    print(curr_)
print(context_answer)

['75.79%', '13.53%']
['54.73%', '71.71%']
['67.34%', '49.16%']
['44.82%', '23.63%']
['41.35%', '41.27%']
['59.55%', '67.65%']
['59.22%', '31.92%']
['35.04%', '43.53%']
['55.87%', '18.75%']
['31.41%', '35.43%']
['29.94%', '26.73%']
['40.25%', '72.32%']
['47.45%', '62.43%']
['46.82%', '37.62%']
['38.58%', '52.89%']
['(A)', '(B)', '(A)', '(A)', '(A)', '(B)', '(A)', '(B)', '(A)', '(B)', '(A)', '(B)', '(B)', '(A)', '(B)']


In [24]:
query_answer = []
len_q = len(query_scores)

for i in range(len_q // 2):
    curr_ = list(query_scores.values())[i * 2: i * 2 + 2]
    answer = '(B)'
    if curr_[0] > curr_[1]:
        answer = '(A)'
    query_answer.append(answer)
        
    print(curr_)
print(query_answer)

['23.28%', '43.64%']
['28.49%', '38.98%']
['(B)', '(B)']


In [25]:
def create_score_answer(scores):
    curr_answer = []
    len_ = len(scores)
    
    for i in range(len_ // 2):
        curr_ = list(scores.values())[i * 2: i * 2 + 2]
        answer = '(B)'
        if curr_[0] > curr_[1]:
            answer = '(A)'
        curr_answer.append(answer)
            
        print(curr_)
    print(curr_answer)
    return curr_answer

In [27]:
NUM_TRIALS = 2
task_prompt_text = ''
question_prompt = 'Choose the more aesthetic image. Options:'
len_c = len(context_scores)
len_q = len(query_scores)


context_text = task_prompt_text + ''.join([
    question_prompt + f"(A) <image> (B) <image>. Answer: {context_answer[i]} <|endofchunk|>"
    for i in range(len_c // 2)
])
context_text = context_text + f"<|endofchunk|>"
print(context_text, "\n")

queries_text = [
    question_prompt + f"(A) <image> (B) <image>. Answer:"
    for _ in range(NUM_TRIALS)
]

print(queries_text[0])

Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|

In [28]:
vision_context[None, 0, :].unsqueeze(2)[:, :, :].shape, vision_query[:, :, :].unsqueeze(2).shape

(torch.Size([1, 30, 1, 3, 224, 224]), torch.Size([1, 4, 1, 3, 224, 224]))

In [29]:
NUM_QUERY // 2

2

In [30]:
def predict(vision_context, vision_query, context_text, queries_text, query_answer, num):
    print(f"Query answer: {query_answer}")

    """
    Step 4: Generate text
    """
    with torch.no_grad():
        context_len = len(context_text)
        for q_i in range(NUM_QUERY // 2):
            start = time.time()
            print(f"Start time: {start}")
            vision_x = torch.concat([vision_context[None, 0, :].unsqueeze(2)[:, :num, :],  vision_query[:, q_i * 2: q_i * 2 + 2, :].unsqueeze(2)], dim=1)
            lang_x = tokenizer(
                [context_text + queries_text[0] for _ in range(1)], 
                return_tensors="pt"
            )
            generated_text = model.generate(
                vision_x=vision_x,
                lang_x=lang_x["input_ids"],
                attention_mask=lang_x["attention_mask"],
                max_new_tokens=3,
                num_beams=3,
            )
    
            generated = tokenizer.decode(generated_text[0])
            print("Context: ", generated[:context_len])
            print("Generated text: ", generated[context_len:])
            end = time.time()
            print(f"Time {end - start}")

In [32]:
white_ = torch.ones([3, 224, 224])
print(white_.shape)
# Original Query 
# ['60.53%', '57.23%']
# ['38.21%', '22.05%']
# ['28.60%', '54.87%']
# ['(A)', '(A)', '(B)']

vision_query_new = vision_query
choosen = [0, 2] 
for i in choosen:
    vision_query_new[0, i, :] = white_
print(vision_query_new.shape)
print(vision_query_new[0, 0])
vision_query_new.shape
new_answer = ['(B)', '(B)']



torch.Size([3, 224, 224])
torch.Size([1, 4, 3, 224, 224])
tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])


In [33]:
 predict(vision_context, vision_query_new, context_text, queries_text, new_answer, 48)

Query answer: ['(B)', '(B)', '(A)']
Start time: 1701733281.9379327


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

In [34]:
 predict(vision_context, vision_query, context_text, queries_text, query_answer, 48)

Query answer: ['(B)', '(B)']
Start time: 1701733559.013797


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

In [100]:
white_ = torch.ones([3, 224, 224])
print(white_.shape)
# Original Query 
# ['60.53%', '57.23%']
# ['38.21%', '22.05%']
# ['28.60%', '54.87%']
# ['(A)', '(A)', '(B)']

new_answer = []
vision_context_new = vision_context
context_scores_new = context_scores
print(vision_context_new.shape)
for i in range(vision_context.shape[1]// 2):
    curr = random.choice([i*2, i*2 + 1])
    vision_context_new[0, curr, :] = white_
    context_scores_new[curr] = '0.0%'
    print(curr)

context_answer_new = []
len_c = len(context_scores_new)

for i in range(len_c // 2):
    curr_ = list(context_scores_new.values())[i * 2: i * 2 + 2]
    answer = '(B)'
    if curr_[0] > curr_[1]:
        answer = '(A)'
    context_answer_new.append(answer)
        
    print(curr_)
print(context_answer_new)

torch.Size([3, 224, 224])
torch.Size([1, 48, 3, 224, 224])
0
3
5
6
8
10
12
15
16
18
21
22
25
26
28
31
33
34
37
39
41
42
44
46
['0.0%', '45.61%']
['52.35%', '0.0%']
['47.87%', '0.0%']
['0.0%', '41.55%']
['0.0%', '77.56%']
['0.0%', '57.58%']
['0.0%', '49.43%']
['59.95%', '0.0%']
['0.0%', '32.98%']
['0.0%', '30.75%']
['57.93%', '0.0%']
['0.0%', '53.77%']
['42.15%', '0.0%']
['0.0%', '42.53%']
['0.0%', '24.36%']
['64.76%', '0.0%']
['49.81%', '0.0%']
['0.0%', '45.39%']
['69.69%', '0.0%']
['44.85%', '0.0%']
['61.76%', '0.0%']
['0.0%', '42.84%']
['0.0%', '62.68%']
['0.0%', '50.37%']
['(B)', '(A)', '(A)', '(B)', '(B)', '(B)', '(B)', '(A)', '(B)', '(B)', '(A)', '(B)', '(A)', '(B)', '(B)', '(A)', '(A)', '(B)', '(A)', '(A)', '(A)', '(B)', '(B)', '(B)']


In [102]:
task_prompt_text = ''
question_prompt = 'Choose the more aesthetic image. Options:'
len_c = len(context_scores)
len_q = len(query_scores)


context_text = task_prompt_text + ''.join([
    question_prompt + f"(A) <image> (B) <image>. Answer: {context_answer_new[i]} <|endofchunk|>"
    for i in range(len_c // 2)
])
context_text = context_text + f"<|endofchunk|>"
print(context_text, "\n")

queries_text = [
    question_prompt + f"(A) <image> (B) <image>. Answer:"
    for _ in range(NUM_TRIALS)
]

print(queries_text[0])

Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|

In [106]:
 predict(vision_context_new, vision_query_new, context_text, queries_text, new_answer, 48)

Query answer: ['(B)', '(B)', '(A)']
Start time: 1701693849.3406339


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|e

### Score evaluated based on the final rate

In [107]:

3

# sampled_results Length: NUM_TRIALS, SAMPLES: NUM_SAMPLES
context_scores = {}
query_scores = {}

vision_context = []
for trial in sampled_results:
    samples = []
    for i, sample in enumerate(trial[1][0:NUM_CONTEXT]):
        print(i)
        x = image_processor(Image.open(
            f'./Dataset00/Data00_{sample[0]}.jpg'
        ))
        samples.append(x)
        context_scores[i] = sample[3]
        
    samples = torch.stack(samples, dim=0)
    print(samples.shape)
    vision_context.append(samples)

vision_context = torch.stack(vision_context, dim=0)
print("Context", vision_context.shape, "\n")


vision_query = []
for trial in sampled_results:
    samples = []
    for i, sample in enumerate(trial[1][NUM_CONTEXT:NUM_CONTEXT + NUM_QUERY + 1]):
        print(i)
        x = image_processor(Image.open(
            f'./Dataset00/Data00_{sample[0]}.jpg'
        ))
        query_scores[i] = sample[3]
        samples.append(x)
    samples = torch.stack(samples, dim=0)
    print(samples.shape)
    vision_query.append(samples)
vision_query = torch.stack(vision_query, dim=0)
print("Query", vision_query.shape)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
torch.Size([48, 3, 224, 224])
Context torch.Size([1, 48, 3, 224, 224]) 

0
1
2
3
4
5
6
torch.Size([7, 3, 224, 224])
Query torch.Size([1, 7, 3, 224, 224])


In [108]:
context_answer = []
len_c = len(context_scores)

for i in range(len_c // 2):
    curr_ = list(context_scores.values())[i * 2: i * 2 + 2]
    answer = '(B)'
    if curr_[0] > curr_[1]:
        answer = '(A)'
    context_answer.append(answer)
        
    print(curr_)
print(context_answer)

query_answer = []
len_q = len(query_scores)

for i in range(len_q // 2):
    curr_ = list(query_scores.values())[i * 2: i * 2 + 2]
    answer = '(B)'
    if curr_[0] > curr_[1]:
        answer = '(A)'
    query_answer.append(answer)
        
    print(curr_)
print(query_answer)

['0.38%', '0.98%']
['0.35%', '0.04%']
['0.47%', '0.45%']
['0.13%', '0.30%']
['0.02%', '11.31%']
['1.13%', '1.86%']
['0.06%', '0.50%']
['1.36%', '0.07%']
['0.19%', '0.11%']
['0.34%', '0.15%']
['1.21%', '4.51%']
['3.68%', '1.07%']
['0.11%', '2.75%']
['0.04%', '0.31%']
['0.15%', '0.06%']
['3.65%', '0.15%']
['0.65%', '2.15%']
['0.02%', '0.08%']
['4.20%', '0.04%']
['0.21%', '0.22%']
['2.93%', '3.17%']
['0.24%', '0.13%']
['0.21%', '2.28%']
['0.11%', '1.06%']
['(B)', '(A)', '(A)', '(B)', '(B)', '(B)', '(B)', '(A)', '(A)', '(A)', '(B)', '(A)', '(B)', '(B)', '(A)', '(A)', '(B)', '(B)', '(A)', '(B)', '(B)', '(A)', '(B)', '(B)']
['1.29%', '1.38%']
['0.30%', '0.02%']
['0.23%', '1.42%']
['(B)', '(A)', '(B)']


In [109]:
task_prompt_text = ''
question_prompt = 'Choose the more aesthetic image. Options:'
len_c = len(context_scores)
len_q = len(query_scores)


context_text = task_prompt_text + ''.join([
    question_prompt + f"(A) <image> (B) <image>. Answer: {context_answer[i]} <|endofchunk|>"
    for i in range(len_c // 2)
])
context_text = context_text + f"<|endofchunk|>"
print(context_text, "\n")

queries_text = [
    question_prompt + f"(A) <image> (B) <image>. Answer:"
    for _ in range(NUM_TRIALS)
]

print(queries_text[0])

Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|

In [110]:
 predict(vision_context, vision_query, context_text, queries_text, query_answer, 48)

Query answer: ['(B)', '(A)', '(B)']
Start time: 1701694527.374507


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|e

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|e

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Choose the more aesthetic image. Options:(A) <image>(B) <image>. Answer: (B) <|e

In [112]:
task_prompt_text = ''
question_prompt = 'Which image is better?. Options:'
len_c = len(context_scores)
len_q = len(query_scores)


context_text = task_prompt_text + ''.join([
    question_prompt + f"(A) <image> (B) <image>. Answer: {context_answer[i]} <|endofchunk|>"
    for i in range(len_c // 2)
])
context_text = context_text + f"<|endofchunk|>"
print(context_text, "\n")

queries_text = [
    question_prompt + f"(A) <image> (B) <image>. Answer:"
    for _ in range(NUM_TRIALS)
]

print(queries_text[0])

Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is b

In [None]:
 predict(vision_context, vision_query, context_text, queries_text, query_answer, 48)

Query answer: ['(B)', '(A)', '(B)']
Start time: 1701695200.1250484


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which 

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Context:  Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (B) <|endofchunk|>Which image is better?. Options:(A) <image>(B) <image>. Answer: (A) <|endofchunk|>Which 

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
