# Arguments

In [1]:
SAMPLE_SIZE = 3
NUM_TRIALS = 5
DATASET_NAME = 'idoll_man'
SORT_KEY = 'choice_rate'

MODEL_SIZE = '9B'

# Prepare datasets

In [2]:
import pickle
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="7"

root_dir = './'
datasets = {}

with open(os.path.join(root_dir, 'Dataset00.pkl'), 'rb') as f:
    idoll_man = pickle.load(f)
datasets['idoll_man'] = idoll_man

with open(os.path.join(root_dir, 'Dataset01.pkl'), 'rb') as f:
    idoll_woman = pickle.load(f)
datasets['idoll_woman'] = idoll_woman

with open(os.path.join(root_dir, 'Dataset02.pkl'), 'rb') as f:
    paintings = pickle.load(f)
datasets['paintings'] = paintings

# Sampling

In [3]:
import random
import math

def create_bins(sorted_list, sample_size):
    # Create bins from the sorted list
    bin_size = max(1, math.ceil(len(sorted_list) / sample_size))
    bins = [sorted_list[i:i + bin_size] for i in range(0, len(sorted_list), bin_size)]
    return bins

def sample_from_bins(bins):
    # Randomly select one element from each bin
    return [random.choice(bin) for bin in bins if bin]

def shuffle_samples_with_indices(samples):
    indexed_samples = list(enumerate(samples))
    random.shuffle(indexed_samples)
    shuffled_samples, indices = zip(*indexed_samples)
    return list(shuffled_samples), list(indices)

def sort_once_sample_shuffle_multiple_trials(tuple_list, sort_key, reverse, sample_size, trials):
    """
    Example usage
    
    tuple_list = [
        (1, "http://example.com", "classA", '10.00%', '15.00%'),
        (2, "http://example.org", "classB", '5.50%', '20.00%'),
        (3, "http://example.net", "classC", '8.75%', '12.00%'),
    Outputs = sort_once_sample_shuffle_multiple_trials(tuple_list, 'win_rate', True, 2, 3)

    Outputs: 
        [
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (1, 'http://example.com', 'classA', '10.00%', '15.00%')]),
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (1, 'http://example.com', 'classA', '10.00%', '15.00%')]),
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (3, 'http://example.net', 'classC', '8.75%', '12.00%')
        ]
    """
    if sort_key not in {'win_rate', 'choice_rate'}:
        raise ValueError("sort_key must be 'win_rate' or 'choice_rate'")

    if sample_size < 1 or sample_size > len(tuple_list):
        raise ValueError("sample_size must be between 1 and the length of tuple_list")

    # Function to convert percentage string to float
    def convert_to_float(percentage_str):
        return float(percentage_str.rstrip('%'))

    # Determine the index for win_rate or choice_rate in the tuple
    index = 3 if sort_key == 'win_rate' else 4

    # Sort the list of tuples based on the specified index
    sorted_list = sorted(tuple_list, key=lambda x: convert_to_float(x[index]), reverse=reverse)

    # Create bins from the sorted list
    bins = create_bins(sorted_list, sample_size)

    results = []
    for _ in range(trials):
        # Sample from the bins for each trial
        sample = sample_from_bins(bins)
        shuffled_sample, original_indices = shuffle_samples_with_indices(sample)
        results.append((shuffled_sample, original_indices))

    return results

# Prepare model

In [6]:
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch

if MODEL_SIZE == '3B':
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
        tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
        cross_attn_every_n_layers=1
    )

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)
elif MODEL_SIZE == '9B':
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-7b",
        tokenizer_path="anas-awadalla/mpt-7b",
        cross_attn_every_n_layers=4,
    )

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)

Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

ValueError: We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually.

# Prepare inputs

In [5]:
from PIL import Image
import requests
import torch

"""
Step 1: Loading and Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
sampled_results = sort_once_sample_shuffle_multiple_trials(datasets[DATASET_NAME], sort_key=SORT_KEY, reverse=True, 
                                                       sample_size=SAMPLE_SIZE, trials=NUM_TRIALS)

vision_context = [
    image_processor(Image.open(
        requests.get(
            sample[1], stream=True
        ).raw
    ))
    for sample in sampled_results[0][1]
]
vision_context = torch.stack(vision_context, dim=0)
vision_context = vision_context.unsqueeze(1).unsqueeze(0)

vision_queries = [
    torch.stack([image_processor(Image.open(requests.get(sample[1], stream=True).raw)) 
                for sample in sampled_results[i + 1][1]], dim=0)
                    for i in range(NUM_TRIALS - 1)]
vision_queries = torch.stack(vision_queries, dim=0)
vision_queries = vision_queries.unsqueeze(2)

print("batch_size x num_media x num_frames x channels x height x width")
print(vision_context.shape, vision_queries.shape)

batch_size x num_media x num_frames x channels x height x width
torch.Size([1, 3, 1, 3, 224, 224]) torch.Size([4, 3, 1, 3, 224, 224])


In [17]:
"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left

# task_prompt_text = f"These are {SAMPLE_SIZE} images. Rank these images from most to least aligned with the standard of beauty."
task_prompt_text = f"Score these {SAMPLE_SIZE} images for aesthetic quality. The score cannot be exceed the number of images"

# context_text = "<image><|endofchunk|>" * SAMPLE_SIZE
# context_text = "<image>" * SAMPLE_SIZE
# context_prompt_text = task_prompt_text + " Answer: "
context_answer_text = list(map(str, sampled_results[0][0])) # 3, 1, 0, 2
# context_text = context_text + context_prompt_text + context_answer_text
context_text = ''.join([
    "<image>" + task_prompt_text + f" Answer: {context_answer_text[i]}" + "<|endofchunk|>"
    for i in range(SAMPLE_SIZE)
])

# queries_text = ["<image><|endofchunk|>" * SAMPLE_SIZE] * (NUM_TRIALS - 1)
# queries_text = ["<image>" * SAMPLE_SIZE] * (NUM_TRIALS - 1)
# query_prompt_text = task_prompt_text + " Answer: "
# queries_text = [
#     query_text + query_prompt_text for query_text in queries_text
# ]
queries_text = [
    "<image>" + task_prompt_text + f" Answer: "
    for i in range(SAMPLE_SIZE)
]
print(context_text)
print(queries_text[0])

<image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|>
<image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 


In [32]:
"""
Step 4: Generate text
"""
for q_i in range(NUM_TRIALS - 1):
    for n_i in range(SAMPLE_SIZE):
        vision_x = torch.cat((vision_context, vision_queries[[q_i]][:, [n_i]]), dim=1)
        lang_x = tokenizer(
            context_text + queries_text[n_i],
            return_tensors="pt"
        )
        generated_text = model.generate(
            vision_x=vision_context,
            lang_x=lang_x["input_ids"],
            attention_mask=lang_x["attention_mask"],
            max_new_tokens=50,
            num_beams=3,
        )
        print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 001 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 002 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 003 Score these 3 images for aesthetic quality


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 001 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 002 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 003 Score these 3 images for aesthetic quality


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 001 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 002 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 003 Score these 3 images for aesthetic quality


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 001 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 002 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 003 Score these 3 images for aesthetic quality


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 001 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 002 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 003 Score these 3 images for aesthetic quality


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 1<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 0<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 2<|endofchunk|><image>Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 001 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 002 Score these 3 images for aesthetic quality. The score cannot be exceed the number of images Answer: 003 Score these 3 images for aesthetic quality


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
