In [5]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="7"
os.chdir('/home/miso/agi/open_flamingo')

In [6]:
# Argument
MODEL_SIZE = '9B'
DEVICE_NUM = 7
SAMPLE_SIZE = 6
NUM_TRIALS = 5
DATASET_NAME = 'idoll_man'
SORT_KEY = 'choice_rate'

In [7]:
# prepare datasets
import pickle
import os

root_dir = '../'
datasets = {}

with open(os.path.join(root_dir, 'Dataset00.pkl'), 'rb') as f:
    idoll_man = pickle.load(f)
datasets['idoll_man'] = idoll_man

with open(os.path.join(root_dir, 'Dataset01.pkl'), 'rb') as f:
    idoll_woman = pickle.load(f)
datasets['idoll_woman'] = idoll_woman

with open(os.path.join(root_dir, 'Dataset02.pkl'), 'rb') as f:
    paintings = pickle.load(f)
datasets['paintings'] = paintings

In [8]:
import random
import math

def create_bins(sorted_list, sample_size):
    # Create bins from the sorted list
    bin_size = max(1, math.ceil(len(sorted_list) / sample_size))
    bins = [sorted_list[i:i + bin_size] for i in range(0, len(sorted_list), bin_size)]
    return bins

def sample_from_bins(bins):
    # Randomly select one element from each bin
    return [random.choice(bin) for bin in bins if bin]

def shuffle_samples_with_indices(samples):
    indexed_samples = list(enumerate(samples))
    random.shuffle(indexed_samples)
    shuffled_samples, indices = zip(*indexed_samples)
    return list(shuffled_samples), list(indices)

def sort_once_sample_shuffle_multiple_trials(tuple_list, sort_key, reverse, sample_size, trials):
    """
    Example usage
    
    tuple_list = [
        (1, "http://example.com", "classA", '10.00%', '15.00%'),
        (2, "http://example.org", "classB", '5.50%', '20.00%'),
        (3, "http://example.net", "classC", '8.75%', '12.00%'),
    Outputs = sort_once_sample_shuffle_multiple_trials(tuple_list, 'win_rate', True, 2, 3)

    Outputs: 
        [
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (1, 'http://example.com', 'classA', '10.00%', '15.00%')]),
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (1, 'http://example.com', 'classA', '10.00%', '15.00%')]),
            ([1, 0],
             [(2, 'http://example.org', 'classB', '5.50%', '20.00%'),
             (3, 'http://example.net', 'classC', '8.75%', '12.00%')
        ]
    """
    if sort_key not in {'win_rate', 'choice_rate'}:
        raise ValueError("sort_key must be 'win_rate' or 'choice_rate'")

    if sample_size < 1 or sample_size > len(tuple_list):
        raise ValueError("sample_size must be between 1 and the length of tuple_list")

    # Function to convert percentage string to float
    def convert_to_float(percentage_str):
        return float(percentage_str.rstrip('%'))

    # Determine the index for win_rate or choice_rate in the tuple
    index = 3 if sort_key == 'win_rate' else 4

    # Sort the list of tuples based on the specified index
    sorted_list = sorted(tuple_list, key=lambda x: convert_to_float(x[index]), reverse=reverse)

    # Create bins from the sorted list
    bins = create_bins(sorted_list, sample_size)

    results = []
    for _ in range(trials):
        # Sample from the bins for each trial
        sample = sample_from_bins(bins)
        shuffled_sample, original_indices = shuffle_samples_with_indices(sample)
        results.append((shuffled_sample, original_indices))

    return results

In [9]:
# prepare inputs
from PIL import Image
import requests
import torch

"""
Step 1: Loading and Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
sampled_results = sort_once_sample_shuffle_multiple_trials(datasets[DATASET_NAME], sort_key=SORT_KEY, reverse=True, 
                                                       sample_size=SAMPLE_SIZE, trials=NUM_TRIALS)

In [None]:
''' 
Step 0: Initializing an OpenFlamingo model & Download pretrained weight
'''
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch

if MODEL_SIZE == '3B':
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
        tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
        cross_attn_every_n_layers=1,
        cache_dir="PATH/TO/CACHE/DIR"  # Defaults to ~/.cache
    )

    # grab model checkpoint from huggingface hub
    from huggingface_hub import hf_hub_download
    import torch

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)
elif MODEL_SIZE == '9B':
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-7b",
        tokenizer_path="anas-awadalla/mpt-7b",
        cross_attn_every_n_layers=4
    )

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)
    model = model.to('cuda')

In [11]:
model = model.to('cuda')

In [13]:
from PIL import Image
import requests
import torch

"""
Step 1: Load images
"""
demo_image_one = Image.open(
    requests.get(
        'https://img.piku.co.kr/w/uploads/721FCA/2c5e54bed48887def8cf122705cf5455.jpg', stream=True
    ).raw
)

demo_image_two = Image.open(
    requests.get(
        'https://img.piku.co.kr/w/uploads/721FCA/08055401abb0f348ecd2fae8bd3e02b3.jpg',
        stream=True
    ).raw
)

query_image = Image.open(
    requests.get(
        'https://img.piku.co.kr/w/uploads/721FCA/0270ada34d53b2b76139bc0b2131aa25.jpg', 
        stream=True
    ).raw
)


In [24]:
import json
f = open('/home/miso/agi/context_samples.json')
context_samples = json.load(f)

In [25]:
"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""

vision_context = [
	image_processor(
		Image.open(requests.get(sample[1], stream=True).raw)
	) for sample in context_samples[1] #sampled_results[0][1]
]

vision_context = torch.stack(vision_context, dim=0)
vision_context = vision_context[:3].unsqueeze(1).unsqueeze(0)

In [21]:
vision_context.shape

torch.Size([1, 3, 1, 3, 224, 224])

In [23]:
vision_query = [
	image_processor(
		Image.open(requests.get(sample[1], stream=True).raw)
	) for sample in sampled_results[3][1]
]
# vision_query = vision_query[0].unsqueeze(0).unsqueeze(1).unsqueeze(2)
vision_query = torch.stack(vision_query, dim=0).unsqueeze(1).unsqueeze(0)

In [26]:
context_prompt = [
	"With strong facial features and sharp eyes, he is ",
	"With a baby face and captivating eyes, he is ",
	"With a warm and masculine appearance, he is ",
	"With a charismatic gaze and fox-like features, he is ",
	"With a versatile and attractive appearance, he is ",
	"With bold eyebrows and heart-shaped lips, he is "
]

degree_of_attractiveness = {
	0: "extremely attractive",
	1: "very attractive",
	2: "quite attractive",
	3: "moderately attractive",
	4: "slightly attractive",
	5: "not attractive",
	}

# score = sampled_results[0][0]
score = samples[0]

context_prompt = [context + degree_of_attractiveness[score[i]] + "." for i, context in enumerate(context_prompt)]

context_prompt

['With strong facial features and sharp eyes, he is very attractive.',
 'With a baby face and captivating eyes, he is quite attractive.',
 'With a warm and masculine appearance, he is slightly attractive.',
 'With a charismatic gaze and fox-like features, he is moderately attractive.',
 'With a versatile and attractive appearance, he is extremely attractive.',
 'With bold eyebrows and heart-shaped lips, he is not attractive.']

In [27]:
full_context = ""
for i, context in enumerate(context_prompt[:3]):
	if i == 0:
		full_context = "<image>" + context
	else:
		full_context += "<|endofchunk|><image>" + context
query_text = "<|endofchunk|><image>With"

full_context + query_text

'<image>With strong facial features and sharp eyes, he is very attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is quite attractive.<|endofchunk|><image>With a warm and masculine appearance, he is slightly attractive.<|endofchunk|><image>With'

In [28]:

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
for t in range(NUM_TRIALS):
    print(f"Trial #{t}")
    vision_x = torch.cat((vision_context, vision_query[:,t,:].unsqueeze(1)), dim=1)

    tokenizer.padding_side = "left" # For generation padding tokens should be on the left
    lang_x = tokenizer(
        # ["<image>He has expressive eyes, effortlessly crafting a heart shape with his hands and emanating a charming appeal.<|endofchunk|><image>He has fair skin and sharp, elongated eyes<image>He has"],
        [full_context + query_text],
        return_tensors="pt"
    )


    """
    Step 4: Generate text
    """
    generated_text = model.generate(
        vision_x=vision_x.to('cuda'),
        # vision_x = torch.randn(vision_x.shape).to('cuda'),
        lang_x=lang_x["input_ids"].to('cuda'),
        attention_mask=lang_x["attention_mask"].to('cuda'),
        max_new_tokens=20,
        num_beams=3,
        early_stopping=True
    )

    print("Generated text: ", tokenizer.decode(generated_text[0]))

Trial #0


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
2023-12-05 09:22:44.253629: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>With strong facial features and sharp eyes, he is very attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is quite attractive.<|endofchunk|><image>With a warm and masculine appearance, he is slightly attractive.<|endofchunk|><image>With a baby face and sharp eyes, he is very attractive.<|endofchunk|>
Trial #1


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>With strong facial features and sharp eyes, he is very attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is quite attractive.<|endofchunk|><image>With a warm and masculine appearance, he is slightly attractive.<|endofchunk|><image>With a baby face and sharp eyes, he is very attractive.<|endofchunk|>
Trial #2


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>With strong facial features and sharp eyes, he is very attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is quite attractive.<|endofchunk|><image>With a warm and masculine appearance, he is slightly attractive.<|endofchunk|><image>With a baby face and sharp eyes, he is very attractive.<|endofchunk|>
Trial #3


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>With strong facial features and sharp eyes, he is very attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is quite attractive.<|endofchunk|><image>With a warm and masculine appearance, he is slightly attractive.<|endofchunk|><image>With a baby face and sharp eyes, he is very attractive.<|endofchunk|>
Trial #4
Generated text:  <image>With strong facial features and sharp eyes, he is very attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is quite attractive.<|endofchunk|><image>With a warm and masculine appearance, he is slightly attractive.<|endofchunk|><image>With a baby face and captivating eyes, he is very attractive.<|endofchunk|>


In [17]:

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
lang_x = tokenizer(
    ["<image>With expressive eyes, effortlessly crafting a heart shape with his hands and emanating a charming appeal, he is very attractive.<|endofchunk|><image>With fair skin and sharp, elongated eyes, he is extremely attractive.<image>With"],
    return_tensors="pt",
)


"""
Step 4: Generate text
"""
generated_text = model.generate(
    vision_x=vision_x.to('cuda'),
    lang_x=lang_x["input_ids"].to('cuda'),
    attention_mask=lang_x["attention_mask"].to('cuda'),
    max_new_tokens=20,
    num_beams=3,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>With expressive eyes, effortlessly crafting a heart shape with his hands and emanating a charming appeal, he is very attractive.<|endofchunk|><image>With fair skin and sharp, elongated eyes, he is extremely attractive.<image>With sharp eyes and sharp features, he is very attractive.<|endofchunk|>
