In [1]:
import pickle
import numpy as np
from tqdm import tqdm
import torch
from eval_datasets import CaptionDataset
from flamingo_utils import get_flamingo_model_processor_tok
from rice import RICES



In [2]:
# input args
model_name_or_path = "openflamingo/OpenFlamingo-3B-vitl-mpt1b"
train_annotations_path = "../dataset/annotations/captions_train2017.json"
train_image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/train2017"
val_annotations_path = "../dataset/annotations/captions_val2017.json"
val_image_dir_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017"
rice_cached_features_path = "/scratch/workspace/asureddy_umass_edu-llm_alignment/features-cache/coco_train.pkl" 

In [3]:
train_dataset = CaptionDataset(train_image_dir_path, train_annotations_path)
val_dataset = CaptionDataset(val_image_dir_path, val_annotations_path)
if rice_cached_features_path:
    with open(rice_cached_features_path, 'rb') as f:
        rice_cached_features = pickle.load(f)

retriever = RICES(train_dataset, 'cpu',1, cached_features=rice_cached_features)
model, image_processor, tokenizer = get_flamingo_model_processor_tok()
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
model = model.to('cuda')

A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- configuration_mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
  return self.fget.__get__(instance, owner)()


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters


In [4]:
def preprocess_images(ims, image_processor):
    vision_x = [image_processor(im_).unsqueeze(0) for im_ in ims]
    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
    return vision_x

In [5]:
def tok_query(query):
    """
    Example query:
    "<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of"
    """
    lang_x = tokenizer(
        [query],
        return_tensors="pt",
    )
    return lang_x

In [6]:
def get_n_shot_demonstrations(item, n=2, use_random=True, n_random=0):
    if n==0: return [[]]
    if use_random:
        train_idxs = list(np.random.choice(len(train_dataset),n))
        icl_demonstrs = [[train_dataset[idx] for idx in train_idxs]]
    else:
        if n_random:
            train_idxs = list(np.random.choice(len(train_dataset),n_random))
            icl_demonstrs = [train_dataset[idx] for idx in train_idxs]
        icl_demonstrs = [icl_demonstrs + retriever.find([item['image']],n-n_random)[0]]
        
    return icl_demonstrs

In [7]:
def construct_captioning_query(query_items, icl_demonstrs_list):
    querys, im_lists = [], []
    for query_item,icl_demonstrs in zip(query_items, icl_demonstrs_list):
        query = "<image> Output: "
        images = []
        for item in icl_demonstrs:
            query += f"{item['caption']} |<endofchunk>| <image> Output: "
            images.append(item['image'])
        images.append(query_item['image'])
        querys.append(query)
        im_lists.append(images)
    return querys, im_lists

In [12]:
get_n_shot_demonstrations(val_dataset[0], n=4, use_random=False, n_random=2)

[[{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x375>,
   'caption': 'The green suitcase has many stickers on the side of it.',
   'image_id': 372458},
  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x426>,
   'caption': 'a small white bed is in a room',
   'image_id': 339020},
  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
   'caption': 'A man putting food into a pot in a kitchen.',
   'image_id': 218674},
  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
   'caption': 'A white refrigerator freezer sitting next to a stone wall.',
   'image_id': 465374}]]

In [58]:
def get_output(item, n= 2, use_random= False):
    icl_demonstrs = get_n_shot_demonstrations(item, n, use_random)
    querys, im_lists = construct_captioning_query([item], icl_demonstrs)
    vision_x = preprocess_images(im_lists[0], image_processor)
    lang_x = tok_query(querys[0])
    generated_text = model.generate(
        vision_x=vision_x.cuda(),
        lang_x=lang_x["input_ids"].cuda(),
        attention_mask=lang_x["attention_mask"].cuda(),
        max_new_tokens=20,
        num_beams=1,
    )
    # print("Generated text: ", tokenizer.decode(generated_text[0]))
    return tokenizer.decode(generated_text[0])[len(querys[0]):]

In [60]:
outputs = []
for i in tqdm(range(100)):
    out = get_output(val_dataset[i],4)
    outputs.append(out)

  0%|                                                                                         | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|▊                                                                                | 1/100 [00:01<01:57,  1.19s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|█▌                                                                               | 2/100 [00:02<01:36,  1.01it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|██▍                                                                              | 3/100 [00:02<01:23,  1.16it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|███▏                                                                             | 4/100 [00:03<01:35,  1.01it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|████                                              

In [61]:
outputs

['\xa0A kitchen filled with lots of pots and pans. |<endofchunk>|  Output',
 '\xa0A kitchen with a small dining table in the middle.<|endofchunk|>',
 '\xa0A homeless man in New York City.<|endofchunk|>',
 '\xa0Skateboarders performing a trick on a skateboard. |<endofchunk>|',
 '\xa0A blue bicycle parked on a street. |<endofchunk>|  Output: A',
 '・A bathroom with toilet, sink, and shower.<|endofchunk|>',
 '\xa0A Japanese style toilet stall with a window.<|endofchunk|>',
 '\xa0A shiny motorcycle on display. |<endofchunk>|  Output: A shiny motorcycle',
 '\xa0A row of white and brown toilets in a bathroom. |<endofchunk>|',
 '\xa0A toilet with a panel for controls. |<endofchunk>|  Output: A',
 '\tA row of flats in a residential area. |<endofchunk>|  Output:',
 '\xa0A woman is sitting at a desk with a computer. |<endofchunk>| ',
 '\xa0A door leading to a graveyard.<|endofchunk|>',
 '\xa0A TV sitting on a bed with a bookcase behind it. |<endofchunk>',
 '\xa0A still life painting of a bowl of 

In [51]:
get_output(val_dataset[0],0)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<image> Output: \xa0The kitchen is a large room with a large stove, a large sink, and a large counter'

In [48]:
get_output(val_dataset[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<image> Output: A man putting food into a pot in a kitchen. |<endofchunk>| <image> Output: A white refrigerator freezer sitting next to a stone wall. |<endofchunk>| <image> Output: \xa0A cook preparing food in a kitchen. |<endofchunk>|  Output: A'

In [49]:
get_output(val_dataset[0],4)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<image> Output: A room filled with lots of pottery and a window. |<endofchunk>| <image> Output: A run down kitchen filled with lots of construction equipment. |<endofchunk>| <image> Output: A man putting food into a pot in a kitchen. |<endofchunk>| <image> Output: A white refrigerator freezer sitting next to a stone wall. |<endofchunk>| <image> Output: \xa0A kitchen filled with lots of pots and pans. |<endofchunk>|  Output'

In [55]:
get_output(val_dataset[0],4, True)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<image> Output: A close up of a pizza pie on a paper plate. |<endofchunk>| <image> Output: there is a large book shelf in this living room |<endofchunk>| <image> Output: A white bus parked in front of a  building. |<endofchunk>| <image> Output: A giraffe in the middle of a field looking astute |<endofchunk>| <image> Output: \xa0A cook in a kitchen in a museum. |<endofchunk>|  Output:'

In [50]:
get_output(val_dataset[0],8)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<image> Output: A very nice looking dining table by a bright window. |<endofchunk>| <image> Output: A building has tan wood floors and wooden walls. |<endofchunk>| <image> Output: A lady and a small girl in a room full of things. |<endofchunk>| <image> Output: A kitchen with a center island covered in clutter. |<endofchunk>| <image> Output: A room filled with lots of pottery and a window. |<endofchunk>| <image> Output: A run down kitchen filled with lots of construction equipment. |<endofchunk>| <image> Output: A man putting food into a pot in a kitchen. |<endofchunk>| <image> Output: A white refrigerator freezer sitting next to a stone wall. |<endofchunk>| <image> Output: \xa0A kitchen with a stove and a stove top. |<endofchunk>|  Output'

In [20]:
val_dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x427>,
 'caption': 'A man is in a kitchen making pizzas.',
 'image_id': 397133}

In [36]:
relevant = retriever.find([val_dataset[0]['image']], 4)
relevant

[[{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x427>,
   'caption': 'A room filled with lots of pottery and a window.',
   'image_id': 339830},
  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x427>,
   'caption': 'A run down kitchen filled with lots of construction equipment.',
   'image_id': 326936},
  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
   'caption': 'A man putting food into a pot in a kitchen.',
   'image_id': 218674},
  {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>,
   'caption': 'A white refrigerator freezer sitting next to a stone wall.',
   'image_id': 465374}]]

In [65]:
!pwd

/home/asureddy_umass_edu/cs682/flamingo


In [None]:
!python flamingo_e2e_captioning_coco.py --n_shots 4

A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- configuration_mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
  return self.fget.__get__(instance, owner)()
You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters
  0%|                                                   | 0/100 [00:00<?, ?it/s]<image> Output: A room filled with lots of pottery and a window. |<endofchunk>| <image> Output: A