In [1]:
# Shell escape: print the NVIDIA driver/GPU status table for this machine
# before any model is loaded.
!nvidia-smi

Tue Nov  5 19:07:57 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:5F:00.0 Off |                    0 |
| N/A   31C    P0             68W /  500W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from open_flamingo import create_model_and_transforms

# The same Hugging Face repo id is used for the language model and its tokenizer.
mpt_repo_id = "anas-awadalla/mpt-1b-redpajama-200b"

# Build the Flamingo model, the image preprocessing transform and the tokenizer.
# To keep downloads out of the default cache (~/.cache), additionally pass
# cache_dir="/scratch/workspace/asureddy_umass_edu-llm_alignment/hf-cache".
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path=mpt_repo_id,
    tokenizer_path=mpt_repo_id,
    cross_attn_every_n_layers=1,
)

A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- configuration_mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
  return self.fget.__get__(instance, owner)()


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters


In [3]:
# Download the OpenFlamingo-3B (ViT-L / MPT-1B) checkpoint from the Hugging Face
# Hub and load it into the model built above.
from huggingface_hub import hf_hub_download
import torch

checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")

# map_location="cpu": without it torch.load restores every tensor onto the device
# it was saved from, which fails on machines lacking that device and otherwise
# leaves a second copy of the weights on the GPU. The model itself is still on
# the CPU at this point and is moved to the GPU in a later cell.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports it.
checkpoint_state = torch.load(checkpoint_path, map_location="cpu")

# strict=False: the checkpoint does not carry every model key (the cell output
# lists vision_encoder.* entries under missing_keys). The returned
# _IncompatibleKeys is left as the cell's last expression so it stays visible.
model.load_state_dict(checkpoint_state, strict=False)

_IncompatibleKeys(missing_keys=['vision_encoder.class_embedding', 'vision_encoder.positional_embedding', 'vision_encoder.proj', 'vision_encoder.conv1.weight', 'vision_encoder.ln_pre.weight', 'vision_encoder.ln_pre.bias', 'vision_encoder.transformer.resblocks.0.ln_1.weight', 'vision_encoder.transformer.resblocks.0.ln_1.bias', 'vision_encoder.transformer.resblocks.0.attn.in_proj_weight', 'vision_encoder.transformer.resblocks.0.attn.in_proj_bias', 'vision_encoder.transformer.resblocks.0.attn.out_proj.weight', 'vision_encoder.transformer.resblocks.0.attn.out_proj.bias', 'vision_encoder.transformer.resblocks.0.ln_2.weight', 'vision_encoder.transformer.resblocks.0.ln_2.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.bias', 'vision_encoder.transformer.resblocks.1.ln_1.weight', 'vision_encoder.transformer.resbloc

In [5]:
# Move the model onto the GPU; the generation cells below send their inputs to
# CUDA as well.
model = model.cuda()

In [4]:
from PIL import Image
import requests
import torch

In [None]:
"""
Step 1: Load images
"""
def _fetch_image(url, timeout=30):
    """Download ``url`` and open the streamed response body as a PIL image.

    Args:
        url: HTTP(S) address of the image file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The ``PIL.Image.Image`` returned by ``Image.open`` on the raw
        response stream.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never handed to PIL as if it were image data.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    response.raise_for_status()
    return Image.open(response.raw)


# Two demonstration images followed by the image the model is asked about.
demo_image_one = _fetch_image(
    "http://images.cocodataset.org/val2017/000000039769.jpg"
)

demo_image_two = _fetch_image(
    "http://images.cocodataset.org/test-stuff2017/000000028137.jpg"
)

query_image = _fetch_image(
    "http://images.cocodataset.org/test-stuff2017/000000028352.jpg"
)


In [7]:
import os

# Directory holding the val2017 image files opened in the next cell.
# Set the COCO_VAL2017_DIR environment variable to point at a different copy;
# when it is unset the original cluster path is used, so existing runs are
# unaffected.
dataset_dir = os.environ.get(
    "COCO_VAL2017_DIR",
    "/scratch/workspace/asureddy_umass_edu-llm_alignment/dataset/val2017",
)

In [12]:
# Paths of the three local images; the trailing comment on each file is the
# caption noted for it.
im1, im2, im3 = (
    f"{dataset_dir}/{filename}"
    for filename in (
        "000000039769.jpg",  # "There are two cats laying down with two remotes"
        "000000500826.jpg",  # "A pole with three road signs stands in front of a building."
        "000000181421.jpg",  # "Three people in a boat with an umbrella in the rain "
    )
)

# The first two images are the demonstrations, the third is the query image.
demo_image_one, demo_image_two, query_image = (
    Image.open(path) for path in (im1, im2, im3)
)

In [17]:
# Few-shot prompt: one "<image>caption<|endofchunk|>" segment per demonstration
# image, then a bare "<image>" for the query image the model should caption.
# NOTE(review): the second caption ends in ".." - looks like a typo; confirm
# before changing, since the prompt text influences the generated output.
query = (
    "<image>There are two cats laying down with two remotes.<|endofchunk|>"
    "<image>A pole with three road signs stands in front of a building..<|endofchunk|>"
    "<image>"
)

In [18]:

"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
# Preprocess each image and stack the results along a new leading (media) axis,
# then insert the batch axis in front and the frame axis after the media axis,
# giving a tensor of shape (1, 3, 1, ...) where the trailing dims come from
# image_processor.
vision_x = torch.stack(
    [
        image_processor(image)
        for image in (demo_image_one, demo_image_two, query_image)
    ],
    dim=0,
)
vision_x = vision_x[None, :, None]

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
lang_x = tokenizer([query], return_tensors="pt")


"""
Step 4: Generate text
"""
# Token ids and attention mask are sent to the GPU alongside the image tensor.
input_ids = lang_x["input_ids"].cuda()
attention_mask = lang_x["attention_mask"].cuda()
generated_text = model.generate(
    vision_x=vision_x.cuda(),
    lang_x=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=20,
    num_beams=3,
)

# The decoded sequence contains the prompt followed by the generated caption.
print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated text:  <image>There are two cats laying down with two remotes.<|endofchunk|><image>A pole with three road signs stands in front of a building..<|endofchunk|><image>A woman in a yellow umbrella sits in a boat on a lake.<|endofchunk|>


In [16]:
# Print only the newly generated continuation, without the prompt.
# The cut is made on token ids rather than on characters of the decoded string:
# slicing the decoded text by len(query) is only right if decoding reproduces
# the prompt character for character and if `query` is the prompt that produced
# `generated_text`.
# Assumes lang_x is the tokenized prompt that was passed to the generate call
# which produced generated_text (they are assigned together in the cell above).
prompt_length = lang_x["input_ids"].shape[1]
print("Generated text: ", tokenizer.decode(generated_text[0][prompt_length:]))

Generated text:   a woman and two children in a boat on a lake.<|endofchunk|>


In [14]:
# icl for captioning task
lang_x = tokenizer(
    ["<image>\ncaption:two cats.<|endofchunk|><image>\ncaption:bathroom sink.<|endofchunk|><image>\ncaption:"],
    return_tensors="pt",
)
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=20,
    num_beams=1,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated text:  <image>
caption:two cats.<|endofchunk|><image>
caption:bathroom sink.<|endofchunk|><image>
caption:Thanksgiving dinner.<|endofchunk|>


In [8]:
# icl for captioning task
lang_x = tokenizer(
    ["<image>\ncaption:two cats.<|endofchunk|><image>\ncaption:bathroom sink.<|endofchunk|><image>\ncaption:"],
    return_tensors="pt",
)
generated_text = model.generate(
    vision_x=vision_x.to('cuda'),
    lang_x=lang_x["input_ids"].to('cuda'),
    attention_mask=lang_x["attention_mask"].to('cuda'),
    max_new_tokens=20,
    num_beams=1,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated text:  <image>
caption:two cats.<|endofchunk|><image>
caption:bathroom sink.<|endofchunk|><image>
caption:Thanksgiving dinner.<|endofchunk|>
