In [1]:
import argparse
from open_flamingo import create_model_and_transforms
# grab model checkpoint from huggingface hub
from huggingface_hub import hf_hub_download
import os
import pandas as pd
import torch
import random
from PIL import Image
import time

def inference(demo_images, query, max_new_tokens=20, verbose=True):
    """
    Step 2: Preprocessing images
    Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
    batch_size x num_media x num_frames x channels x height x width. 
    In this case batch_size = 1, num_media = 3, num_frames = 1,
    channels = 3, height = 224, width = 224.
    """
    vision_x = []

    for img in demo_images:
        vision_x.append(image_processor(img).unsqueeze(0))

    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
    
    """
    Step 3: Preprocessing text
    Details: In the text we expect an <image> special token to indicate where an image is.
    We also expect an <|endofchunk|> special token to indicate the end of the text 
    portion associated with an image.
    """
    tokenizer.padding_side = "left" # For generation padding tokens should be on the left
    lang_x = tokenizer(
        [query],
        return_tensors="pt",
    )

    """
    Step 4: Generate text
    """
    generated_text = model.generate(
        vision_x=vision_x.cuda(),
        lang_x=lang_x["input_ids"].cuda(),
        attention_mask=lang_x["attention_mask"].cuda(),
        max_new_tokens=max_new_tokens,
        num_beams=1,
    )
    torch.cuda.empty_cache()

    generated_text = tokenizer.decode(generated_text[0][len(lang_x["input_ids"][0]):])
    if verbose:
        print("### Generated text: ", generated_text)
    return generated_text



In [2]:
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,
    # cache_dir="/scratch/workspace/asureddy_umass_edu-llm_alignment/hf-cache"  # Defaults to ~/.cache
    )
    
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
model.load_state_dict(torch.load(checkpoint_path), strict=False)
model.to("cuda")

print("Model Loaded")

A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- configuration_mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
  return self.fget.__get__(instance, owner)()


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters
Model Loaded


### Few-Shot Prompt
- 0, 2, 4, 8 - 100 classes with 10 examples each

In [3]:
test_data = pd.read_pickle("stanford_cars_test_data_1k_samples.pickle")
print(test_data.shape)
test_data.head(3)

(1000, 4)


Unnamed: 0,image,class,true_class_name,image_path
2478,02479.jpg,83,Dodge Caliber Wagon 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...
7215,07216.jpg,83,Dodge Caliber Wagon 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...
3898,03899.jpg,83,Dodge Caliber Wagon 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...


In [4]:
random_data = pd.read_pickle("stanford_cars_rice_data_1k_samples.pickle")
print(random_data.shape)
random_data.head(4)

(96, 4)


Unnamed: 0,image,class,true_class_name,image_path
7585,07586.jpg,14,Audi TTS Coupe 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...
6405,06406.jpg,3,Acura TL Sedan 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...
6227,06228.jpg,106,Ford F-450 Super Duty Crew Cab 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...
4210,04211.jpg,96,Dodge Charger Sedan 2012,/scratch/workspace/dsaluru_umass_edu-email/sta...


In [5]:
import random


In [6]:
n_few_shot = 0
count = 0

test_set = []
for test_idx in range(len(test_data)):

    if test_idx%100 == 0:
        print(f"######################## {test_idx} processed")
    sample = test_data.iloc[test_idx]
    
    target_label = sample['true_class_name']

    # random demonstrations
    random.seed(count)
    indexes = random.sample(list(range(len(random_data))), n_few_shot)
    pick_n_files = []
    pick_n_files_names = []
    for rd_idx in indexes:
        ex_image_path = random_data.iloc[rd_idx]['image_path']
        pick_n_files_names.append(ex_image_path)
        ex_label = random_data.iloc[rd_idx]['true_class_name']
        ex_demo_image = Image.open(ex_image_path)
        pick_n_files.append((ex_demo_image, ex_label))
    
    curr_demo_images = []
    few_shot_query =f""""""
    for k in range(n_few_shot):
        curr_demo_images.append(pick_n_files[k][0])
        few_shot_query += f"<image>\nQuestion: Identify and classify the car in the provided image. Provide the label in the exact format: [Make] [Model] [Year].\nAnswer: {pick_n_files[k][1]} |<endofchunk>|\n\n"
    
    
    # add test sample
    curr_image_path = sample['image_path']
    curr_demo_image = Image.open(curr_image_path)
    curr_demo_images.append(curr_demo_image)
    few_shot_query += "<image>\nQuestion: Identify and classify the car in the provided image. Provide the label in the exact format: [Make] [Model] [Year].\nAnswer:"
    
    test_set.append([test_idx, target_label, pick_n_files_names, few_shot_query, curr_demo_images])
    count += 1

######################## 0 processed
######################## 100 processed
######################## 200 processed
######################## 300 processed
######################## 400 processed
######################## 500 processed
######################## 600 processed
######################## 700 processed
######################## 800 processed
######################## 900 processed


In [7]:

test_df = pd.DataFrame(test_set, columns = ['index', 'target_label', 'pick_n_files_names', 'few_shot_query', 'demo_images'])
print(test_df.shape)
test_df.head(2)

(1000, 5)


Unnamed: 0,index,target_label,pick_n_files_names,few_shot_query,demo_images
0,0,Dodge Caliber Wagon 2012,[],<image>\nQuestion: Identify and classify the c...,[<PIL.JpegImagePlugin.JpegImageFile image mode...
1,1,Dodge Caliber Wagon 2012,[],<image>\nQuestion: Identify and classify the c...,[<PIL.JpegImagePlugin.JpegImageFile image mode...


### test the prompt

In [8]:
sample = test_df.iloc[5]
curr_generated_response = inference(demo_images=sample['demo_images'], query=sample['few_shot_query'], max_new_tokens=15, verbose=False)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [9]:
sample['target_label']

'Dodge Caliber Wagon 2012'

In [10]:
print(curr_generated_response)

 The car in the image is a 2007 Dodge Caliber.<|endofchunk|>


In [11]:
print(sample['few_shot_query'])

<image>
Question: Identify and classify the car in the provided image. Provide the label in the exact format: [Make] [Model] [Year].
Answer:


### Run the model

In [12]:
print("######### Running inference")

start = time.time()

all_responses = []
for i in range(len(test_df)):
    if i%200 == 0:
        print(f"############### {i} values are processed")

    sample = test_df.iloc[i]
    curr_generated_response = inference(demo_images=sample['demo_images'], query=sample['few_shot_query'], max_new_tokens=15, verbose=False)
    all_responses.append(curr_generated_response)
    torch.cuda.empty_cache()
end = time.time()

print("############# Total time taken for inference : ", (end-start))

######### Running inference
############### 0 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 200 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 400 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 600 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 800 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############# Total time taken for inference :  2205.1143429279327


In [13]:
test_df['raw_responses'] = all_responses
test_df.drop(['demo_images'], axis=1).to_pickle("random_zero_shot.pickle")
test_df['predicted_label'] = test_df['raw_responses'].apply(lambda x: x.split(" |")[0][1:])
acc = test_df[test_df.apply(lambda x: 1 if (x['predicted_label'] in x['target_label']) else 0, 1) == 1].shape[0]/len(test_df)
print(f"Accuracy at {4} shot: ", acc)
test_df.drop(['demo_images'], axis=1).to_pickle("random_zero_shot.pickle")

Accuracy at 4 shot:  0.0


In [22]:
.value_counts()

raw_responses
Ford F-150                                                 15
Aston Martin DB9                                           12
Buick Enclave                                              12
Chevrolet Impala                                           11
Hyundai Sonata                                             11
                                                           ..
The vehicle in the image is a white Dodge Grand Caravan     1
white Dodge Grand Caravan                                   1
 Chrysler Town & Country                                    1
2005 Buick Enclave                                          1
2006 Nissan Altima                                          1
Name: count, Length: 481, dtype: int64

In [24]:
test_df['predicted_label'] = test_df['raw_responses'].apply(lambda x: x.replace("The car is a", "").replace("The car in the image is a ", "").split(".")[0])
acc = test_df[test_df.apply(lambda x: 1 if (x['predicted_label'] == x['target_label']) else 0, 1) == 1].shape[0]/len(test_df)
print(f"Accuracy at zero shot: ", acc)

Accuracy at zero shot:  0.0


### Random few-shot performance

1. 0 - 
2. 2 - 
3. 4 - 
4. 8 - 