In [None]:
import argparse
from open_flamingo import create_model_and_transforms
# grab model checkpoint from huggingface hub
from huggingface_hub import hf_hub_download
import os
import pandas as pd
import torch
import random
from PIL import Image
import time

def inference(demo_images, query, max_new_tokens=20, verbose=True):
    """
    Step 2: Preprocessing images
    Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
    batch_size x num_media x num_frames x channels x height x width. 
    In this case batch_size = 1, num_media = 3, num_frames = 1,
    channels = 3, height = 224, width = 224.
    """
    vision_x = []

    for img in demo_images:
        vision_x.append(image_processor(img).unsqueeze(0))

    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
    
    """
    Step 3: Preprocessing text
    Details: In the text we expect an <image> special token to indicate where an image is.
    We also expect an <|endofchunk|> special token to indicate the end of the text 
    portion associated with an image.
    """
    tokenizer.padding_side = "left" # For generation padding tokens should be on the left
    lang_x = tokenizer(
        [query],
        return_tensors="pt",
    )

    """
    Step 4: Generate text
    """
    generated_text = model.generate(
        vision_x=vision_x.cuda(),
        lang_x=lang_x["input_ids"].cuda(),
        attention_mask=lang_x["attention_mask"].cuda(),
        max_new_tokens=max_new_tokens,
        num_beams=1,
    )
    torch.cuda.empty_cache()

    generated_text = tokenizer.decode(generated_text[0][len(lang_x["input_ids"][0]):])
    if verbose:
        print("### Generated text: ", generated_text)
    return generated_text

In [2]:
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,
    # cache_dir="/scratch/workspace/asureddy_umass_edu-llm_alignment/hf-cache"  # Defaults to ~/.cache
    )
    
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
model.load_state_dict(torch.load(checkpoint_path), strict=False)
model.to("cuda")

print("Model Loaded")

A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- configuration_mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/anas-awadalla/mpt-1b-redpajama-200b:
- mosaic_gpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
  return self.fget.__get__(instance, owner)()


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.
Flamingo model initialized with 1046992944 trainable parameters
Model Loaded


### Few-Shot Prompt
- 0, 2, 4, 8 - 100 classes with 10 examples each

1. 0 - 5.7% (if predicted label is in the substring of the label, exact label is almost zero)
    - here try to change the prompt for decent performance
2. 2 - 20.1%, ,
3. 4 - 19.7%, ,
4. 8 - 20.7%

In [24]:
val_dir = "/scratch/workspace/dsaluru_umass_edu-email/imagenet/imagenet/val"
random.seed(42)

with open("/project/pi_hongyu_umass_edu/zonghai/clinical-llm-alignment/durga_sandeep/icl-on-VLMs/flamingo/classification/imagenet/LOC_synset_mapping.txt", 'r') as f:
    mapping = f.readlines()
file_to_label_dict = {x.split(" ")[0] : (" ".join(x.replace("\n", "").lower().split(" ")[1:])).split(", ") for x in mapping}
all_folder_names = list(file_to_label_dict.keys())

final_folder_names = random.sample(all_folder_names, 200)
final_folder_names = final_folder_names[:100]
final_folder_names[:10]

['n03769881',
 'n01945685',
 'n01629819',
 'n04069434',
 'n02123045',
 'n02110185',
 'n02105505',
 'n02033041',
 'n04041544',
 'n01877812']

In [25]:
folder_to_files = {}
folder_to_other_folders = {}

for folder in final_folder_names:
    folder_dir = f"{val_dir}/{folder}/"
    other_folders = [x for x in final_folder_names if x != folder]
    folder_to_other_folders[folder] = other_folders
    all_folder_files = os.listdir(folder_dir)
    folder_to_files[folder] = random.sample(all_folder_files, 10)

In [21]:
# save_df = []
# for f in folder_to_files:
#     for ele in folder_to_files[f]:
#         save_df.append([f, ele])
# pd.DataFrame(save_df, columns=['folder', 'filename']).to_pickle("rice_embeddings_file_1k_samples.pickle")

In [55]:
n_few_shot = 8
count = 0


print("######### Getting few-shot Prompt")
test_set = []

for folder in final_folder_names:
    other_folders = folder_to_other_folders[folder]
    target_label = file_to_label_dict[folder]
    for idx in range(10):
        if count%100==0:
            print(count)
        random.seed(count+1)
        pick_n_folders = random.sample(other_folders, n_few_shot)
        pick_n_files = []
        pick_n_files_names = []
        for ele in pick_n_folders:
            ex_file_name = random.sample(os.listdir(f"{val_dir}/{ele}/"), 1)[0]
            ex_label = file_to_label_dict[ele]

            ex_image_path = f"{val_dir}/{ele}/{ex_file_name}"
            ex_demo_image = Image.open(ex_image_path)
            pick_n_files.append((ex_demo_image, ex_label[0]))
            pick_n_files_names.append(ex_file_name)
        
        curr_demo_images = []
        few_shot_query =f""""""
        for k in range(n_few_shot):
            curr_demo_images.append(pick_n_files[k][0])
            few_shot_query += f"<image>\nQuestion: Classify the image into one of the imagenet1k label.\nAnswer: {pick_n_files[k][1]} |<endofchunk>|\n\n"
        
        
        # add test sample
        curr_image_path = f"{val_dir}/{folder}/{folder_to_files[folder][idx]}"
        curr_demo_image = Image.open(curr_image_path)
        curr_demo_images.append(curr_demo_image)
        few_shot_query += "<image>\nQuestion: Classify the image into one of the imagenet1k label.\nAnswer:"
        
        test_set.append([folder, target_label, folder_to_files[folder][idx], pick_n_folders, pick_n_files_names, few_shot_query, curr_demo_images])
        count += 1
print("Length of test data : ", count)

test_df = pd.DataFrame(test_set, columns = ['folder', 'target_label', "filename", 'pick_n_folders', 'pick_n_files_names', 'few_shot_query', 'demo_images'])

test_df.drop(['demo_images'], axis=1).to_pickle("./data/imagenet_sample_test_dataset_1k_with_random_eight_shot_prompt.pickle")
# test_df = test_df.sample(n=4000, random_state=2024)

######### Getting few-shot Prompt
0
100
200
300
400
500
600
700
800
900
Length of test data :  1000


In [None]:
print("######### Running inference")

start = time.time()

all_responses = []
for i in range(len(test_df)):
    if i%200 == 0:
        print(f"############### {i} values are processed")

    sample = test_df.iloc[i]
    curr_generated_response = inference(demo_images=sample['demo_images'], query=sample['few_shot_query'], max_new_tokens=10, verbose=False)
    all_responses.append(curr_generated_response)
    torch.cuda.empty_cache()
end = time.time()

print("############# Total time taken for inference : ", (end-start))

######### Running inference
############### 0 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 200 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 400 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 600 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

############### 800 values are processed


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [54]:
test_df['raw_responses'] = all_responses
test_df.drop(['demo_images'], axis=1).to_pickle("./data/imagenet_sample_test_dataset_1k_with_random_eight_shot_prompt.pickle")
test_df['predicted_label'] = test_df['raw_responses'].apply(lambda x: x.split(" |")[0][1:])
acc = test_df[test_df.apply(lambda x: 1 if (x['predicted_label'] in x['target_label']) else 0, 1) == 1].shape[0]/len(test_df)
print(f"Accuracy at {4} shot: ", acc)
test_df.drop(['demo_images'], axis=1).to_pickle("./data/imagenet_sample_test_dataset_1k_with_random_eight_shot_prompt.pickle")
print("Saved at ./data/imagenet_sample_test_dataset_1k_with_random_eight_shot_prompt.pickle")

Accuracy at 4 shot:  0.208
Saved at ./data/imagenet_sample_test_dataset_1k_with_random_eight_shot_prompt.pickle


In [44]:
# test_df['predicted_label'] = test_df['raw_responses'].apply(lambda x: x[len("The image is a "):].split(".")[0].lower().replace("picture of a ", ""))
# acc = test_df[test_df.apply(lambda x: 1 if (x['predicted_label'] in " ".join(x['target_label'])) else 0, 1) == 1].shape[0]/len(test_df)
# print(f"Accuracy at {4} shot: ", acc)

Accuracy at 4 shot:  0.057


In [45]:
# test_df['predicted_label']

In [41]:
test_df[['predicted_label', 'target_label']]

Unnamed: 0,predicted_label,target_label
0,bus,[minibus]
1,school bus,[minibus]
2,ford transit van,[minibus]
3,police van,[minibus]
4,van,[minibus]
...,...,...
995,bird on a,[oil filter]
996,filter for a car air filter,[oil filter]
997,fuel filter,[oil filter]
998,car engine,[oil filter]


In [15]:
import pandas as pd
df = pd.read_pickle("./data/imagenet_sample_test_dataset_1k_with_zero_shot_prompt.pickle")
df.shape

(10000, 8)

In [17]:
#df['predicted_label'] = df['raw_responses'].apply(lambda x: x[1:].split(" ")[0])
df['predicted_label'] = df['raw_responses'].apply(lambda x: x.split(" |")[0][1:])
acc = df[df.apply(lambda x: 1 if (x['predicted_label'] in x['target_label']) else 0, 1) == 1].shape[0]/len(df)
print(f"Accuracy at {2} shot: ", acc)

Accuracy at 2 shot:  0.2025
