In [1]:
import torch
import pandas as pd 
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

In [4]:
# Load the Hateful Memes annotation files (JSON Lines: one record per line).
# dev_seen.jsonl is what this notebook refers to as the "test" split.
train_df = pd.read_json("hateful_memes/train.jsonl", lines=True)
test_df = pd.read_json("hateful_memes/dev_seen.jsonl", lines=True)

# Keep only the label, img and text columns. The explicit .copy() makes each
# frame an independent object, so adding a column to it later does not
# trigger pandas' SettingWithCopyWarning.
train_df = train_df[["label", "img", "text"]].copy()
test_df = test_df[["label", "img", "text"]].copy()

print(len(train_df), "train samples")
print(len(test_df), "test samples")

8500 train samples
500 test samples


In [7]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
# Always build a torch.device so `device` has the same type on both paths
# (previously it was a torch.device on GPU but a plain str on CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the LAVIS captioning model (name "blip_caption", type "large_coco")
# with is_eval=True, together with its visual preprocessors; the third
# return value is not used in this notebook section.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="large_coco", is_eval=True, device=device
)

## Apply BLIP captioning (LAVIS `blip_caption`, `large_coco`) to the image in every row of the train and test dataframes; results are stored in the `blip2_caption` column

In [10]:
def img_to_blip(row):
    """Generate a caption for the image referenced by one dataframe row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose "img" entry is
            the image path relative to the "hateful_memes" directory.

    Returns:
        The value returned by ``model.generate`` for this image, unchanged
        (presumably a list of caption strings -- confirm against the LAVIS API).
    """
    hateful_memes_dir = "hateful_memes"
    # Open via a context manager so the file handle is released right away;
    # convert() returns a new, fully loaded image that outlives the handle.
    with Image.open(hateful_memes_dir + "/" + row["img"]) as raw_img:
        rgb_img = raw_img.convert("RGB")
    # Preprocess, add a leading batch dimension, and move to the configured device.
    batch = vis_processors["eval"](rgb_img).unsqueeze(0).to(device)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model.generate({"image": batch}, num_beams=1)
    return output

In [11]:
# Caption every row of both splits and store the result in a new
# 'blip2_caption' column on each dataframe (train first, then test).
for split_df in (train_df, test_df):
    split_df['blip2_caption'] = split_df.apply(img_to_blip, axis=1)

### Save the augmented dataframes to pickle files

In [24]:
# Write each augmented dataframe to its own pickle file (train first, then test).
for pickle_path, split_df in (
    ("blip2_augmented_fhm_train.pkl", train_df),
    ("blip2_augmented_fhm_test.pkl", test_df),
):
    split_df.to_pickle(pickle_path)