# Feature Extraction
CLIP is used for visual feature extraction and BERT for text feature extraction.

In [1]:
import torch
from PIL import Image
import os
import numpy as np
import pandas as pd

### Load baseline train/test data (the `dev_seen` split is used as the test set)

In [2]:
# Read both baseline splits from their JSON-lines files; the "test" split is dev_seen.
train_df, test_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in ("hateful_memes/train.jsonl", "hateful_memes/dev_seen.jsonl")
)

# Report the number of rows in each split.
print(f"{len(train_df)} train samples")
print(f"{len(test_df)} test samples")

8500 train samples
500 test samples


### Load train/test data augmented with BLIP-2 captions

In [3]:
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
train_df_blip2 = pd.read_pickle("blip2_augmented_fhm_train.pkl")
test_df_blip2 = pd.read_pickle("blip2_augmented_fhm_test.pkl")

# The captions are stored in list form (e.g. ["caption"]); keep only the first element as a plain string.
for blip2_frame in (train_df_blip2, test_df_blip2):
    blip2_frame["blip2_caption"] = blip2_frame["blip2_caption"].apply(lambda captions: captions[0])

# Report the number of rows in each split.
print(f"{len(train_df_blip2)} train samples")
print(f"{len(test_df_blip2)} test samples")

8500 train samples
500 test samples


In [4]:
# Preview the first five rows of the BLIP-2 augmented training data.
train_df_blip2.head(n=5)

Unnamed: 0,gold_hate,img,text,blip2_caption
0,[not_hateful],img/42953.png,its their character not their color that matters,a man with a bald head and a caption of a man ...
1,[not_hateful],img/23058.png,don't be afraid to love again everyone is not ...,a man and woman are standing close together
2,[not_hateful],img/13894.png,putting bows on your pet,a cat with a red bow on its neck
3,[not_hateful],img/37408.png,i love everything and everybody! except for sq...,a dog is sitting on the ground and a picture o...
4,[not_hateful],img/82403.png,"everybody loves chocolate chip cookies, even h...",a man in a suit and tie is holding a chocolate...


In [5]:
# Preview the first five rows of the baseline training data.
train_df.head(n=5)

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


## CLIP
Image features are extracted with the CLIP ViT-B/32 model and saved in a new `img_features` column of each pandas dataframe.

In [33]:
import clip

In [34]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 model together with its matching image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=device)

Apply image feature extraction to the image of every row, for both the train and test sets.

In [35]:
def extract_img_feature(row):
    """Encode the image referenced by ``row["img"]`` with the CLIP image encoder.

    Relies on the notebook-level names ``Image``, ``preprocess``, ``model`` and
    ``device``. A later cell rebinds ``model`` to the BERT model, so this
    function must be run while ``model`` still refers to the CLIP model.

    Args:
        row: Mapping-like object (e.g. a dataframe row) whose ``"img"`` entry is
            the image path relative to the ``hateful_memes`` directory.

    Returns:
        The result of ``model.encode_image`` as a NumPy array on the CPU. The
        input batch has a leading dimension of 1 (added by ``unsqueeze(0)``).
    """
    hateful_memes_dir = "hateful_memes"
    img_path = os.path.join(hateful_memes_dir, row["img"])

    # Open through a context manager so the file handle is released as soon as
    # the image has been preprocessed, instead of staying open for every row.
    with Image.open(img_path) as img:
        img_batch = preprocess(img).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        image_features = model.encode_image(img_batch)

    return image_features.cpu().numpy()

In [None]:
# Compute the CLIP image features row by row and store them in a new column of each split.
for split_df in (train_df, test_df):
    split_df['img_features'] = split_df.apply(extract_img_feature, axis=1)

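The BLIP-2 augmented dataframes contain the same images, so the image features can simply be copied over (this relies on both dataframes having the same index and row order).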

In [37]:
# Column assignment aligns rows on the index, so the copy is only valid when both
# dataframes share the same index and every row refers to the same image.
# Fail loudly instead of silently attaching features to the wrong rows.
for blip2_df, baseline_df in ((train_df_blip2, train_df), (test_df_blip2, test_df)):
    assert blip2_df.index.equals(baseline_df.index), "BLIP-2 and baseline dataframes have different indexes"
    assert (blip2_df["img"] == baseline_df["img"]).all(), "BLIP-2 and baseline rows refer to different images"
    blip2_df['img_features'] = baseline_df['img_features']

## BERT
The base BERT model (`bert-base-uncased`) is used to encode the text. The embedding at the first token position is saved in a new column of the dataframe.

In [39]:
from transformers import BertTokenizer, BertModel

In [40]:
# Tokenizer and model are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# NOTE: this rebinds the notebook-level name `model`, which previously held the CLIP model.
model = BertModel.from_pretrained(BERT_CHECKPOINT).to(device)

In [41]:
def _encode_column(row, column):
    """Encode the string stored in ``row[column]`` with the notebook-level BERT model.

    Relies on the notebook-level names ``tokenizer``, ``model`` and ``device``.

    Args:
        row: Mapping-like object (e.g. a dataframe row) holding the text.
        column: Key of the entry in ``row`` to encode.

    Returns:
        NumPy array (on the CPU) holding the last hidden state at the first
        token position, with the batch dimension kept.
    """
    inputs = tokenizer(row[column], return_tensors='pt', truncation=True, padding=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Keep only the hidden state of the first token of each sequence.
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()


def encode_text(row):
    """Return the BERT embedding of ``row["text"]`` (see ``_encode_column``)."""
    return _encode_column(row, "text")


def encode_caption(row):
    """Return the BERT embedding of ``row["blip2_caption"]`` (see ``_encode_column``)."""
    return _encode_column(row, "blip2_caption")

In [None]:
# Encode the meme text of every row with BERT and store it in a new column of each split.
for split_df in (train_df, test_df):
    split_df['text_features'] = split_df.apply(encode_text, axis=1)

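The text features are also the same, so they can simply be copied over (again relying on both dataframes having the same index and row order).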

In [42]:
# Column assignment aligns rows on the index, so the copy is only valid when both
# dataframes share the same index and every row refers to the same meme.
# Fail loudly instead of silently attaching features to the wrong rows.
for blip2_df, baseline_df in ((train_df_blip2, train_df), (test_df_blip2, test_df)):
    assert blip2_df.index.equals(baseline_df.index), "BLIP-2 and baseline dataframes have different indexes"
    assert (blip2_df["img"] == baseline_df["img"]).all(), "BLIP-2 and baseline rows refer to different images"
    blip2_df['text_features'] = baseline_df['text_features']

In [43]:
# Encode the BLIP-2 caption of every row with BERT and store it in a new column of each split.
for blip2_split_df in (train_df_blip2, test_df_blip2):
    blip2_split_df['blip_features'] = blip2_split_df.apply(encode_caption, axis=1)

## Pickle
Finally, the dataframes are pickled to files for later use.

In [64]:
# Persist the baseline dataframes, now including the extracted feature columns.
for split_df, pickle_path in (
    (train_df, "fhm_clip_bert_features_train.pkl"),
    (test_df, "fhm_clip_bert_features_test.pkl"),
):
    split_df.to_pickle(pickle_path)

In [44]:
# Persist the BLIP-2 augmented dataframes, now including the extracted feature columns.
for blip2_split_df, pickle_path in (
    (train_df_blip2, "fhm_clip_bert_features_blip2_train.pkl"),
    (test_df_blip2, "fhm_clip_bert_features_blip2_test.pkl"),
):
    blip2_split_df.to_pickle(pickle_path)