# Fine-tuning ROBERTA + VIT for binary visual question answering (VQA)

![ViT architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vit_architecture.jpg)

* ViT paper: https://arxiv.org/pdf/2010.11929.pdf
* ViT docs: https://huggingface.co/docs/transformers/model_doc/vit
* ROBERTA paper: https://arxiv.org/pdf/1907.11692.pdf
* ROBERTA docs: https://huggingface.co/docs/transformers/model_doc/roberta

## Set-up environment: Imports and Configurations

### Import libraries

In [151]:
# !pip install -qqq easy-vqa
# !pip install -qqq sentence_transformers transformers timm

In [None]:
# !pip install wandb
# !pip install -q evaluate rouge_score
# !pip install plotly

In [355]:
import os
import math
import warnings
import json
import numpy as np
import pandas as pd
import evaluate
import requests
import torch
import random
import argparse
import torchvision
from torch import nn
from PIL import Image
from tqdm import tqdm
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from torchvision.io import read_image
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import ViTImageProcessor, ViTForImageClassification
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModel, RobertaModel, RobertaTokenizer
import torchvision.transforms as T
from torchvision import transforms
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

In [343]:
DATA_PATH = '/home/jovyan/ars/vqa-research/VQA/data/'
ANNOTATIONS_PATH = DATA_PATH + 'abstract_v002_train2017_annotations.json'
QUESTIONS_PATH = DATA_PATH + 'OpenEnded_abstract_v002_train2017_questions.json'
VQA_TRAIN_PATH = DATA_PATH + 'VQA_train.csv'
IMAGE_DIR = DATA_PATH + 'scene_img_abstract_v002_train2017/'
IMAGE_PREFIX = 'abstract_v002_train2015_'
IMAGE_FORMAT = '.png'

In [344]:
SEED = 1234
warnings.filterwarnings('ignore')
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [345]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

Device: cuda


## Preprocess data

In [346]:
vqa_data = pd.read_csv(VQA_PATH, index_col=0)

with open(ANNOTATIONS_PATH) as f:
    annotations = json.load(f)['annotations']

with open(QUESTIONS_PATH) as f:
    questions = json.load(f)['questions']

In [347]:
def to_lowercase(vqa_data):
    X = vqa_data.copy()
    X[['question', 'answer']].apply(lambda x: x.str.lower(), axis=0)
    return X

In [348]:
vqa_data = to_lowercase(vqa_data)

In [349]:
def add_img_paths(vqa_data, img_dir, prefix, format='.png'):
    X = vqa_data.copy()
    X['image_path'] = \
        img_dir + \
        prefix + \
        (12 - vqa_data['image_id'].astype('str').str.len()).apply(lambda x: x * '0') + \
    vqa_data['image_id'].astype('str') + format
    return X

In [350]:
vqa_data = add_img_paths(vqa_data, IMG_DIR, IMG_PREFIX, IMG_FORMAT)

In [351]:
vqa_data.shape

(22055, 5)

In [352]:
vqa_data['label'] = vqa_data['answer'].apply(lambda x: 1 if x == 'yes' else 0)

In [353]:
vqa_data.head()

Unnamed: 0,image_id,question,question_id,answer,image_path,label
0,87,Is the boy having wine?,870,yes,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,1
1,900000870,Is the boy having wine?,900000870,no,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,0
2,14962,Is it night time?,149620,no,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,0
3,900149620,Is it night time?,900149620,yes,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,1
4,8277,Is the boy hanging from monkey bars?,82771,yes,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,1


In [354]:
del vqa_data['image_id']
del vqa_data['question_id']

train_df, temp_data = train_test_split(vqa_data, test_size=0.3, random_state=42)
test_df, eval_df = train_test_split(temp_data, test_size=0.5, random_state=42)

train_df.shape, test_df.shape, eval_df.shape

((15438, 4), (3308, 4), (3309, 4))

In [314]:
test_df = test_df.reset_index()
del test_df['index']
test_df.head()

Unnamed: 0,question,answer,image_path,label
0,Are they feeding the birds?,yes,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,1
1,Is there a flower vase on the table?,no,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,0
2,Are there two bookcases?,no,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,0
3,Are both people wearing white?,no,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,0
4,Is there a pond?,no,/home/jovyan/ars/vqa-research/VQA/data/scene_i...,0


## Load encoders for visual and text modalities

In [340]:
### ROBERTA for text ###
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
text_encoder = RobertaModel.from_pretrained(model_name)

for p in text_encoder.parameters():
    p.requires_grad = False

### VIT for text ###
image_processor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
image_encoder = AutoModel.from_pretrained("google/vit-base-patch16-224-in21k")

for p in image_encoder.parameters():
    p.requires_grad = False


image_encoder.to(device)
text_encoder.to(device)
print()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.





## Create Dataset with pairs text-images

In [327]:
class RobertaVitDataset(Dataset):

    def __init__(self,df,
                 image_encoder,
                 text_encoder,
                 image_processor,
                 tokenizer,
              ):
        self.df = df
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.image_processor = image_processor
        self.tokenizer = tokenizer


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        image_file = self.df["image_path"][idx]
        question = self.df['question'][idx]
        image = Image.open(image_file).convert("RGB")
        label = self.df['label'][idx]

        # image = resize_transform(image)
        # image_inputs = T.ToTensor()(image).unsqueeze_(0)
        # image_inputs = image_inputs.to(device)
        # image_outputs = self.image_encoder(image_inputs)
        # image_embedding = image_outputs[0]
        # image_embedding = image_embedding.detach()
        # print("Image emb", image_embedding.shape)

        image_inputs = self.image_processor(image, return_tensors="pt")
        image_inputs = {k:v.to(device) for k,v in image_inputs.items()}
        image_outputs = self.image_encoder(**image_inputs)
        image_embedding = image_outputs.pooler_output
        image_embedding = image_embedding.view(-1)
        image_embedding = image_embedding.detach()
        # print("Image emb", image_embedding.shape)

        text_inputs = self.tokenizer(question, return_tensors="pt")
        text_inputs = {k:v.to(device) for k,v in text_inputs.items()}
        text_outputs = self.text_encoder(**text_inputs)
        text_embedding = text_outputs.pooler_output 
        text_embedding = text_embedding.view(-1)
        text_embedding = text_embedding.detach()
        # print("Text emb", text_embedding.shape)

        encoding={}
        encoding["image_emb"] = image_embedding
        encoding["text_emb"] = text_embedding
        encoding["label"] = torch.tensor(label)

        return encoding

In [328]:
train_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)

train_dataset = RobertaVitDataset(
                           df=train_df,
                           image_encoder = image_encoder,
                           text_encoder = text_encoder,
                           tokenizer = tokenizer,
                           image_processor = image_processor, 
                           )

eval_dataset = RobertaVitDataset(
                           df=eval_df,
                           image_encoder = image_encoder,
                           text_encoder = text_encoder,
                           tokenizer = tokenizer,
                           image_processor = image_processor,
                          )

In [329]:
batch_size = 32
eval_batch_size = 32
dataloader_train = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=batch_size)
dataloader_validation = DataLoader(eval_dataset,
                                   sampler=SequentialSampler(eval_dataset),
                                   batch_size=eval_batch_size)

In [331]:
criterion = nn.CrossEntropyLoss()%

## Initialize evaluate and train

In [383]:
def evaluate(dataloader_val):

    model.eval()
    loss_val_total = 0
    predictions, true_vals, confidence = [], [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch.values())

        inputs = {'image_emb':  batch[0],'text_emb': batch[1]}
        # print("PRINT IMG EMB")
        # print(batch[0].shape)
        # print(batch[0])
        # print("PRINT TEXT EMB")
        # print(batch[1].shape)
        # print(batch[1])
        # break

        with torch.no_grad():
            outputs = model(**inputs)

        labels =  batch[2]
        loss = criterion(outputs.view(-1, 13), labels.view(-1))
        loss_val_total += loss.item()
        probs   = torch.max(outputs.softmax(dim=1), dim=-1)[0].detach().cpu().numpy()
        outputs = outputs.argmax(-1)
        logits = outputs.detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        confidence.append(probs)

    loss_val_avg = loss_val_total/len(dataloader_val)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    confidence = np.concatenate(confidence, axis=0)

    return loss_val_avg, predictions, true_vals, confidence

In [None]:
def train():
    log_hdr  = "Epoch, train_loss, train_acc, val_loss, val_acc"
    train_f1s = []
    val_f1s = []
    train_losses = []
    val_losses = []
    min_val_loss = -1
    max_auc_score = 0
    epochs_no_improve = 0
    early_stopping_epoch = 3
    early_stop = False

    for epoch in tqdm(range(1, epochs+1)):
        model.train()
        loss_train_total = 0
        train_predictions, train_true_vals = [], []

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch.values())

            inputs = {'image_emb':  batch[0],'text_emb': batch[1]}
            labels =  batch[2]

            outputs = model(**inputs)
            loss = criterion(outputs.view(-1, 13), labels.view(-1))
            loss_train_total += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            logits = outputs.argmax(-1)
            logits = logits.detach().cpu().numpy()
            label_ids = labels.cpu().numpy()
            train_predictions.append(logits)
            train_true_vals.append(label_ids)

            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})



        train_predictions = np.concatenate(train_predictions, axis=0)
        train_true_vals = np.concatenate(train_true_vals, axis=0)

        tqdm.write(f'\nEpoch {epoch}')
        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')
        train_f1 = accuracy_score_func(train_predictions, train_true_vals)
        tqdm.write(f'Train Acc: {train_f1}')

        val_loss, predictions, true_vals,_ = evaluate(dataloader_validation)
        val_f1 = accuracy_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'Val Acc: {val_f1}')

        if val_f1 >= max_auc_score:
            tqdm.write('\nSaving best model')
            torch.save(model.state_dict(), f'/home/jovyan/ars/vqa-research/VQA/ROBERTA_VIT_model/models/easyvqa_finetuned_epoch_{epoch}.model')
            max_auc_score = val_f1

        train_losses.append(loss_train_avg)
        val_losses.append(val_loss)
        train_f1s.append(train_f1)
        val_f1s.append(val_f1)
        log_str  = "{}, {}, {}, {}, {}".format(epoch, loss_train_avg, train_f1, val_loss, val_f1)

        if min_val_loss < 0:
            min_val_loss = val_loss
        else:
            if val_loss < min_val_loss:
                min_val_loss = val_loss
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= early_stopping_epoch:
                    early_stop = True
                    break
                else:
                    continue


    if early_stop:
        print("Early Stopping activated at epoch -", epoch )
        print("Use the checkpoint at epoch - ", epoch - early_stopping_epoch)

    return train_losses, val_losses

## Our fusion network

### Get text-emb from roberta and img-emb from vit and concatenate isung linear layers

In [359]:
class RobertaVitFusionNetwork(nn.Module):
    def __init__(self, hyperparms=None):

        super(RobertaVitFusionNetwork, self).__init__()
        self.dropout = nn.Dropout(0.3)
        self.vision_projection = nn.Linear(2048, 768)
        self.text_projection = nn.Linear(512, 768)
        self.fc1 = nn.Linear(768, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.classifier = nn.Linear(256, 13)
        W = torch.Tensor(768, 768)
        self.W = nn.Parameter(W)
        self.relu_f = nn.ReLU()
        nn.init.kaiming_uniform_(self.W, a=math.sqrt(5))

    def forward(self, image_emb, text_emb):

        x1 = image_emb
        x1 = torch.nn.functional.normalize(x1, p=2, dim=1)
        Xv = self.relu_f(self.vision_projection(x1))

        x2 = text_emb
        x2 = torch.nn.functional.normalize(x2, p=2, dim=1)
        Xt = self.relu_f(self.text_projection(x2))

        Xvt = Xv * Xt
        Xvt = self.relu_f(torch.mm(Xvt, self.W.t()))

        Xvt = self.fc1(Xvt)
        Xvt = self.bn1(Xvt)
        Xvt = self.dropout(Xvt)
        Xvt = self.classifier(Xvt)

        return Xvt

In [360]:
# torch.cuda.empty_cache()
model = RobertaVitFusionNetwork()
model.to(device)

RobertaVitFusionNetwork(
  (dropout): Dropout(p=0.3, inplace=False)
  (vision_projection): Linear(in_features=2048, out_features=768, bias=True)
  (text_projection): Linear(in_features=512, out_features=768, bias=True)
  (fc1): Linear(in_features=768, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (classifier): Linear(in_features=256, out_features=13, bias=True)
  (relu_f): ReLU()
)

## Create optimizer and scheduler

In [336]:
optimizer = AdamW(model.parameters(),
                  lr=5e-5,
                  weight_decay = 1e-5,
                  eps=1e-8
                  )

epochs = 10
train_steps=20000
print("train_steps", train_steps)
warm_steps = train_steps * 0.1
print("warm_steps", warm_steps)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warm_steps,
                                            num_training_steps=train_steps)

train_steps 20000
warm_steps 2000.0


## Start train model

In [None]:
train_losses, val_losses =  train()
torch.cuda.empty_cache()
plt.plot(train_losses)
plt.plot(val_losses)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## Evaluate model

In [None]:
test_dataset = EasyQADataset(
                           df=test_df,
                           image_encoder = image_encoder,
                           text_encoder = text_encoder,
                           tokenizer = tokenizer,
                           image_processor = image_processor
                           )

In [122]:
device = "cuda:0"
model.load_state_dict(torch.load('/home/jovyan/ars/vqa-research/VQA/ROBERTA_VIT_model/models/easyvqa_finetuned_epoch_4.model'))
model.to(device)

EasyQAMidFusionNetwork(
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=768, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (classifier): Linear(in_features=256, out_features=13, bias=True)
  (relu_f): ReLU()
)

### Evaluating Test Dataset

In [406]:
dataloader_test = DataLoader(test_dataset,
                            sampler=SequentialSampler(test_dataset),
                            batch_size=128)

_, preds, truths, confidence = evaluate(dataloader_test)

In [125]:
print(_,preds,truths,confidence)

1.3529989077494695 [0 0 0 ... 1 0 0] [1 0 0 ... 0 1 0] [0.29199582 0.27244446 0.26816538 ... 0.2706765  0.28277642 0.32675567]


In [364]:
# Calculate accuracy
def calc_accuracy(preds, truths):
    correct_predictions = sum([1 for p, g in zip(preds, truths) if p == g])
    total_predictions = len(preds)
    accuracy = correct_predictions / total_predictions
    return accuracy


print(f"Accuracy: {calc_accuracy(preds, truths) * 100:.2f}%")

Accuracy: 49.06%


In [365]:
def convert_array(array):
    result = []
    length = len(array)

    for i in range(0, length, 8):
        sub_array = ["no" if value == 0 else "yes" for value in array[i:i+8]]
        result.append(sub_array)

    return result

new_preds = convert_array(preds)
new_truths = convert_array(truths)

In [366]:
rouge_metric = evaluate.load('rouge')

In [367]:
for i in range(len(new_preds)):
    rouge_metric.add_batch(predictions=new_preds[i], references=new_truths[i])

In [407]:
rouge_metric.compute()