# Joint Embedding model on the Vizwiz VQA dataset

This code is based on this great article here:

https://medium.com/data-science-at-microsoft/visual-question-answering-with-multimodal-transformers-d4f57950c867

With (not limited to) the following modifications:
- Adapted to work with the Vizwiz VQA dataset
- Several model changes, the important ones are initialization and the loss function (Focal Loss)
- Focal loss has the option to add uniform label smoothing
- Different way of selecting target label
- Added script to load external data (COCO)
- Added the VQA accuracy metric and can be computed while training

Be aware that:
- This code has a cpu bottleneck and should be faster if solved, for example, preprocessing the images in another script instead of preprocessing it on the fly.
- For convience some parts of this code relies on global variables such as answer_space and the indices of the validation dataframe.

In [1]:
# Path configs
from pathlib import Path

ANNOTATIONS_BASE_PATH = "/kaggle/input/vizwiz/Annotations/Annotations/"
IMAGES_TRAIN_PATH = "/kaggle/input/vizwiz/train/train/"
IMAGES_VAL_PATH = "/kaggle/input/vizwiz/val/val/"
ANSWER_SPACE_EMBED_PATH = "/kaggle/input/answer-space-embed/answer_embed.pt"
ANNOTATIONS_TRAIN_PATH = str(Path(ANNOTATIONS_BASE_PATH)/"train.json")
ANNOTATIONS_VAL_PATH = str(Path(ANNOTATIONS_BASE_PATH)/"val.json")

ANSWER_SPACE_SIZE = 3000

In [2]:
import os
from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from datasets import load_dataset, set_caching_enabled
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import pandas as pd
from transformers import (
    # Preprocessing / Common
    AutoTokenizer, AutoFeatureExtractor,
    # Text & Image Models (Now, image transformers like ViTModel, DeiTModel, BEiT can also be loaded using AutoModel)
    AutoModel,            
    # Training / Evaluation
    TrainingArguments, Trainer,
    # Misc
    logging
)

from sklearn.metrics import accuracy_score, f1_score
import wandb

In [3]:
# SET CACHE FOR HUGGINGFACE TRANSFORMERS + DATASETS
os.environ['HF_HOME'] = os.path.join(".", "cache")
# SET ONLY 1 GPU DEVICE
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

set_caching_enabled(True)
logging.set_verbosity_error()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
#     print('Memory Usage:')
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

cuda:0
Tesla P100-PCIE-16GB


  


In [4]:
import json
import datetime
from collections import Counter

class VQAData:
    def __init__(self, annotation_fp, image_base_path):
        # Read in json file as pd.dataframe
        self.df = pd.read_json(annotation_fp,orient="records")
        print("Read in file: {}".format(annotation_fp))
        print("File has shape: {}".format(self.df.shape))
        
        def most_common(lst):
            data = Counter(lst)
            return max(lst, key=data.get)
        
        # max_answer is the most common answer from the ten answers provided
        # 
        self.df["max_answer"] = self.df["answers"].apply(lambda row:most_common([ans["answer"] for ans in row]))
        self.df["max_answer_confidence"] = self.df["answers"].apply(lambda row:most_common([ans["answer_confidence"] for ans in row]))
        self.df["answer_list"] = self.df["answers"].apply(lambda row:[ans["answer"] for ans in row])
        image_base_path = Path(image_base_path)
        self.df["image_path"] = self.df["image"].apply(lambda row:str(image_base_path/row))
        
    def get_df(self,fields=["image_path","question","max_answer"], answer_type=None):
        if(answer_type!=None):
            _df = self.df[self.df["answer_type"]==answer_type].reset_index(drop=True)
        else:
            _df = self.df.copy()
        return(_df[fields])

In [5]:
vqa_train = VQAData(ANNOTATIONS_TRAIN_PATH,IMAGES_TRAIN_PATH)
vqa_val = VQAData(ANNOTATIONS_VAL_PATH,IMAGES_VAL_PATH)
train_df = vqa_train.get_df(fields=["image_path","question","max_answer","answer_list"])
val_df = vqa_val.get_df(fields=["image_path","question","max_answer","answer_list"])

Read in file: /kaggle/input/vizwiz/Annotations/Annotations/train.json
File has shape: (20523, 5)
Read in file: /kaggle/input/vizwiz/Annotations/Annotations/val.json
File has shape: (4319, 5)


### Why cast "unsuitable image" -> "unsuitable"?
In the paper that presented the VizWiz dataset, it defined 
- Unsuitable Image: an image is too poor in quality to answer the question (i.e., all white, all black, or too blurry)
- Unanswerable: the question cannot be answered from the image

But there is a high portion of answers labeled with "unsuitable" which doesn't lay in either category and is a significant noise to the model causing the accuracy to degrade. (Simply thought, even if the model is 100% sure that this is a unsuitable image, it still has a 50% to get it wrong because of the mixing of "unsuitable image" and "unsuitable" labels")

In this notebook we modify both the train/validation set labels to get a clear idea of how the model is training, but didn't modify the final test labels for fair comparison.

In [6]:
# Cast "unsuitable image" -> "unsuitable"
def clean_answer(word):
    if word == "unsuitable image":
        return "unsuitable"
    else:
        return word
train_df["answer_list"] = train_df["answer_list"].apply(lambda ans:[clean_answer(w) for w in ans])
val_df["answer_list"] = val_df["answer_list"].apply(lambda ans:[clean_answer(w) for w in ans])

### Why is ANSWER_SPACE_SIZE defaulted to 3000?
We treat the VizWiz VQA task as a multiple-choice problem with a predefined answer space of the 3000 most frequent words, if the label doesn't exist in this set we assign it a **label "\<Unknown\>"**.

When selecting 3000 as the ANSWER_SPACE_SIZE, 96.6% of the training data and 96.7% of the validation data has at least one answer in the answer space, and the max accuracy achievable on the validation set 88.13%. *(Note that the formula for VQA accuracy is different from standard accuracy)*. We see this as an acceptable trade off since increasing the answer space without additional data could make it more difficult for the model to learn and converge.

In [7]:
total_answer_counts = train_df["answer_list"].append(val_df["answer_list"]).explode().value_counts()
print("The total count of answer types are: {}".format(total_answer_counts.shape))
print("Picking the top {} as answer space (and +1 for unknown).".format(ANSWER_SPACE_SIZE))

answer_space = total_answer_counts.head(ANSWER_SPACE_SIZE).index.to_list()
answer_space.append("<Unknown>")

The total count of answer types are: (48729,)
Picking the top 3000 as answer space (and +1 for unknown).


### Load external vqa data into dataset
-----
The following script loads the COCO dataset to augment the training data. Is not used because it significantly prolongs the training time.

In [8]:
# !wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip -P data/
# !wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip -P data/
# !unzip -o -q ./data/v2_Questions_Train_mscoco.zip -d data/
# !unzip -o -q ./data/v2_Annotations_Train_mscoco.zip -d data/

In [9]:
# VQA_IMAGES_TRAIN_PATH = "/kaggle/input/coco-2014-dataset-for-yolov3/coco2014/images/train2014"
# with open('./data/v2_OpenEnded_mscoco_train2014_questions.json') as fh:
#     vqa_train_question_dict = json.load(fh)
# with open('./data/v2_mscoco_train2014_annotations.json') as fh:
#     vqa_train_annotation_dict = json.load(fh)
    
# vqa_image_base_path = Path(VQA_IMAGES_TRAIN_PATH)
# vqa_train_question_df = pd.DataFrame(vqa_train_question_dict["questions"])
# vqa_train_annotation_df = pd.DataFrame(vqa_train_annotation_dict["annotations"])
# vqa_train_annotation_df = vqa_train_annotation_df.drop_duplicates("image_id")
# vqa_train_annotation_df["answer_list"] = vqa_train_annotation_df["answers"].apply(lambda row:[ans["answer"] for ans in row])

# vqa_train_df = vqa_train_annotation_df.join(vqa_train_question_df[["question","question_id"]].set_index("question_id"),on="question_id")
# vqa_train_df["image_path"] = vqa_train_df["image_id"].apply(lambda i: str(vqa_image_base_path/"COCO_train2014_{:012d}.jpg".format(i)))

# def most_common(lst):
#     data = Counter(lst)
#     return max(lst, key=data.get)
# vqa_train_df["max_answer"] = vqa_train_df["answers"].apply(lambda row:most_common([ans["answer"] for ans in row]))
# vqa_train_df = vqa_train_df[["image_path","question","max_answer","answer_list"]]

# train_df = train_df.append(vqa_train_df)

-----

### Max_answer? Selected_answer?
If you read through the code line by line you might realize there is a max_answer field and wonder what the difference is. The max_answer field simply selects the most common answer within the ten given answers and is ok to use as the target label after filtering it to match the answer_space and is the first target label  we tried out. The problem is that it creates too much "\<Unknown\>" labels after filtering it with the answer_space which is not ideal.
    
Selected_answer below solves the problem by filtering the ten answers first, then picks the most common answer out of the valid answers. So even if only 1 out of the 10 answers is in the answer_space, we can at least assign it a target label that is weakly relevant instead of assigning it to an entirely irrelevant "\<Unknown\>" label. Using the selected_answer label does improve the performance.

In [10]:
from collections import Counter
def choose_ans_label(ans_list):
    ans_dict = Counter(ans_list)
    max_entry = "<Unknown>"
    max_count = 0
    for k,v in ans_dict.items():
        if k in answer_space and v>max_count:
            max_count = v
            max_entry = k
    return max_entry

In [11]:
train_df["selected_answer"] = train_df["answer_list"].apply(lambda row:choose_ans_label(row))
val_df["selected_answer"] = val_df["answer_list"].apply(lambda row:choose_ans_label(row))

In [12]:
# encode labels
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [answer_space[:-1]],handle_unknown="use_encoded_value",unknown_value=ANSWER_SPACE_SIZE)
enc.fit(train_df["selected_answer"].values.reshape(-1,1))
train_df["labels"] = enc.transform(train_df["selected_answer"].values.reshape(-1,1)).astype(int)
val_df["labels"] = enc.transform(val_df["selected_answer"].values.reshape(-1,1)).astype(int)

# remove unknown
# keeping or removing unknown does not have significant effect if there isn't many <Unknowns>
train_df = train_df[train_df["labels"]!=ANSWER_SPACE_SIZE]

# set index for val metrics
# the indices here are accessed by the vqa_accuracy_score() to calculate validation metrics while training
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
train_df["id"] = train_df.index
val_df["id"] = val_df.index

In [13]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [14]:
@dataclass
class MultimodalCollator:
    tokenizer: AutoTokenizer
    preprocessor: AutoFeatureExtractor

    def tokenize_text(self, texts: List[str]):
        encoded_text = self.tokenizer(
            text=texts,
            padding='longest',
            max_length=68,
            truncation=True,
            return_tensors='pt',
            return_token_type_ids=True,
            return_attention_mask=True,
        )
        return {
            "input_ids": encoded_text['input_ids'].squeeze(),
            "token_type_ids": encoded_text['token_type_ids'].squeeze(),
            "attention_mask": encoded_text['attention_mask'].squeeze(),
        }

    def preprocess_images(self, images: List[str]):
        processed_images = self.preprocessor(
            images=[Image.open(image_path).convert('RGB') for image_path in images],
            return_tensors="pt",
        )
        return {
            "pixel_values": processed_images['pixel_values'].squeeze(),
        }
            
    def __call__(self, raw_batch_dict):
        return {
            **self.tokenize_text(
                raw_batch_dict['question']
                if isinstance(raw_batch_dict, dict) else
                [i['question'] for i in raw_batch_dict]
            ),
            **self.preprocess_images(
                raw_batch_dict['image_path']
                if isinstance(raw_batch_dict, dict) else
                [i['image_path'] for i in raw_batch_dict]
            ),
            'labels': torch.tensor(
                [raw_batch_dict['labels'],raw_batch_dict["id"]]
                if isinstance(raw_batch_dict, dict) else
                [[i['labels'],i['id']] for i in raw_batch_dict],
                dtype=torch.int64
            ),
        }

In [15]:
from typing import Optional, Sequence

import torch
from torch import Tensor
from torch import nn
from torch.nn import functional as F

# Based on https://github.com/AdeelH/pytorch-multi-class-focal-loss/blob/master/focal_loss.py
# Added label smoothing
class FocalLoss(nn.Module):
    """ Focal Loss, as described in https://arxiv.org/abs/1708.02002.
    It is essentially an enhancement to cross entropy loss and is
    useful for classification tasks when there is a large class imbalance.
    x is expected to contain raw, unnormalized scores for each class.
    y is expected to contain class labels.
    Shape:
        - x: (batch_size, C) or (batch_size, C, d1, d2, ..., dK), K > 0.
        - y: (batch_size,) or (batch_size, d1, d2, ..., dK), K > 0.
    """

    def __init__(self,
                 alpha: Optional[Tensor] = None,
                 gamma: float = 0.,
                 reduction: str = 'mean',
                 label_smoothing: float = 0.0,
                 ignore_index: int = -100):
        """Constructor.
        Args:
            alpha (Tensor, optional): Weights for each class. Defaults to None.
            gamma (float, optional): A constant, as described in the paper.
                Defaults to 0.
            reduction (str, optional): 'mean', 'sum' or 'none'.
                Defaults to 'mean'.
            ignore_index (int, optional): class label to ignore.
                Defaults to -100.
        """
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(
                'Reduction must be one of: "mean", "sum", "none".')

        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ignore_index = ignore_index
        self.reduction = reduction
        self.label_smoothing = label_smoothing

        self.nll_loss = nn.NLLLoss(
            weight=alpha, reduction='none', ignore_index=ignore_index)

    def __repr__(self):
        arg_keys = ['alpha', 'gamma', 'ignore_index', 'reduction']
        arg_vals = [self.__dict__[k] for k in arg_keys]
        arg_strs = [f'{k}={v}' for k, v in zip(arg_keys, arg_vals)]
        arg_str = ', '.join(arg_strs)
        return f'{type(self).__name__}({arg_str})'

    def forward(self, x: Tensor, y: Tensor) -> Tensor:
        if x.ndim > 2:
            # (N, C, d1, d2, ..., dK) --> (N * d1 * ... * dK, C)
            c = x.shape[1]
            x = x.permute(0, *range(2, x.ndim), 1).reshape(-1, c)
            # (N, d1, d2, ..., dK) --> (N * d1 * ... * dK,)
            y = y.view(-1)

        unignored_mask = y != self.ignore_index
        y = y[unignored_mask]
        if len(y) == 0:
            return torch.tensor(0.)
        x = x[unignored_mask]

        # compute weighted cross entropy term: -alpha * log(pt)
        # (alpha is already part of self.nll_loss)
        log_p = F.log_softmax(x, dim=-1)
        
        if self.label_smoothing==0:
            ce = self.nll_loss(log_p, y)
        else:
            ce = (1 - self.label_smoothing)*self.nll_loss(log_p, y) - self.label_smoothing * log_p.mean(dim=-1)

        # get true class column from each row
        all_rows = torch.arange(len(x))
        log_pt = log_p[all_rows, y]

        # compute focal term: (1 - pt)^gamma
        pt = log_pt.exp()
        focal_term = (1 - pt)**self.gamma

        # the full loss: -alpha * ((1 - pt)^gamma) * log(pt)
        loss = focal_term * ce

        if self.reduction == 'mean':
            loss = loss.mean()
        elif self.reduction == 'sum':
            loss = loss.sum()

        return loss

In [16]:
total_answer_counts = train_df["answer_list"].append(val_df["answer_list"]).explode().value_counts()
total_answer_counts = total_answer_counts[total_answer_counts.index.isin(answer_space)][answer_space[:-1]]

# freq_weight(alpha) is decided by trial and error and has no theory to support the decision
freq_weight = 1/np.log(total_answer_counts.head(ANSWER_SPACE_SIZE).values+10)
freq_weight = torch.tensor(np.append(freq_weight/np.mean(freq_weight),[0]),dtype=torch.float)

focal_loss = FocalLoss(
        alpha=freq_weight,
        gamma=2,
        reduction='mean',
        label_smoothing=0.0)

print(freq_weight)

tensor([0.3123, 0.3160, 0.3737,  ..., 1.1365, 1.1365, 0.0000])


# Normal model
-----

In [17]:
from torch.nn.utils.weight_norm import weight_norm

class MultimodalVQAModel(nn.Module):
    def __init__(
            self,
            num_labels: int = len(answer_space),
            intermediate_dim: int = 512,
            pretrained_text_name: str = 'bert-base-uncased',
            pretrained_image_name: str = 'google/vit-base-patch16-224-in21k'):
     
        super(MultimodalVQAModel, self).__init__()
        self.num_labels = num_labels
        self.pretrained_text_name = pretrained_text_name
        self.pretrained_image_name = pretrained_image_name
        
        self.text_encoder = AutoModel.from_pretrained(
            self.pretrained_text_name,
        )
        self.image_encoder = AutoModel.from_pretrained(
            self.pretrained_image_name,
        )
        
#         for param in self.text_encoder.base_model.parameters():
#             param.requires_grad = False
#         for param in self.image_encoder.base_model.parameters():
#             param.requires_grad = False
        
        # https://github.com/jiasenlu/vilbert_beta/blob/master/vilbert/basebert.py
        hidden_size = self.text_encoder.config.hidden_size + self.image_encoder.config.hidden_size
        intermediate_dim = hidden_size*2
        self.fusion = nn.Sequential(
            nn.Dropout(0.1),
            weight_norm(nn.Linear(hidden_size, intermediate_dim)),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        
        self.classifier = nn.Linear(intermediate_dim, self.num_labels)
        
        for m in self.fusion:
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight)
                torch.nn.init.normal_(m.bias,std=0.03)
        
        try:
            for m in self.classifier:
                if isinstance(m, nn.Linear):
                    torch.nn.init.kaiming_normal_(m.weight)
                    torch.nn.init.normal_(m.bias,std=0.03)
        except:
            torch.nn.init.kaiming_normal_(self.classifier.weight)
            torch.nn.init.normal_(self.classifier.bias,std=0.03)
        
        
        self.criterion = focal_loss
    
    def forward(
            self,
            input_ids: torch.LongTensor,
            pixel_values: torch.FloatTensor,
            attention_mask: Optional[torch.LongTensor] = None,
            token_type_ids: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None):
        
        encoded_text = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )
        encoded_image = self.image_encoder(
            pixel_values=pixel_values,
            return_dict=True,
        )
        
        fused_output = self.fusion(
            torch.cat(
                [
                    encoded_text['pooler_output'],
                    encoded_image['pooler_output'],
                ],
                dim=1
            )
        )
        logits = self.classifier(fused_output)
        
        out = {
            "logits": logits
        }
        if labels is not None:
            loss = self.criterion(logits, labels[:,0])
            out["loss"] = loss
        
        return out
    
print("Loaded normal model")

Loaded normal model


# Choice embedding based
-----
In order to run this model, the embeddings of the output choices must be generated from another script and loaded into this kaggle notebook with the path ANSWER_SPACE_EMBED_PATH due to memory constraints.

```
# Sample code to generate answer embeddings
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

word_dl = DataLoader(answer_space, batch_size=10, shuffle=False)
model.to(device)
answer_embedding = []
for i,word_list in enumerate(word_dl):
    encoded_input = tokenizer(word_list,padding=True, return_tensors='pt').to(device)
    output = model(**encoded_input)
    answer_embedding.append(output["pooler_output"])
answer_tensor = torch.cat(answer_embedding,dim=0)
torch.save(answer_tensor, 'answer_embed.pt')
```

In [18]:
%%script false --no-raise-error

# The above magic skips this block when running

from torch.nn.utils.weight_norm import weight_norm

class MultimodalVQAModel(nn.Module):
    def __init__(
            self,
            num_labels: int = len(answer_space),
            intermediate_dim: int = 512,
            pretrained_text_name: str = 'bert-base-uncased',
            pretrained_image_name: str = 'google/vit-base-patch16-224-in21k'):
     
        super(MultimodalVQAModel, self).__init__()
        self.num_labels = num_labels
        self.pretrained_text_name = pretrained_text_name
        self.pretrained_image_name = pretrained_image_name
        
        self.text_encoder = AutoModel.from_pretrained(
            self.pretrained_text_name,
        )
        self.image_encoder = AutoModel.from_pretrained(
            self.pretrained_image_name,
        )
        
#         for param in self.text_encoder.base_model.parameters():
#             param.requires_grad = False
#         for param in self.image_encoder.base_model.parameters():
#             param.requires_grad = False
        
        # https://github.com/jiasenlu/vilbert_beta/blob/master/vilbert/basebert.py
        hidden_size = self.text_encoder.config.hidden_size + self.image_encoder.config.hidden_size
        intermediate_dim = hidden_size*2
        self.fusion = nn.Sequential(
            nn.Dropout(0.1),
#             weight_norm(nn.Linear(hidden_size, intermediate_dim)),
            nn.Linear(hidden_size, intermediate_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(intermediate_dim, self.text_encoder.config.hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        
        for m in self.fusion:
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight)
                torch.nn.init.normal_(m.bias,std=0.03)
        for m in self.classifier:
            if isinstance(m, nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight)
                torch.nn.init.normal_(m.bias,std=0.03)
        
        self.answer_embed_weights = nn.Parameter(torch.ones((3001,768)))
        with torch.no_grad():
            self.answer_embed_weights.copy_(torch.load(ANSWER_SPACE_EMBED_PATH))
        print(self.answer_embed_weights)
        self.output_bias = nn.Parameter(torch.zeros((self.num_labels,)))
        nn.init.normal_(self.output_bias,std=0.03)
        
        self.criterion = focal_loss
    
    def forward(
            self,
            input_ids: torch.LongTensor,
            pixel_values: torch.FloatTensor,
            attention_mask: Optional[torch.LongTensor] = None,
            token_type_ids: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None):
        
        encoded_text = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )
        encoded_image = self.image_encoder(
            pixel_values=pixel_values,
            return_dict=True,
        )
        
        fused_output = self.fusion(
            torch.cat(
                [
                    encoded_text['pooler_output'],
                    encoded_image['pooler_output'],
                ],
                dim=1
            )
        )
        model_embeddings = self.classifier(fused_output)

        logits = (self.answer_embed_weights.unsqueeze(0) @ model_embeddings.unsqueeze(-1)).squeeze()+self.output_bias
        
        out = {
            "logits": logits
        }
        if labels is not None:
            loss = self.criterion(logits, labels[:,0])
            out["loss"] = loss
        
        return out

print("Loaded embedding model")

In [19]:
def createMultimodalVQACollatorAndModel(text='bert-base-uncased', image='google/vit-base-patch16-224-in21k'):
    tokenizer = AutoTokenizer.from_pretrained(text)
    preprocessor = AutoFeatureExtractor.from_pretrained(image)

    multi_collator = MultimodalCollator(
        tokenizer=tokenizer,
        preprocessor=preprocessor,
    )


    multi_model = MultimodalVQAModel(pretrained_text_name=text, pretrained_image_name=image).to(device)
#     print(multi_model)
    return multi_collator, multi_model

In [20]:
def vqa_accuracy_score(val_id, preds):
    acc_sum = 0
    for v_id, pred in zip(val_id,preds):
        pred_word = answer_space[pred]
        answer_list = val_df.at[v_id,'answer_list']
        acc_sum+=min(answer_list.count(pred_word)/3,1)
    return acc_sum/len(preds)

def compute_metrics(eval_tuple: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
    logits, labels = eval_tuple
    preds = logits.argmax(axis=-1)
    return {
        "acc": accuracy_score(labels[:,0], preds),
        "vqa_acc": vqa_accuracy_score(labels[:,1], preds),
        "f1": f1_score(labels[:,0], preds, average='macro')
    }

In [21]:
args = TrainingArguments(
    output_dir="answer_embed_into_model",
    seed=2022, 
#     evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=620,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=620,
#     save_strategy="epoch",
#     save_steps=100,
    save_total_limit=1,             # Save only the last 1 checkpoints at any given time while training 
    metric_for_best_model='vqa_acc',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    remove_unused_columns=False,
    num_train_epochs=20,
    fp16=True,
    
    dataloader_pin_memory=False,
    
    log_level="warning",
    
    warmup_ratio=0.06,
    learning_rate=1e-5,
    weight_decay=0.1,
    
    dataloader_num_workers=2,
    load_best_model_at_end=True,
)

In [22]:
def createAndTrainModel(dataset, args, text_model='bert-base-uncased', image_model='google/vit-base-patch16-224-in21k', multimodal_model='bert_vit'):
    collator, model = createMultimodalVQACollatorAndModel(text_model, image_model)
    
    multi_args = deepcopy(args)
    multi_args.output_dir = os.path.join(".", "checkpoint", multimodal_model)
    multi_trainer = Trainer(
        model,
        multi_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['val'],
        data_collator=collator,
        compute_metrics=compute_metrics
    )
    
    train_multi_metrics = multi_trainer.train()
    eval_multi_metrics = multi_trainer.evaluate()
    # Remember to change this if you want to pass a different dataset to predict
    test_results = multi_trainer.predict(dataset['val'])
    
    return collator, model, train_multi_metrics, eval_multi_metrics, test_results

In [23]:
%env WANDB_PROJECT=vizwiz_results
# Set the WANDB_API_KEY to avoid manually setting it every time 
# %env WANDB_API_KEY=
%env WANDB_WATCH=all

env: WANDB_PROJECT=vizwiz_results
env: WANDB_WATCH=all


In [24]:
wandb.init(name = "Basic model", notes="Basic model. dropout(0.1) lr(1e-5)", save_code=True, reinit=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [25]:
collator, model, train_multi_metrics, eval_multi_metrics, test_results = createAndTrainModel({"train": train_dataset,"val": val_dataset}, args,
                                                                              text_model='roberta-base', image_model="microsoft/beit-base-patch16-224-pt22k-ft22k")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/276 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/395M [00:00<?, ?B/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


{'loss': 4.5343, 'learning_rate': 1.3440860215053765e-06, 'epoch': 0.16}
{'loss': 4.4104, 'learning_rate': 2.688172043010753e-06, 'epoch': 0.32}
{'loss': 3.9507, 'learning_rate': 4.018817204301075e-06, 'epoch': 0.48}
{'loss': 3.6595, 'learning_rate': 5.362903225806452e-06, 'epoch': 0.65}
{'loss': 3.567, 'learning_rate': 6.706989247311828e-06, 'epoch': 0.81}
{'loss': 3.6632, 'learning_rate': 8.051075268817205e-06, 'epoch': 0.97}
{'eval_loss': 2.961859941482544, 'eval_acc': 0.31095160916878906, 'eval_vqa_acc': 0.43466851894728614, 'eval_f1': 0.0013356211037317367, 'eval_runtime': 175.4358, 'eval_samples_per_second': 24.619, 'eval_steps_per_second': 0.77, 'epoch': 1.0}
{'loss': 3.3609, 'learning_rate': 9.395161290322582e-06, 'epoch': 1.13}
{'loss': 3.1942, 'learning_rate': 9.952814001372685e-06, 'epoch': 1.29}
{'loss': 3.2396, 'learning_rate': 9.867021276595746e-06, 'epoch': 1.45}
{'loss': 3.1683, 'learning_rate': 9.781228551818806e-06, 'epoch': 1.61}
{'loss': 3.0269, 'learning_rate': 9.6

KeyboardInterrupt: 

In [None]:
val_df["prediction_label"] = test_results.predictions.argmax(1)
val_df["answer"] = val_df["prediction_label"].apply(lambda i: answer_space[i])
val_df["image"] = val_df["image_path"].apply(lambda x:x.split("/")[-1])
val_df[["image","answer"]].to_json("./val_real_results.json",orient = "records")

# Saving predictions to wandb
wandb.save('./val_real_results.json', policy="now")
wandb.finish()