In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iust-vqa/image_features.pickle
/kaggle/input/iust-vqa/answer_list.txt
/kaggle/input/iust-vqa/val.csv
/kaggle/input/iust-vqa/train.csv
/kaggle/input/iust-vqa/test.csv
/kaggle/input/iust-vqa/image_question.json


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from transformers import BertModel, BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

bert = BertModel.from_pretrained("bert-base-uncased")
embedding_matrix = bert.embeddings.word_embeddings.weight

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
PAD_ID = 0
CLS_ID = 101
device = "cuda:0"
# device = "cpu"

In [5]:
input_text = "Here is some text to encode"
input_ids = tokenizer.encode(input_text, add_special_tokens=True)
# you can get BERT embeddings like this:
embedding_matrix[input_ids].shape, input_ids

(torch.Size([9, 768]), [101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 102])

In [6]:
#Let's begin !
from torch.utils.data import Dataset
import pickle
import json
import csv 
import torch

class VQADataset(Dataset):

    def __init__(self, split_path):
        image_features_path = "/kaggle/input/iust-vqa/image_features.pickle"
        answers_list_path = "/kaggle/input/iust-vqa/answer_list.txt"
        image2questions_path = "/kaggle/input/iust-vqa/image_question.json"
        
        ## Read image features, use pickle!
        with open(image_features_path, 'rb') as f:
            ### YOUR CODE HERE
            self.image_features = pickle.load(f)
            ### YOUR CODE HERE
        
        ##sample: self.question2img[q_id] = img_id
        self.question2img = {}
        
        ##sample: self.questions[q_id] = {"text" : q_text, "tokenized" : tokenized_question}
        ## tokenization: tokenizer.encode(sentence)
        self.questions = {}
        
        with open(image2questions_path, 'r') as f:
            ## YOUR CODE HERE
            ## Load json file (image2questions)
            data = json.load(f)
            
            ## retrieve requested values "self.question2img", "self.questions" from givenn json
            ## ~ 6 lines
            for k in data.keys():
                for q in range(len(data[k])):                    
                    self.question2img[data[k][q][0]] = k
                    self.questions[data[k][q][0]] = {
                        "text": data[k][q][1],
                        "tokenized": tokenizer.encode(data[k][q][1])
                    }
                    

                
            ### YOUR CODE HERE
        
        self.possible_answers = []
        with open(answers_list_path, 'r') as f:
            ## read answers list from text file, save them in an array
            self.possible_answers = f.read().split()
        
        ## sample: self.data[idx] = q_id
        self.data = []
        ## sample: self.labels[idx] = 4
        self.labels = []
        
        
        
        ## load data from "split_path", fill self.data and self.labels as requested! take a look at train.csv
        # https://docs.python.org/3/library/csv.html#csv.DictReader
        with open(split_path, newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                self.data.append(int(row['question_id']))
                if (row['label'] is not None):
                    self.labels.append(int(row['label']))
                else:
                    self.labels = None

    def __getitem__(self, idx):
        """This method returns tuple of (question_id, image_features (Tensor), tokenized_question (Tensor), label
        
        Note: label can be None!
        """
        
        
        q_id = self.data[idx]
        ### YOUR CODE HERE
        ## WARNING: while making tensors, DO NOT FORGET TO USE .to(device) at the end!
        
        q_tokenized = torch.tensor(self.questions[q_id]['tokenized']).to(device)
        img_id = self.question2img[q_id]
        label = None
        if(self.labels  is not None):
            label = self.labels[idx]
        return q_id, torch.tensor(self.image_features[img_id]).to(device), q_tokenized, label
        ### YOUR CODE HERE
    
    def __len__(self):
        return len(self.data)

In [7]:
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch):
    """
        Batch post processing, we can pad questions! 
        returns q_ids, images (Tensor), questions(Tensor), labels (Tensor)
    """
    images = []
    questions = []
    labels = []
    q_ids = []
    
    ### YOUR CODE HERE
    ## WARNING: while making tensors, DO NOT FORGET TO USE .to(device) at the end!
    for q_id, img, q_tokens, label in batch:
        images.append(img)
        q_ids.append(q_id)
        if label is not None:
            labels.append(torch.tensor(label).to(device))
        else:
            labels = None
        questions.append(q_tokens)
    
    ### Stack images into one tensor
    ## torch.stack, shape must be (batch_size, img_features)
    images = torch.stack(images,dim=0).to(device)
    
    ## stack labels if they're not None, else make labels None!
    if labels is not None:
        labels = torch.stack(labels,dim=0).to(device)
    else:
        labels = None
    
    ## pad questions, shape must be (batch_size, longest_sentence)
    ## https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html
    questions = pad_sequence(questions, padding_value=0, batch_first=True).to(device)
    
    
    return q_ids, images, questions, labels

In [8]:
from torch.utils.data import DataLoader
dset = VQADataset("/kaggle/input/iust-vqa/train.csv")
data_loader_train = DataLoader(dset, collate_fn=collate_batch, batch_size=32)

In [9]:
from torch import nn

## Nothing, just look =)))

class PositionalEncoder(nn.Module):
    """Positional encoding class pulled from the PyTorch documentation tutorial
    on Transformers for seq2seq models:
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoder, self).__init__()

        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()\
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [10]:
from torch import nn
import math

#The most interesting part!

class VQA_Simple(nn.Module):
    def __init__(self, dropout, text_hidden_size, n_layers, n_heads, image_hidden_size, n_outputs):
        super().__init__()
        self.dropout = dropout
        self.d_model = text_hidden_size
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.image_hidden_size = image_hidden_size
        self.PAD = PAD_ID
        
        self.embedding_matrix = bert.embeddings.word_embeddings.weight
        
        
        ##initilize TransformerEncoderLayer
        ##https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model= self.d_model,
            nhead= self.n_heads,
            dropout = self.dropout
        )
        
        ##initilize TransformerEncoder
        ##https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html
        self.t_encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=self.n_layers).to(device)
        
        ##if you looke enough, you can initilize positional encoder!!
        self.pe = PositionalEncoder(self.d_model, dropout=self.dropout)
        
        ##initilize TransformerDecoderLayer
        ##https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoderLayer.html
        decoder_layer = torch.nn.TransformerDecoderLayer(
            d_model= self.d_model,
            nhead= self.n_heads,
            dropout = self.dropout
        )
        
        ##initilize TransformerDecoder
        ##https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html
        self.t_decoder = torch.nn.TransformerDecoder(decoder_layer, num_layers=self.n_layers)
        
        ##Linear output, recieves concatenation of text and image features, outputs final answer!
        self.fc_layers = nn.Sequential(
                        nn.Linear(self.d_model + self.image_hidden_size, 512),
                        nn.BatchNorm1d(512),
                        nn.Dropout(0.1),
                        nn.Linear(512, 256),
                        nn.BatchNorm1d(256),
                        nn.Dropout(0.1),
                        nn.Linear(256, 64),
                        nn.BatchNorm1d(64),
                        nn.Dropout(0.1),
                        nn.Linear(64, n_outputs)
                        )
#         self.linear1 = torch.nn.Linear(self.d_model + self.image_hidden_size, n_outputs)
        
    def forward(self, images, input_ids):
        ##images shape: (batch_size, img_features)
        ##input_ids shape: (batch_size, sequence_len)
        b_size = images.shape[0]
        
        ### YOUR CODE HERE
        
        ## Calculate masks, shape: (batch_size, sequence_len)
        src_key_mask = (input_ids == self.PAD)
        ##embeddings of the given input_ids, extracted from self.embedding_matrix
        ##shape should be (sequence_len, batch_size, text_embedding_features)
        embeddings = self.embedding_matrix[input_ids]
        embeddings = torch.permute(embeddings, (1, 0, 2))

        ##Positional embeddings
        ##shape should be (sequence_len, batch_size, text_embedding_features)
        positional_embeddings = self.pe(embeddings)
        
        ## feed positinal_embeddings to the encoder!
        ## output shape should be (sequence_len, batch_size, d_model)
        ## additional args:  src_key_padding_mask
        encoder_output = self.t_encoder(positional_embeddings, src_key_padding_mask=src_key_mask)

        ##(batch_size, 1)
        tgt = torch.tensor([CLS_ID] * b_size).unsqueeze(1).to(device)
        ##(batch_size, 1)
        tgt_key_padding_mask = (tgt == self.PAD)
                
        ##embeddings of the given input_ids, extracted from self.embedding_matrix
        ##shape should be (1, batch_size, text_embedding_features)
        tgt_embeddings = self.embedding_matrix[tgt]
        tgt_embeddings = torch.permute(tgt_embeddings, (1, 0, 2))
    
        # target attention masks to avoid future tokens in our predictions
        # Adapted from PyTorch source code:
        # https://github.com/pytorch/pytorch/blob/176174a68ba2d36b9a5aaef0943421682ecc66d4/torch/nn/modules/transformer.py#L130
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(1).to(device)
        
        ## Positional embedding 
        tgt_positions = self.pe(tgt_embeddings)      
        output = self.t_decoder(tgt=tgt_positions, 
                                memory=encoder_output,
                                tgt_mask=tgt_mask,
                                tgt_key_padding_mask = tgt_key_padding_mask, 
                                memory_key_padding_mask = src_key_mask) ##(1, batch_size, text_embedding_features)
        
        
        
        output_text = output.permute(1, 0, 2).squeeze(1).to(device) ## (batch_size, text_embedding_features)
        
        ##https://pytorch.org/docs/stable/generated/torch.cat.html
        #concatenate text output and image features
        concatenated = torch.cat((output_text, images), dim=1).to(device)        
        
        y = self.fc_layers(concatenated).to(device)
        
        return y


In [11]:
from tqdm import tqdm
model = VQA_Simple(dropout=0.1, 
                   text_hidden_size=768, 
                   n_layers=4, 
                   n_heads=8, 
                   image_hidden_size=512, 
                   n_outputs=10).cuda()
lr=1e-4
epochs = 100
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(list(model.parameters()), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for i, (q_ids, images, questions, labels) in enumerate(pbar := tqdm(data_loader_train, total=len(data_loader_train))):
        pbar.set_description(f"Epoch {epoch}")
        
        optimizer.zero_grad()
        output = model(images, questions)
#         print("labels",labels.device)
#         print("shape of q_ids: ", q_ids)
#         print("shape of images: ", images.size())
#         print("shape of questions: ", questions.size())
#         print("shape of labels: ", labels.size())
        
        loss = criterion(output, labels)
        running_loss += loss
        
        loss.backward()
        optimizer.step()
        log_interval = 5
        pbar.set_postfix(loss=running_loss/(i+1))
    scheduler.step()
        

Epoch 0: 100%|██████████| 25/25 [00:04<00:00,  5.69it/s, loss=tensor(2.4158, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 1: 100%|██████████| 25/25 [00:01<00:00, 13.77it/s, loss=tensor(2.3276, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 2: 100%|██████████| 25/25 [00:01<00:00, 13.69it/s, loss=tensor(2.2175, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 3: 100%|██████████| 25/25 [00:01<00:00, 13.76it/s, loss=tensor(2.1774, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 4: 100%|██████████| 25/25 [00:01<00:00, 13.78it/s, loss=tensor(2.1044, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 5: 100%|██████████| 25/25 [00:01<00:00, 13.31it/s, loss=tensor(1.9289, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 6: 100%|██████████| 25/25 [00:01<00:00, 13.76it/s, loss=tensor(1.7301, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 7: 100%|██████████| 25/25 [00:01<00:00, 13.84it/s, loss=tensor(1.5618, device='cuda:0', grad_fn=<DivBackward0>)]
Epoch 8: 100%|██████████| 25/25 [00:01<00:00, 13

In [12]:
def predict(data_loader, net):
    predicts = []
    ids = []
    net.eval()
    for i, (q_ids, images, questions, _) in enumerate(pbar := tqdm(data_loader, total=len(data_loader))):
        outputs = net(images, questions)
        outputs = torch.argmax(outputs, dim=1)
        predicts.extend(outputs.tolist())
        ids.extend(q_ids)
    return predicts, ids


test_dset = VQADataset("/kaggle/input/iust-vqa/test.csv")
data_loader_test = DataLoader(test_dset, collate_fn=collate_batch, batch_size=8)
preds, ids = predict(data_loader_test, model)

# with open("output.txt")

100%|██████████| 14/14 [00:00<00:00, 67.13it/s]


In [13]:
import pandas as pd

output_data = {"question_id": [str(id) for id in ids], "label": preds}
df = pd.DataFrame(output_data)
df.to_csv("/kaggle/working/output21.csv", index=False)