In [None]:
!wget https://raw.githubusercontent.com/tezansahu/VQA-With-Multimodal-Transformers/main/dataset/data_train.csv
!wget https://raw.githubusercontent.com/tezansahu/VQA-With-Multimodal-Transformers/main/dataset/data_eval.csv
!wget https://raw.githubusercontent.com/tezansahu/VQA-With-Multimodal-Transformers/main/dataset/all_qa_pairs.txt
!wget https://raw.githubusercontent.com/tezansahu/VQA-With-Multimodal-Transformers/main/dataset/answer_space.txt
!mkdir datasets
!mv * ./datasets

In [None]:
!pip install datasets==1.17.0 nltk==3.5 pandas==1.3.5 Pillow==9.0.0 scikit-learn==0.23.2
!pip install torch transformers==4.14.0

# Multi Modal Transformers for VQA

https://medium.com/data-science-at-microsoft/visual-question-answering-with-multimodal-transformers-d4f57950c867

In [42]:
import os
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from datasets import load_dataset, set_caching_enabled
from PIL import Image
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModel, TrainingArguments, Trainer, logging
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# SET CACHE FOR HUGGINGFACE TRANSFORMERS + DATASETS
os.environ['HF_HOME'] = os.path.join(".", "cache")
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

set_caching_enabled(True)
logging.set_verbosity_error()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [121]:
pd.read_csv("./datasets/data_eval.csv")

Unnamed: 0,question,answer,image_id
0,what is the colour of the bag on the chair,pink,image399
1,what is at the right bottom,table,image1341
2,what are found on the rack,toy,image1320
3,what is left of printer,mirror,image529
4,what is the colour of television,black,image201
...,...,...,...
2489,what is beneath the monitor,paper,image528
2490,what is the largest object,bed,image1077
2491,what is on the right side of the tap,bottle_of_hand_wash_liquid,image695
2492,how many lights are above the table,3,image875


In [114]:
dataset = load_dataset(
    "csv", 
    data_files={"train": "./datasets/data_train.csv", "test": "./datasets/data_eval.csv"}
)

with open("./datasets/answer_space.txt") as f:
    answer_space = f.read().splitlines()

# VQA task as a multiclass classification problem, need to create the labels from the actual answers
dataset = dataset.map(
    # Select the 1st answer if multiple answers are provided for single question
    lambda examples: {'label': [answer_space.index(ans.replace(" ", "").split(",")[0]) for ans in examples['answer']]},
    batched=True
)



  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [120]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'image_id', 'label', 'question'],
        num_rows: 9974
    })
    test: Dataset({
        features: ['answer', 'image_id', 'label', 'question'],
        num_rows: 2494
    })
})

In [58]:
pd.read_csv('./data_eval.csv')

ParserError: ignored

In [48]:
### create test train set

image_pattern = re.compile("( (in |on |of )?(the |this )?(image\d*) \?)")

with open("all_qa_pairs.txt") as f:
    qa_data = [x.replace("\n", "") for x in f.readlines()]

df = pd.DataFrame({"question": [], "answer": [], "image_id":[]})

for i in range(0, len(qa_data), 2):
    img_id = image_pattern.findall(qa_data[i])[0][3]
    question = qa_data[i].replace(image_pattern.findall(qa_data[i])[0][0], "")
    record = {
        "question": question,
        "answer": qa_data[i+1],
        "image_id": img_id,
    }
    df = df.append(record, ignore_index=True)

# Create a list of all possible answers, so that the answer generation part of the VQA task 
# can be modelled as multiclass classification
answer_space = []
for ans in df.answer.to_list():
    answer_space = answer_space + [ans] if "," not in ans else answer_space + ans.replace(" ", "").split(",") 

answer_space = list(set(answer_space))
answer_space.sort()
with open(os.path.join("dataset", "answer_space.txt"), "w") as f:
    f.writelines("\n".join(answer_space))

# Since the actual dataset contains only ~54% of the data for training (very less),
# we produce our own splits for training & evaluation with 80% data being used for training
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.to_csv(os.path.join("dataset", "data_train.csv"), index=None)
test_df.to_csv(os.path.join("dataset", "data_eval.csv"), index=None)

IndexError: ignored