In [6]:
import random
import skimage.io as io
import matplotlib.pyplot as plt
import os
import json
import numpy as np
from tqdm import tqdm
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras import Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.optimizers import Adam

In [2]:
img_size = 224 # inception default which seems to also be smaller than all the images
img_input_shape = (img_size, img_size, 3)
img_target_size = (img_size, img_size)

num_words = 1000
pad_size = 20
qstn_input_shape = (pad_size, 1)

learning_rate = 1e-3
batch_size = 64
epochs=1000

In [3]:
id_tuples = []
with open('./VQA/Annotations/v2_mscoco_train2014_annotations.json') as f:
    data = json.load(f)
    for annotation in data['annotations']:
        if annotation['answer_type'] == 'yes/no':
            id_tuples.append((annotation['image_id'], annotation['question_id'],  annotation['multiple_choice_answer']))   

questions = {}
with open('./VQA/Questions/v2_OpenEnded_mscoco_train2014_questions.json') as f:
    data = json.load(f)
    for question in data['questions']:
        questions[question['question_id']] = question['question']
        
train_data = []
for id_tuple in tqdm(id_tuples):
    question = questions[id_tuple[1]]
    img = './VQA/Images/train2014/COCO_train2014_' + str(id_tuple[0]).zfill(12) + '.jpg'
    train_data.append((img, question, id_tuple[-1]))

100%|██████████| 166882/166882 [00:00<00:00, 711091.67it/s]


In [4]:
train_df = pd.DataFrame(data=train_data, columns=['Image', 'Question', 'Answer'])
train_df = train_df[(train_df.Answer == 'yes') | (train_df.Answer == 'no')]
train_df.head()

Unnamed: 0,Image,Question,Answer
0,./VQA/Images/train2014/COCO_train2014_00000045...,Is this man a professional baseball player?,yes
1,./VQA/Images/train2014/COCO_train2014_00000052...,Is the dog waiting?,yes
2,./VQA/Images/train2014/COCO_train2014_00000039...,Is the sky blue?,yes
3,./VQA/Images/train2014/COCO_train2014_00000039...,Is there snow on the mountains?,yes
4,./VQA/Images/train2014/COCO_train2014_00000039...,Is the window open?,yes


In [7]:
datagen = ImageDataGenerator(rescale=1./255)

train_gen = datagen.flow_from_dataframe(
    dataframe=train_df, 
    directory='.', 
    x_col='Image', 
    y_col='Answer',  
    target_size=img_target_size,
    class_mode='binary',
    batch_size=batch_size,
    shuffle=False
)

Found 166878 validated image filenames belonging to 2 classes.


In [25]:
def vqa_data_gen(img_gen, df):
    for i, (image, answer) in enumerate(img_gen):
        questions = df[i*img_gen.batch_size:(i+1)*(img_gen.batch_size)]['Question'].to_numpy()
        print(questions.shape)
        yield (image, questions), answer
        
gen = vqa_data_gen(train_gen, train_df)
(a, b), c = next(gen)
print(a, b, c)
print(a.shape, b.shape, c.shape)

(64,)
[[[[0.7411765  0.7411765  0.7411765 ]
   [0.8196079  0.8196079  0.8196079 ]
   [0.6745098  0.6745098  0.6745098 ]
   ...
   [0.2901961  0.2901961  0.2901961 ]
   [0.18431373 0.18431373 0.18431373]
   [0.11764707 0.11764707 0.11764707]]

  [[0.22352943 0.22352943 0.22352943]
   [0.3019608  0.3019608  0.3019608 ]
   [0.32941177 0.32941177 0.32941177]
   ...
   [0.27450982 0.27450982 0.27450982]
   [0.28627452 0.28627452 0.28627452]
   [0.1254902  0.1254902  0.1254902 ]]

  [[0.32941177 0.32941177 0.32941177]
   [0.27058825 0.27058825 0.27058825]
   [0.38823533 0.38823533 0.38823533]
   ...
   [0.23529413 0.23529413 0.23529413]
   [0.16470589 0.16470589 0.16470589]
   [0.28235295 0.28235295 0.28235295]]

  ...

  [[0.59607846 0.59607846 0.59607846]
   [0.50980395 0.50980395 0.50980395]
   [0.47058827 0.47058827 0.47058827]
   ...
   [0.58431375 0.58431375 0.58431375]
   [0.627451   0.627451   0.627451  ]
   [0.54509807 0.54509807 0.54509807]]

  [[0.76470596 0.76470596 0.76470596]
 

In [15]:
img, a = next(train_gen)
print(type(img), type(a))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [23]:
print(train_df[0:5]['Question'].to_numpy().shape)

(5,)
