In [47]:
from vqa import VQA
import random
import skimage.io as io
import matplotlib.pyplot as plt
import os
import json
import numpy as np
from tqdm import tqdm

import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras import Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.optimizers import Adam

# Hyperparameters

In [48]:
img_size = 224 # inception default which seems to also be smaller than all the images
img_input_shape = (img_size, img_size, 3)
img_target_size = (img_size, img_size)

num_words = 1000
pad_size = 20
qstn_input_shape = (pad_size, 1)

learning_rate = 1e-3
batch_size = 64
epochs=1000

# Preprocess

In [49]:
id_tuples = []
with open('./VQA/Annotations/v2_mscoco_train2014_annotations.json') as f:
    data = json.load(f)
    for annotation in data['annotations']:
        if annotation['answer_type'] == 'yes/no':
            id_tuples.append((annotation['image_id'], annotation['question_id'],  annotation['multiple_choice_answer']))   

questions = {}
with open('./VQA/Questions/v2_OpenEnded_mscoco_train2014_questions.json') as f:
    data = json.load(f)
    for question in data['questions']:
        questions[question['question_id']] = question['question']
        
train_data = []
for id_tuple in tqdm(id_tuples):
    question = questions[id_tuple[1]]
    img = './VQA/Images/train2014/COCO_train2014_' + str(id_tuple[0]).zfill(12) + '.jpg'
    train_data.append((img, question, id_tuple[-1]))

100%|██████████████████████████████████████████████████████████████████████| 166882/166882 [00:00<00:00, 975134.98it/s]


In [50]:
train_df = pd.DataFrame(data=train_data, columns=['Image', 'Question', 'Answer'])
train_df = train_df[(train_df.Answer == 'yes') | (train_df.Answer == 'no')]
train_df.head()

Unnamed: 0,Image,Question,Answer
0,./VQA/Images/train2014/COCO_train2014_00000045...,Is this man a professional baseball player?,yes
1,./VQA/Images/train2014/COCO_train2014_00000052...,Is the dog waiting?,yes
2,./VQA/Images/train2014/COCO_train2014_00000039...,Is the sky blue?,yes
3,./VQA/Images/train2014/COCO_train2014_00000039...,Is there snow on the mountains?,yes
4,./VQA/Images/train2014/COCO_train2014_00000039...,Is the window open?,yes


In [51]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_df.Question)
train_qstn = tokenizer.texts_to_sequences(train_df.Question)
train_qstn = pad_sequences(train_qstn, maxlen=pad_size)

In [52]:
datagen = ImageDataGenerator(rescale=1./255)

train_gen = datagen.flow_from_dataframe(
    dataframe=train_df, 
    directory='.', 
    x_col='Image', 
    y_col='Answer',  
    target_size=img_target_size, 
    class_mode='binary',
    batch_size=batch_size,
    shuffle=False
)

Found 166878 validated image filenames belonging to 2 classes.


# Model

In [53]:
# Image 
vgg = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=img_input_shape,
    pooling=True,
)

for layer in vgg.layers: layer.trainable = False

img_x = vgg.output
img_x = Flatten()(img_x)
img_x = BatchNormalization()(img_x)
img_x = Dense(1024, activation='relu')(img_x)
img_x = BatchNormalization()(img_x)
img_output = Dense(1024, activation='relu')(img_x)

# Question
qstn_input = Input(shape=qstn_input_shape)
qstn_x = LSTM(64, activation='tanh')(qstn_input)
qstn_x = BatchNormalization()(qstn_x)
qstn_x = Dense(1024, activation='relu')(qstn_x)
qstn_x = BatchNormalization()(qstn_x)
qstn_output = Dense(1024, activation='relu')(qstn_x)

concat = Concatenate(axis=1)([img_output, qstn_output])
x = Dense(1024, activation='relu')(concat)
x = BatchNormalization()(x)
x = Dense(512, activation='relu')(x)
x = BatchNormalization()(x)
x = Dense(512, activation='relu')(x)
x = BatchNormalization()(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(
    inputs=[vgg.input, qstn_input], 
    outputs=output, 
    name='BiModal_VQA'
)

model.summary()

Model: "BiModal_VQA"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_5[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]               
________________________________________________________________________________________

In [54]:
model.compile(
    optimizer=Adam(lr=learning_rate),
    loss='binary_crossentropy',
    metrics='accuracy'
)

In [40]:
model.fit(
    x=[train_gen, train_qstn],
    epochs=epochs,
    batch_size=batch_size
)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>", "<class 'tensorflow.python.keras.preprocessing.image.DataFrameIterator'>"}), <class 'NoneType'>

In [46]:
train_generator = []

for image, answer in train_gen:
    
    train_generator.append( ( (image, ), answer) )
    print(answer)
    break
    

[0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0.
 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1.
 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1.]


In [55]:
len(train_gen)

2608

In [57]:
train_qstn = train_qstn.reshape((len(train_gen), -1))
train_qstn.shape

ValueError: cannot reshape array of size 3337560 into shape (2608,newaxis)