## Imports

In [1]:
import pandas as pd

import tensorflow as tf
print(f"GPUs: {len(tf.config.list_physical_devices('GPU'))}")

from tensorflow.keras import Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import Precision, Recall, AUC

from DataGen import VQASequence
from json_to_df import gen_df

GPUs: 1


## Data

In [2]:
# Load one table per dataset split (presumably pandas DataFrames, going by the
# names). The generator expression is consumed left to right, so gen_df is
# called for 'train' first and 'val' second.
train_df, val_df = (gen_df(split) for split in ('train', 'val'))

## Hyperparameters

In [3]:
# --- Input geometry ---------------------------------------------------------
# 224x224 is the default input resolution of VGG16 (the backbone used in the
# Model section) -- not of Inception, whose default is 299x299. The original
# note also reports that 224 is smaller than every image in the dataset
# (not verified in this notebook).
img_size = 224
img_input_shape = (img_size, img_size, 3)  # square image with 3 channels
# (timesteps, features): variable-length sequence of 300-dimensional vectors
# (presumably word embeddings -- confirm against VQASequence).
qstn_input_shape = (None, 300)

# --- Optimisation -----------------------------------------------------------
learning_rate = 1e-2  # initial learning rate handed to the optimizer
batch_size = 512      # batch size handed to the VQASequence generators
epochs = 100          # maximum number of epochs passed to model.fit

In [None]:
# One batch generator per split; both share the same batch size and image
# size. The training generator is constructed first, then the validation one.
train_gen, val_gen = (
    VQASequence(batch_size, split_df, img_size)
    for split_df in (train_df, val_df)
)

## Model

In [None]:
# ---------------------------------------------------------------------------
# Image branch: VGG16 convolutional backbone with ImageNet weights and without
# its classifier head (include_top=False), followed by a small dense head.
# ---------------------------------------------------------------------------
vgg = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=img_input_shape,
    pooling='max'
)

# Freeze every backbone layer so that only the layers added below are trained.
for layer in vgg.layers: layer.trainable = False

img_x = vgg.output
# NOTE(review): with pooling='max' the backbone applies global max pooling, so
# its output should already be a flat per-image vector and this Flatten is
# expected to be a no-op -- confirm in model.summary() before removing it.
img_x = Flatten()(img_x)
img_x = Dense(1024, activation='relu')(img_x)
img_x = BatchNormalization()(img_x)
img_output = Dense(1024, activation='relu')(img_x)  # 1024-d image representation

# ---------------------------------------------------------------------------
# Question branch: a sequence of float32 vectors (shape given by
# qstn_input_shape) is summarised by an LSTM, then passed through a dense head.
# ---------------------------------------------------------------------------
qstn_input = Input(shape=qstn_input_shape, dtype="float32")
# return_sequences is left at its default, so the LSTM emits a single 2048-d
# vector per input sequence rather than one vector per timestep.
qstn_x = LSTM(2048, activation='tanh')(qstn_input)
qstn_x = BatchNormalization()(qstn_x)
qstn_x = Dense(1024, activation='relu')(qstn_x)
qstn_x = BatchNormalization()(qstn_x)
qstn_output = Dense(1024, activation='relu')(qstn_x)  # 1024-d question representation

# ---------------------------------------------------------------------------
# Fusion head: concatenate both 1024-d representations along the feature axis
# and run them through three Dense(1024) + BatchNormalization stages.
# ---------------------------------------------------------------------------
concat = Concatenate(axis=1)([img_output, qstn_output])
x = Dense(1024, activation='relu')(concat)
x = BatchNormalization()(x)
x = Dense(1024, activation='relu')(x)
x = BatchNormalization()(x)
x = Dense(1024, activation='relu')(x)
x = BatchNormalization()(x)
# Single sigmoid unit: one value in (0, 1) per (image, question) pair.
output = Dense(1, activation='sigmoid')(x)

# Two-input model: the image tensor feeds the VGG16 graph, the question tensor
# feeds the LSTM branch. Inputs must be supplied in this order.
model = Model(
    inputs=[vgg.input, qstn_input], 
    outputs=output, 
    name='BiModal_VQA'
)

model.summary()

In [None]:
# Configure training for the single-sigmoid output: binary cross-entropy loss,
# tracked with accuracy, AUC, precision and recall.
# `learning_rate` is the supported keyword for the optimizer; the older `lr`
# alias is deprecated and is rejected by recent Keras releases.
model.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='binary_crossentropy',
    metrics=['accuracy', AUC(), Precision(), Recall()]
)

# Both callbacks keep the Keras defaults for everything except `patience`
# (by default they monitor `val_loss`): the learning rate is reduced after
# 2 epochs without improvement, and training stops after 4.
rp = ReduceLROnPlateau(patience=2)
es = EarlyStopping(patience=4)

callbacks = [rp, es]

In [None]:
%%time

hist = model.fit(
    x=train_gen,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=val_gen,
    callbacks=callbacks
)