In [1]:
# Make IPython display the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import random

import numpy as np
import tensorflow as tf

# Seed every random number generator this notebook draws from, so that runs
# are repeatable: TensorFlow initialises the weights, np.random drives the
# train/validation split and batch sampling, and the stdlib `random` module
# picks the relation positions.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 12
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Get current working directory
cwd = os.getcwd()

# Set GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
# Root folder of the competition dataset on Kaggle.
dataset_dir = os.path.join('/kaggle', 'input', 'ann-and-dl-vqa', 'dataset_vqa')

# Question/answer annotation files.
train_json, test_json = [
    os.path.join(dataset_dir, name) for name in ('train_data.json', 'test_data.json')
]

# Image folders.
train_dir, test_dir = [
    os.path.join(dataset_dir, name) for name in ('train', 'test')
]

print(dataset_dir, train_dir, sep='\n')

/kaggle/input/ann-and-dl-vqa/dataset_vqa
/kaggle/input/ann-and-dl-vqa/dataset_vqa/train


In [4]:
from __future__ import print_function
import json
import os.path
import random as ra
import tensorflow as tf
import numpy as np
import keras
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import Input, Dense, Dropout, BatchNormalization, Reshape, Lambda, Embedding, LSTM, Conv2D, MaxPooling2D, TimeDistributed, RepeatVector, Concatenate
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from scipy import ndimage, misc
import imageio
from PIL import Image
from skimage.transform import rotate


Using TensorFlow backend.


In [5]:
#
# Loads & preprocesses the VQA training data, one random batch at a time.
#
def load_data_generator(n, data, batch_size, vocab_size, sequence_length, tokenizer=None):
    """Endlessly yield random batches of ([questions, images], answers).

    Args:
        n: Unused; kept so existing callers keep working.
        data: 1-D sequence of question records. Each record is indexed with
            the keys 'image_filename', 'question' and 'answer'.
        batch_size: Number of records drawn (with replacement) per batch.
        vocab_size: `num_words` for the Tokenizer created when none is given.
        sequence_length: Length the encoded questions are padded/truncated to.
        tokenizer: Optional Keras Tokenizer. A tokenizer that already has a
            vocabulary is used as-is; otherwise it is fitted ONCE on all
            questions in `data`.

    Yields:
        ([x_text, x_image], y) where x_text is the padded integer encoding of
        the questions, x_image is the array of RGB images read from
        `train_dir`, and y is the one-hot encoding of the answers.

    Raises:
        KeyError: if a record's answer is not one of the 13 known labels.
    """
    images_path = train_dir
    num_labels = 13  # Number of answer classes
    # Fixed answer -> class index mapping
    labels = {'0': 0, '1': 1, '10': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'no': 11,'yes': 12}

    # Fit the vocabulary a single time. Re-fitting on every batch (as was done
    # before) keeps updating the word counts, so the word -> index mapping the
    # network sees could change from one batch to the next.
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size)
    if not getattr(tokenizer, 'word_index', None):
        tokenizer.fit_on_texts([q['question'] for q in data])

    while True:
        x_text = []     # Questions of this batch
        x_image = []    # Images of this batch
        y = []          # Answer class indices of this batch
        images = {}     # Per-batch cache, so an image is read at most once

        batch_records = np.random.choice(a=data, size=batch_size)

        for q in batch_records:
            filename = q['image_filename']
            if filename not in images:
                images[filename] = imageio.imread(os.path.join(images_path, filename), pilmode='RGB')

            x_text.append(q['question'])
            x_image.append(images[filename])
            y.append(labels[q['answer']])

        # Convert the questions into padded integer sequences for the LSTM
        sequences = tokenizer.texts_to_sequences(x_text)
        x_text = sequence.pad_sequences(sequences, maxlen=sequence_length)

        # Stack the images into one array
        # (assumes every image has the same shape -- TODO confirm)
        x_image = np.array(x_image)

        # Convert labels to categorical (one-hot) labels
        y = keras.utils.to_categorical(y, num_labels)

        yield ([x_text, x_image], y)


In [6]:
import math

#
# Preprocesses the input images by resizing them to 128x128.
#
def process_image(x):
    """Resize an image tensor to 128x128 using area interpolation.

    No cropping or rotation is applied: the previous body computed a random
    rotation angle but never used it, so that dead computation was removed.

    Args:
        x: Image tensor accepted by `tf.image.resize` (presumably
            [batch, height, width, channels] as fed by the Lambda layer --
            TODO confirm).

    Returns:
        The resized tensor.
    """
    target_height, target_width = 128, 128
    return tf.image.resize(x, (target_height, target_width), method=tf.image.ResizeMethod.AREA)


In [7]:
#
# Returns relation vectors from an input convolution tensor map.
# A relation vector is the concatenation of two objects,
#     in this case the objects are "pixels" of the tensor.
#

def get_relation_vectors(x):
    """Build pairwise relation vectors from randomly chosen feature-map cells.

    Up to k = 25 distinct (row, col) positions of the feature map are picked
    at random. The feature vector at each position is an "object". Every
    unordered pair of objects (including an object paired with itself) is
    concatenated along the feature axis.

    NOTE(review): the positions are drawn with Python's `random` module when
    this function runs, so presumably they are fixed once the Lambda layer has
    built its graph rather than re-drawn per batch -- confirm.

    Args:
        x: 4-D tensor indexed as x[:, row, col, :] whose row/col sizes are
            statically known.

    Returns:
        Tensor of shape [batch, relation_ID, 2 * features].
    """
    shape = K.int_shape(x)
    k = 25     # Hyperparameter which controls how many objects are considered

    # Draw k unique positions without replacement. Clamping k to the number of
    # available positions avoids the endless loop the previous rejection
    # sampling ran into on feature maps with fewer than k cells.
    positions = [(i, j) for i in range(shape[1]) for j in range(shape[2])]
    keys = ra.sample(positions, min(k, len(positions)))
    objects = [x[:, i, j, :] for (i, j) in keys]

    # Concatenate each pair of objects to form a relation vector
    relations = [
        K.concatenate([objects[i], objects[j]], axis=1)
        for i in range(len(objects))
        for j in range(i, len(objects))
    ]

    # Restack objects into Keras tensor [batch, relation_ID, relation_vectors]
    return K.permute_dimensions(K.stack(relations, axis=0), [1, 0, 2])


In [8]:
#
# Environment Parameters
#
samples = 259492
epochs = 100
batch_size = 128
valid_batch_size = 32
learning_rate = .00025
vocab_size = 1024
sequence_length = 64
img_rows, img_cols = 320, 480
image_input_shape = (img_rows, img_cols, 3)
num_labels = 13

#
# Load the training questions
#
questions_path = train_json
with open(questions_path) as f:
    data = np.array(json.load(f)['questions'])

#
# Random train / validation split
#
# The mask is sized from the data actually loaded rather than from the
# hard-coded `samples`, so the boolean indexing below cannot fail when the
# JSON holds a different number of questions.
train_test_split = 0.8
train_mask = np.random.choice([True,False], len(data), p=[train_test_split, 1-train_test_split])

valid_mask = np.logical_not(train_mask)

data_train = data[train_mask]
data_valid = data[valid_mask]

# One tokenizer, fitted on every training question, is handed to both
# generators so that training and validation questions are encoded with the
# same vocabulary object instead of two independently built ones.
question_tokenizer = Tokenizer(num_words=vocab_size)
question_tokenizer.fit_on_texts([q['question'] for q in data])

train_gen = load_data_generator(samples, data_train, batch_size, vocab_size, sequence_length,
                                tokenizer=question_tokenizer)
valid_gen = load_data_generator(samples, data_valid, valid_batch_size, vocab_size, sequence_length,
                                tokenizer=question_tokenizer)


In [9]:
#
# Define LSTM
#
# Question branch: a sequence of `sequence_length` token ids is embedded into
# 128-d vectors and passed to an LSTM with 128 units.
text_inputs = Input(shape=(sequence_length,), name='text_input')
text_x = Embedding(vocab_size, 128)(text_inputs)
text_x = LSTM(128)(text_x)

In [10]:
#
# Define CNN
#
# Image branch: resize inside the graph, then four identical
# (3x3 stride-2 convolution with 24 filters -> batch normalisation) stages.
image_inputs = Input(shape=image_input_shape, name='image_input')
image_x = Lambda(process_image)(image_inputs)
for _conv_stage in range(4):
    image_x = Conv2D(24, kernel_size=(3, 3), strides=2, activation='relu')(image_x)
    image_x = BatchNormalization()(image_x)

# Static shape of the final feature map, used to size the relation network.
shape = K.int_shape(image_x)

In [11]:
#
# Define Relation Network layer
#
# RN is a small MLP (four 256-unit ReLU layers, dropout before the last one)
# that is applied to one relation at a time. Its input width is two object
# vectors (2 * channels of the CNN feature map) plus the question vector.
# NOTE(review): the declared input shape carries a leading axis of size 1,
# while TimeDistributed feeds RN one relation vector per step; presumably this
# works because Dense only acts on the last axis -- confirm.
RN_inputs = Input(shape=(1, (2 * shape[3]) + K.int_shape(text_x)[1]))
RN_x = Dense(256, activation='relu')(RN_inputs)
RN_x = Dense(256, activation='relu')(RN_x)
RN_x = Dense(256, activation='relu')(RN_x)
RN_x = Dropout(.5)(RN_x)
RN_outputs = Dense(256, activation='relu')(RN_x)
RN = Model(inputs=RN_inputs, outputs=RN_outputs)


In [12]:
#
# Implements g_theta
#
relations = Lambda(get_relation_vectors)(image_x)           # Tensor [batch, relation_ID, relation_vector]
question = RepeatVector(K.int_shape(relations)[1])(text_x)  # Repeat the question vector once per relation
relations = Concatenate(axis=2)([relations, question])      # Append the question along the feature axis: [batch, relation_ID, relation_vector + question_vector]
g = TimeDistributed(RN)(relations)                          # TimeDistributed applies RN to each relation vector.
g = Lambda(lambda x: K.sum(x, axis=1))(g)                   # Sum over relation_ID

#
# Define f_phi
#
# Classifier head: two 256-unit ReLU layers with dropout, then a softmax over
# the `num_labels` answer classes.
f = Dense(256, activation='relu')(g)
f = Dropout(.5)(f)
f = Dense(256, activation='relu')(f)
f = Dropout(.5)(f)
outputs = Dense(num_labels, activation='softmax')(f)

In [13]:
#
# Build the full model (question + image -> answer distribution)
#
model = Model(inputs=[text_inputs, image_inputs], outputs=outputs)
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would add a stray "None" line.
model.summary()


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image_input (InputLayer)        (None, 320, 480, 3)  0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 128, 128, 3)  0           image_input[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 63, 63, 24)   672         lambda_1[0][0]                   
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 63, 63, 24)   96          conv2d_1[0][0]                   
____________________________________________________________________________________________

In [14]:
# Configure training: Adam with the learning rate from the parameters cell,
# cross-entropy over the one-hot answers, and accuracy as the reported metric.
model.compile(optimizer=Adam(lr=learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
#
# Train model
#
# The number of epochs comes from the `epochs` setting of the parameters cell
# instead of a duplicated literal, and the returned History object is kept so
# the loss/accuracy curves can be inspected afterwards.
# NOTE(review): validation_steps=1 means val_loss (which EarlyStopping
# monitors) is measured on a single validation batch per epoch -- consider
# raising it for a less noisy stopping signal.
history = model.fit_generator(train_gen,
            epochs=epochs,
            steps_per_epoch=240,
            validation_data=valid_gen,
            validation_steps=1,
            callbacks=[EarlyStopping(monitor='val_loss', patience=5)])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/100
  9/240 [>.............................] - ETA: 7:36 - loss: 36.1080 - accuracy: 0.1181

In [None]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):
    """Write the predictions to a timestamped submission CSV.

    The file is named 'results_<Mon><day>_<H>-<M>-<S>.csv' and contains the
    header 'Id,Category' followed by one 'key,value' line per entry of
    `results`, in the dictionary's iteration order.

    Args:
        results: Mapping from question id to predicted category.
        results_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the file that was written.
    """
    csv_fname = 'results_'
    csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    # An empty results_dir means "current directory"; makedirs rejects ''.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    csv_path = os.path.join(results_dir, csv_fname)

    with open(csv_path, 'w') as f:
        f.write('Id,Category\n')
        f.writelines(str(key) + ',' + str(value) + '\n' for key, value in results.items())

    # One summary line instead of echoing every row into the notebook output.
    print('Wrote', len(results), 'rows to', csv_path)
    return csv_path

In [None]:
def load_data_test(n, vocab_size, sequence_length, tokenizer=None):
    """Load and encode the first `n` test questions together with their images.

    Args:
        n: Number of questions to take from the start of the test JSON.
        vocab_size: `num_words` for the Tokenizer created when none is given.
        sequence_length: Length the encoded questions are padded/truncated to
            (previously ignored in favour of a hard-coded 64).
        tokenizer: Optional already-fitted Keras Tokenizer. Pass the tokenizer
            used for training so the word indices match what the model was
            trained on. When omitted, a new tokenizer is fitted on the test
            questions themselves, as before.

    Returns:
        ([x_text, x_image, x_id], num_labels, tokenizer) where x_text is the
        padded integer encoding of the questions, x_image the array of RGB
        images, x_id the list of question ids, num_labels is always 0 (the
        test set has no answers) and tokenizer is the tokenizer that was used.
    """
    questions_path = test_json
    images_path = test_dir
    x_text = []     # List of questions
    x_image = []    # List of images
    x_id = []       # List of question ids
    num_labels = 0  # No labels in the test set; returned for interface compatibility
    images = {}     # Dictionary of images, to minimize number of imread ops

    print('Loading data...')

    with open(questions_path) as f:
        data = json.load(f)['questions'][0:n]

    for q in data:
        filename = q['image_filename']
        # Read each distinct image only once
        if filename not in images:
            images[filename] = imageio.imread(os.path.join(images_path, filename), pilmode="RGB")

        x_text.append(q['question'])
        x_image.append(images[filename])
        x_id.append(q['question_id'])

    # Convert question corpus into sequential encoding for LSTM
    print('Processing text data...')
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(x_text)

    sequences = tokenizer.texts_to_sequences(x_text)
    x_text = sequence.pad_sequences(sequences, maxlen=sequence_length)

    # Convert x_image to np array
    x_image = np.array(x_image)

    print('Text: ', x_text.shape)
    print('Image: ', x_image.shape)

    return [x_text, x_image, x_id], num_labels, tokenizer

In [None]:
# Predict the answer class of the test questions and write the submission file.
# NOTE(review): only the first 3000 test questions are loaded here -- confirm
# this covers the whole test set.
(texts, images, ids), _, _ = load_data_test(3000, vocab_size, sequence_length)
out_softmax = model.predict([texts, images])
prediction = np.argmax(out_softmax, axis=-1)   # predicted class index per question
print("prediction:")
print(prediction[:10])

# Pair every question id with its predicted class. zip() replaces the manual
# counter and avoids shadowing the builtin `id`.
results = {question_id: int(label) for question_id, label in zip(ids, prediction)}
create_csv(results)