# [NIPS Implementation Challenge](https://nurture.ai/nips-challenge/p/089570b9-fe63-43af-8a25-76117d2a1c21)

Implementation of [A simple neural network module
for relational reasoning](https://papers.nips.cc/paper/7082-a-simple-neural-network-module-for-relational-reasoning.pdf) by [Vikram Voleti](https://voletiv.github.io).

In [1]:
import json
import numpy as np
import string
import tqdm

import keras.backend as K
from keras.layers import Layer, Input, Reshape, Embedding, LSTM, Dense, Lambda, Dropout
from keras.models import Model
from keras.optimizers import Adam

# from pprint import pprint

Using TensorFlow backend.


In [2]:
# KERAS OPTIONS
# set_image_data_format is a backend function and has to be *called*.
# Assigning a string to it would only overwrite the function object on the
# backend module and leave the actual setting untouched.
K.set_image_data_format("channels_last")

In [3]:
# FILE NAMES
# All CLEVR files live under one dataset root
_clevr_root = '/home/voletiv/Datasets/CLEVR_v1.0'
# Scenes (state descriptions)
state_description_train_file = _clevr_root + '/scenes/CLEVR_train_scenes.json'
state_description_val_file = _clevr_root + '/scenes/CLEVR_val_scenes.json'
# Questions
questions_train_file = _clevr_root + '/questions/CLEVR_train_questions.json'
questions_val_file = _clevr_root + '/questions/CLEVR_val_questions.json'
questions_test_file = _clevr_root + '/questions/CLEVR_test_questions.json'

In [4]:
# RELATION NETWORK PARAMS
# Size of each word-embedding vector fed to the question LSTM
WORD_EMBEDDING_DIM = 32
# Number of units of the question LSTM
LSTM_UNITS = 256
# Widths of the four Dense layers of g_theta
G_FC1, G_FC2, G_FC3, G_FC4 = 512, 512, 512, 512
# f_phi sizes; F_FC3 is passed on as the answer-vocabulary length.
# NOTE(review): the answer vocabulary computed in this notebook has 28 entries,
# while F_FC3 is 29 — confirm which value the model is finally built with.
F_FC1, F_FC2, F_DROPOUT2, F_FC3 = 512, 1024, 0.02, 29
batch_size = 64
epochs = 100
# Adam optimizer with learning rate 1e-4
optimizer = Adam(lr=1e-4)

# 1 Dealing with questions and answers

## 1.1 Reading questions from json files

In [5]:
# Read .json files
# Use context managers so every file handle is closed as soon as it is parsed
with open(questions_train_file) as f:
    questions_train = json.load(f)
with open(questions_val_file) as f:
    questions_val = json.load(f)
with open(questions_test_file) as f:
    questions_test = json.load(f)

In [6]:
questions_train["questions"][0]

{'answer': 'yes',
 'image_filename': 'CLEVR_train_000000.png',
 'image_index': 0,
 'program': [{'function': 'scene', 'inputs': [], 'value_inputs': []},
  {'function': 'filter_size', 'inputs': [0], 'value_inputs': ['large']},
  {'function': 'filter_color', 'inputs': [1], 'value_inputs': ['green']},
  {'function': 'count', 'inputs': [2], 'value_inputs': []},
  {'function': 'scene', 'inputs': [], 'value_inputs': []},
  {'function': 'filter_size', 'inputs': [4], 'value_inputs': ['large']},
  {'function': 'filter_color', 'inputs': [5], 'value_inputs': ['purple']},
  {'function': 'filter_material', 'inputs': [6], 'value_inputs': ['metal']},
  {'function': 'filter_shape', 'inputs': [7], 'value_inputs': ['cube']},
  {'function': 'count', 'inputs': [8], 'value_inputs': []},
  {'function': 'greater_than', 'inputs': [3, 9], 'value_inputs': []}],
 'question': 'Are there more big green things than large purple shiny cubes?',
 'question_family_index': 2,
 'question_index': 0,
 'split': 'train'}

In [7]:
questions_train["questions"][1]

{'answer': '2',
 'image_filename': 'CLEVR_train_000000.png',
 'image_index': 0,
 'program': [{'function': 'scene', 'inputs': [], 'value_inputs': []},
  {'function': 'filter_size', 'inputs': [0], 'value_inputs': ['small']},
  {'function': 'filter_color', 'inputs': [1], 'value_inputs': ['cyan']},
  {'function': 'filter_material', 'inputs': [2], 'value_inputs': ['rubber']},
  {'function': 'unique', 'inputs': [3], 'value_inputs': []},
  {'function': 'same_shape', 'inputs': [4], 'value_inputs': []},
  {'function': 'count', 'inputs': [5], 'value_inputs': []}],
 'question': 'How many other things are there of the same shape as the tiny cyan matte object?',
 'question_family_index': 43,
 'question_index': 1,
 'split': 'train'}

## 1.2 Finding maximum length of question

In [8]:
# Max question length (in whitespace-separated words) over the training questions
max_question_length = max(
    (len(question['question'].split())
     for question in tqdm.tqdm(questions_train["questions"])),
    default=0)

100%|██████████| 699989/699989 [00:01<00:00, 570355.97it/s]


In [9]:
max_question_length

43

In [10]:
# Check max question length in val (in whitespace-separated words)
val_max_question_length = max(
    (len(question['question'].split())
     for question in tqdm.tqdm(questions_val["questions"])),
    default=0)
print(val_max_question_length)

100%|██████████| 149991/149991 [00:00<00:00, 491482.06it/s]

43





In [11]:
# Check max question length in test (in whitespace-separated words)
test_max_question_length = max(
    (len(question['question'].split())
     for question in tqdm.tqdm(questions_test["questions"])),
    default=0)
print(test_max_question_length)

100%|██████████| 149988/149988 [00:00<00:00, 554443.35it/s]

42





## 1.3 Finding question vocabulary

In [12]:
# To remove punctuation:
# translation table that deletes every character listed in string.punctuation
table = str.maketrans('', '', string.punctuation)

In [13]:
# Question corpus: every lower-cased, punctuation-stripped word of every training question
question_corpus = []

for q in tqdm.tqdm(questions_train["questions"]):
    words = q['question'].lower().split()
    question_corpus.extend(word.translate(table) for word in words)

100%|██████████| 699989/699989 [00:07<00:00, 88393.57it/s]


In [14]:
len(question_corpus)

12867378

In [15]:
# Question vocabulary: unique corpus words in sorted order,
# so that a word's position in this list is deterministic
question_vocabulary = sorted(set(question_corpus))

In [16]:
len(question_vocabulary)

80

In [17]:
question_vocabulary

['a',
 'an',
 'and',
 'another',
 'any',
 'anything',
 'are',
 'as',
 'ball',
 'balls',
 'behind',
 'big',
 'block',
 'blocks',
 'blue',
 'both',
 'brown',
 'color',
 'cube',
 'cubes',
 'cyan',
 'cylinder',
 'cylinders',
 'do',
 'does',
 'either',
 'else',
 'equal',
 'fewer',
 'front',
 'gray',
 'greater',
 'green',
 'has',
 'have',
 'how',
 'in',
 'is',
 'it',
 'its',
 'large',
 'left',
 'less',
 'made',
 'many',
 'material',
 'matte',
 'metal',
 'metallic',
 'more',
 'number',
 'object',
 'objects',
 'of',
 'on',
 'or',
 'other',
 'purple',
 'red',
 'right',
 'rubber',
 'same',
 'shape',
 'shiny',
 'side',
 'size',
 'small',
 'sphere',
 'spheres',
 'than',
 'that',
 'the',
 'there',
 'thing',
 'things',
 'tiny',
 'to',
 'visible',
 'what',
 'yellow']

In [18]:
# Checking question vocabulary with val
val_question_corpus = []
for q in tqdm.tqdm(questions_val["questions"]):
    val_question_corpus.extend(w.translate(table) for w in q['question'].lower().split())
val_question_vocabulary = sorted(set(val_question_corpus))

# First compare sizes, then compare word by word and report the first mismatch
equal_vocabs = len(val_question_vocabulary) == len(question_vocabulary)
if not equal_vocabs:
    print("Lengths not same! Vocabs are not equal!")
else:
    for i, (train_word, val_word) in enumerate(zip(question_vocabulary, val_question_vocabulary)):
        if val_word != train_word:
            print("Vocabs not equal at:", i, train_word, val_word)
            equal_vocabs = False
            break
if equal_vocabs:
    print("Vocabs are equal!")

100%|██████████| 149991/149991 [00:01<00:00, 88512.85it/s]

Vocabs are equal!





In [19]:
# Checking question vocabulary with test (just in case of OOV)
test_question_corpus = []
for q in tqdm.tqdm(questions_test["questions"]):
    test_question_corpus.extend(w.translate(table) for w in q['question'].lower().split())
test_question_vocabulary = sorted(set(test_question_corpus))

# First compare sizes, then compare word by word and report the first mismatch
equal_vocabs = len(test_question_vocabulary) == len(question_vocabulary)
if not equal_vocabs:
    print("Lengths not same! Vocabs are not equal!")
else:
    for i, (train_word, test_word) in enumerate(zip(question_vocabulary, test_question_vocabulary)):
        if test_word != train_word:
            print("Vocabs not equal at:", i, train_word, test_word)
            equal_vocabs = False
            break
if equal_vocabs:
    print("Vocabs are equal!")

100%|██████████| 149988/149988 [00:01<00:00, 88310.32it/s]

Vocabs are equal!





## 1.4 Converting questions to lists of word indices

We'll store each question as a list of word indices in reverse word order
(a common practice for LSTM inputs), left-padded with zeros.
For example, if the question is "what is this blue thing?", with word indices 1, 2, 3, 4, 5 respectively,
and *max_q_len* is 7, then the word-indices list will be [0, 0, 5, 4, 3, 2, 1].

In [20]:
# Train
# Build the word -> index map once (indices start at 1; 0 is left for padding)
# instead of doing a linear list.index() search for every single word.
word_to_index = {word: position + 1 for position, word in enumerate(question_vocabulary)}
questions_word_indices_train = np.zeros((len(questions_train["questions"]), max_question_length))
for iq, q in enumerate(tqdm.tqdm(questions_train["questions"])):
    for iw, w in enumerate(q['question'].lower().split()):
        # Fill from the right: words end up reversed, unused leading slots stay 0
        questions_word_indices_train[iq][-iw-1] = word_to_index[w.translate(table)]

100%|██████████| 699989/699989 [00:26<00:00, 26436.38it/s]


In [21]:
# Val
# Build the word -> index map once (indices start at 1; 0 is left for padding)
# instead of doing a linear list.index() search for every single word.
word_to_index = {word: position + 1 for position, word in enumerate(question_vocabulary)}
questions_word_indices_val = np.zeros((len(questions_val["questions"]), max_question_length))
for iq, q in enumerate(tqdm.tqdm(questions_val["questions"])):
    for iw, w in enumerate(q['question'].lower().split()):
        # Fill from the right: words end up reversed, unused leading slots stay 0
        questions_word_indices_val[iq][-iw-1] = word_to_index[w.translate(table)]

100%|██████████| 149991/149991 [00:05<00:00, 26114.02it/s]


In [22]:
# Test
# Build the word -> index map once (indices start at 1; 0 is left for padding)
# instead of doing a linear list.index() search for every single word.
word_to_index = {word: position + 1 for position, word in enumerate(question_vocabulary)}
questions_word_indices_test = np.zeros((len(questions_test["questions"]), max_question_length))
for iq, q in enumerate(tqdm.tqdm(questions_test["questions"])):
    for iw, w in enumerate(q['question'].lower().split()):
        # Fill from the right: words end up reversed, unused leading slots stay 0
        questions_word_indices_test[iq][-iw-1] = word_to_index[w.translate(table)]

100%|██████████| 149988/149988 [00:05<00:00, 26546.25it/s]


In [23]:
q['question']

'There is a yellow cube that is behind the large metal object; is its size the same as the block on the right side of the small gray shiny object?'

In [24]:
questions_word_indices_test[-1]

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,  52.,  64.,  31.,  67.,  72.,  54.,  65.,  60.,  72.,
        55.,  13.,  72.,   8.,  62.,  72.,  66.,  40.,  38.,  52.,  48.,
        41.,  72.,  11.,  38.,  71.,  19.,  80.,   1.,  38.,  73.])

## 1.5 Image indices for each question

In [25]:
# Index of the image (scene) each training question refers to
questions_image_indices_train = np.array(
    [question['image_index'] for question in tqdm.tqdm(questions_train["questions"])],
    dtype=int)

100%|██████████| 699989/699989 [00:00<00:00, 1252530.01it/s]


In [26]:
# Index of the image (scene) each validation question refers to
questions_image_indices_val = np.array(
    [question['image_index'] for question in tqdm.tqdm(questions_val["questions"])],
    dtype=int)

100%|██████████| 149991/149991 [00:00<00:00, 1216673.60it/s]


In [27]:
# Index of the image (scene) each test question refers to
questions_image_indices_test = np.array(
    [question['image_index'] for question in tqdm.tqdm(questions_test["questions"])],
    dtype=int)

100%|██████████| 149988/149988 [00:00<00:00, 1155775.00it/s]


## 1.6 Finding answer vocabulary

In [28]:
# Answers corpus: the lower-cased answer of every training question
answers_corpus = [question['answer'].lower()
                  for question in tqdm.tqdm(questions_train["questions"])]

100%|██████████| 699989/699989 [00:00<00:00, 1202112.18it/s]


In [29]:
# Answer vocabulary: unique answers in sorted order,
# so that an answer's position in this list is deterministic
answers_vocabulary = sorted(set(answers_corpus))

In [30]:
len(answers_vocabulary)

28

In [31]:
answers_vocabulary

['0',
 '1',
 '10',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'blue',
 'brown',
 'cube',
 'cyan',
 'cylinder',
 'gray',
 'green',
 'large',
 'metal',
 'no',
 'purple',
 'red',
 'rubber',
 'small',
 'sphere',
 'yellow',
 'yes']

In [32]:
# Check answer_vocabulary in val
val_answers_corpus = [question['answer'].lower()
                      for question in tqdm.tqdm(questions_val["questions"])]
# Vocab
val_answers_vocabulary = sorted(set(val_answers_corpus))
print(len(val_answers_vocabulary))

# First compare sizes, then compare answer by answer and report the first mismatch
equal_vocabs = len(val_answers_vocabulary) == len(answers_vocabulary)
if not equal_vocabs:
    print("Lengths not same! Vocabs are not equal!")
else:
    for i, (train_answer, val_answer) in enumerate(zip(answers_vocabulary, val_answers_vocabulary)):
        if val_answer != train_answer:
            print("Vocabs not equal at:", i, train_answer, val_answer)
            equal_vocabs = False
            break
if equal_vocabs:
    print("Vocabs are equal!")

100%|██████████| 149991/149991 [00:00<00:00, 1097413.85it/s]

28
Vocabs are equal!





## 1.7 Finding answers

In [33]:
# Train: answer string of every training question, in question order
answers_train = [question['answer']
                 for question in tqdm.tqdm(questions_train["questions"])]

100%|██████████| 699989/699989 [00:00<00:00, 1249439.50it/s]


In [34]:
# Val: answer string of every validation question, in question order
answers_val = [question['answer']
               for question in tqdm.tqdm(questions_val["questions"])]

100%|██████████| 149991/149991 [00:00<00:00, 1179115.89it/s]


## 1.8 Converting answers to one-hot encoded vectors

In [35]:
# Train
# Column (= position in answers_vocabulary) of every answer
answer_columns_train = np.array(
    [answers_vocabulary.index(answer) for answer in tqdm.tqdm(answers_train)],
    dtype=int)
one_hot_answers_train = np.zeros((len(answers_train), len(answers_vocabulary)))
# Set exactly one entry per row
one_hot_answers_train[np.arange(len(answers_train)), answer_columns_train] = 1

100%|██████████| 699989/699989 [00:00<00:00, 776030.60it/s]


In [36]:
# Val
# Column (= position in answers_vocabulary) of every answer
answer_columns_val = np.array(
    [answers_vocabulary.index(answer) for answer in tqdm.tqdm(answers_val)],
    dtype=int)
one_hot_answers_val = np.zeros((len(answers_val), len(answers_vocabulary)))
# Set exactly one entry per row
one_hot_answers_val[np.arange(len(answers_val)), answer_columns_val] = 1

100%|██████████| 149991/149991 [00:00<00:00, 730350.50it/s]


## 1.9 Saving/loading

In [37]:
# Save
# Everything derived from the question files, stored under its variable name
question_answer_arrays = dict(
    max_question_length=max_question_length,
    questions_image_indices_train=questions_image_indices_train,
    questions_word_indices_train=questions_word_indices_train,
    questions_image_indices_val=questions_image_indices_val,
    questions_word_indices_val=questions_word_indices_val,
    questions_image_indices_test=questions_image_indices_test,
    questions_word_indices_test=questions_word_indices_test,
    question_vocabulary=question_vocabulary,
    answers_vocabulary=answers_vocabulary,
    answers_train=answers_train,
    answers_val=answers_val,
    one_hot_answers_train=one_hot_answers_train,
    one_hot_answers_val=one_hot_answers_val,
)
np.savez("question_answer_stuff", **question_answer_arrays)

In [38]:
# Load
# NOTE(review): the NpzFile handle is intentionally left open because later cells
# (not all visible here) may still read from question_answer_stuff.
question_answer_stuff = np.load("question_answer_stuff.npz")
# A scalar saved with np.savez comes back as a 0-d array; restore the plain int
# it was before saving, since it is used as an array/tensor dimension.
max_question_length = int(question_answer_stuff["max_question_length"])
questions_image_indices_train = question_answer_stuff["questions_image_indices_train"]
questions_word_indices_train = question_answer_stuff["questions_word_indices_train"]
questions_image_indices_val = question_answer_stuff["questions_image_indices_val"]
questions_word_indices_val = question_answer_stuff["questions_word_indices_val"]
questions_image_indices_test = question_answer_stuff["questions_image_indices_test"]
questions_word_indices_test = question_answer_stuff["questions_word_indices_test"]
# NOTE(review): answers_vocabulary is a list before saving but an ndarray after
# loading (no .index method) — confirm which type later cells expect.
answers_vocabulary = question_answer_stuff["answers_vocabulary"]
one_hot_answers_train = question_answer_stuff["one_hot_answers_train"]
one_hot_answers_val = question_answer_stuff["one_hot_answers_val"]

# 2 Dealing with state descriptions

## 2.1 Reading the json file containing descriptions

In [39]:
# Read .json files
# Use context managers so every file handle is closed as soon as it is parsed
with open(state_description_train_file) as f:
    state_description_train = json.load(f)
with open(state_description_val_file) as f:
    state_description_val = json.load(f)

In [40]:
state_description_train["scenes"][0]

{'directions': {'above': [0.0, 0.0, 1.0],
  'behind': [-0.754490315914154, 0.6563112735748291, 0.0],
  'below': [-0.0, -0.0, -1.0],
  'front': [0.754490315914154, -0.6563112735748291, -0.0],
  'left': [-0.6563112735748291, -0.7544902563095093, 0.0],
  'right': [0.6563112735748291, 0.7544902563095093, -0.0]},
 'image_filename': 'CLEVR_train_000000.png',
 'image_index': 0,
 'objects': [{'3d_coords': [-1.3705521821975708,
    2.0794010162353516,
    0.699999988079071],
   'color': 'blue',
   'material': 'rubber',
   'pixel_coords': [269, 88, 12.661545753479004],
   'rotation': 269.8517172617167,
   'shape': 'cube',
   'size': 'large'},
  {'3d_coords': [-2.9289753437042236, -1.7488206624984741, 0.699999988079071],
   'color': 'green',
   'material': 'metal',
   'pixel_coords': [93, 108, 11.522202491760254],
   'rotation': 292.2219458666971,
   'shape': 'cylinder',
   'size': 'large'},
  {'3d_coords': [1.5515961647033691, 0.6776641607284546, 0.3499999940395355],
   'color': 'cyan',
   'mate

## 2.2. Count the maximum number of objects in any image

In [41]:
# Count the max number of objects in an image (over all training scenes)
max_number_of_objects_in_scene = max(
    (len(scene['objects']) for scene in tqdm.tqdm(state_description_train['scenes'])),
    default=0)

100%|██████████| 70000/70000 [00:00<00:00, 968115.25it/s]


In [42]:
max_number_of_objects_in_scene

10

## 2.3. Make the *state_description_matrix*

In [43]:
# Functions

def my_color(color_text):
    """Return the [r, g, b] triple (values in 0..1) for a CLEVR color name.

    Returns None when the name is not one of the eight known colors.
    """
    palette = (
        ('gray', (.5, .5, .5)),
        ('blue', (0., 0., 1.)),
        ('brown', (165/255., 42/255., 42/255.)),
        ('yellow', (1., 1., 0.)),
        ('red', (1., 0., 0.)),
        ('green', (0., 1., 0.)),
        ('purple', (.5, 0., .5)),
        ('cyan', (0., 1., 1.)),
    )
    for name, rgb in palette:
        if color_text == name:
            # Hand back a fresh list on every call
            return list(rgb)
    return None
    
def my_shape_index(shape):
    """Return the one-hot position of a shape name (cube=0, sphere=1, cylinder=2).

    Returns None for any other value.
    """
    for position, known_shape in enumerate(('cube', 'sphere', 'cylinder')):
        if shape == known_shape:
            return position
    return None
    
def my_material_index(material):
    """Return the one-hot position of a material name (rubber=0, metal=1).

    Returns None for any other value.
    """
    return 0 if material == 'rubber' else (1 if material == 'metal' else None)
    
def my_size_index(size):
    """Return the one-hot position of a size name (small=0, large=1).

    Returns None for any other value.
    """
    return 0 if size == 'small' else (1 if size == 'large' else None)

In [44]:
def make_state_description_matrix(state_description, max_number_of_objects_in_scene=10):
    """Encode every object of every scene as a 13-dimensional feature vector.

    Feature layout per object:
        [0:3]   3D coordinates (x, y, z)
        [3:6]   color as (r, g, b), via my_color
        [6:9]   one-hot shape ("cube", "sphere", "cylinder"), via my_shape_index
        [9:11]  one-hot material ("rubber", "metal"), via my_material_index
        [11:13] one-hot size ("small", "large"), via my_size_index

    Returns an array of shape (n_scenes, max_number_of_objects_in_scene, 13);
    rows of object slots a scene does not use remain all-zero.
    """
    scenes = state_description['scenes']
    matrix = np.zeros((len(scenes), max_number_of_objects_in_scene, 13))

    for scene_idx, scene in enumerate(tqdm.tqdm(scenes)):
        for object_idx, scene_object in enumerate(scene['objects']):
            # View into the matrix: writes below land directly in `matrix`
            features = matrix[scene_idx, object_idx]
            features[0:3] = scene_object['3d_coords']
            features[3:6] = my_color(scene_object['color'])
            features[6 + my_shape_index(scene_object['shape'])] = 1
            features[9 + my_material_index(scene_object['material'])] = 1
            features[11 + my_size_index(scene_object['size'])] = 1

    return matrix

In [45]:
# Train
# Per-object feature matrix for all training scenes:
# (n_scenes, max_number_of_objects_in_scene, 13)
state_description_matrix_train = make_state_description_matrix(state_description_train, max_number_of_objects_in_scene)

100%|██████████| 70000/70000 [00:02<00:00, 24569.53it/s]


In [46]:
# Find number of dimensions in object
# (length of the last axis of the state description matrix)
object_features_dim = state_description_matrix_train.shape[-1]

In [47]:
# Val
# Per-object feature matrix for all validation scenes, padded to the same
# max_number_of_objects_in_scene as the training matrix
state_description_matrix_val = make_state_description_matrix(state_description_val, max_number_of_objects_in_scene)

100%|██████████| 15000/15000 [00:00<00:00, 24408.48it/s]


In [48]:
state_description_matrix_train.shape

(70000, 10, 13)

In [49]:
state_description_matrix_train[0]

array([[-1.37055218,  2.07940102,  0.69999999,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [-2.92897534, -1.74882066,  0.69999999,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 1.55159616,  0.67766416,  0.34999999,  0.        ,  1.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [-0.25301406, -2.30893254,  0.69999999,  0.64705882,  0.16470588,
         0.16470588,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 1.01889408, -1.93693209,  0.34999999,  0.5       ,  0.5       ,
         0.5       ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.43993688,  2.998752

In [50]:
# Range of 3d coords
# The first three features of every object slot are (x, y, z). The all-zero rows
# of unused object slots are part of the matrix and therefore of these extrema.
coords_train = state_description_matrix_train[:, :, :3]
x_min, y_min, z_min = coords_train.min(axis=(0, 1))
x_max, y_max, z_max = coords_train.max(axis=(0, 1))
for axis_label, low, high in (("x: ", x_min, x_max),
                              ("y: ", y_min, y_max),
                              ("z: ", z_min, z_max)):
    print(axis_label, low, "to", high)

x:  -2.99998259544 to 2.99999952316
y:  -2.99999976158 to 2.99997830391
z:  0.0 to 0.699999988079


In [51]:
# Normalizing 3d coords in train (in cases where objects are present)
# Min-max scale each coordinate axis with the training range; entries that are
# exactly 0 (unused object slots) are multiplied by 0 and thus stay 0.
for coord_axis, (low, high) in enumerate(((x_min, x_max), (y_min, y_max), (z_min, z_max))):
    raw = state_description_matrix_train[:, :, coord_axis]
    state_description_matrix_train[:, :, coord_axis] = (raw - low)/(high - low)*(raw != 0)

In [52]:
# Normalizing 3d coords in val (in cases where objects are present)
# Min-max scale each coordinate axis with the *training* range; entries that are
# exactly 0 (unused object slots) are multiplied by 0 and thus stay 0.
for coord_axis, (low, high) in enumerate(((x_min, x_max), (y_min, y_max), (z_min, z_max))):
    raw = state_description_matrix_val[:, :, coord_axis]
    state_description_matrix_val[:, :, coord_axis] = (raw - low)/(high - low)*(raw != 0)

In [53]:
state_description_matrix_train[0]

array([[ 0.27157254,  0.84656989,  1.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.01183458,  0.20853061,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 0.75859872,  0.61294623,  0.5       ,  0.        ,  1.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.45782945,  0.11517829,  1.        ,  0.64705882,  0.16470588,
         0.16470588,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ],
       [ 0.66981477,  0.17717859,  0.5       ,  0.5       ,  0.5       ,
         0.5       ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.57332162,  0.999795

## 2.4 Saving/loading

In [54]:
# Save
# Scene-derived arrays and sizes, stored under their variable names
state_description_arrays = dict(
    max_number_of_objects_in_scene=max_number_of_objects_in_scene,
    object_features_dim=object_features_dim,
    state_description_matrix_train=state_description_matrix_train,
    state_description_matrix_val=state_description_matrix_val,
)
np.savez("state_description_matrix", **state_description_arrays)

In [55]:
# Load
# NOTE(review): the NpzFile handle is intentionally left open because later cells
# (not all visible here) may still read from state_description_matrix.
state_description_matrix = np.load("state_description_matrix.npz")
# Scalars saved with np.savez come back as 0-d arrays; restore the plain ints
# they were before saving, since they are used as tensor dimensions.
max_number_of_objects_in_scene = int(state_description_matrix["max_number_of_objects_in_scene"])
object_features_dim = int(state_description_matrix["object_features_dim"])
state_description_matrix_train = state_description_matrix["state_description_matrix_train"]
state_description_matrix_val = state_description_matrix["state_description_matrix_val"]

# 3 Make training batches

Each training sample given to the Relation Network is a (scene, question) pair, the scene being image or state_description.

The *scene* or *state_description_matrix* is an *(S x max_number_of_objects_in_scene x object_features_dim)* array, while the questions matrix is a *(Q x max_question_length)* array.

The training batch needs to be *[scenes_input, questions_input]*, where *scenes_input* is an *(n x max_number_of_objects_in_scene x object_features_dim)* array, and *questions_input* is an *(n x max_question_length)* array. Each row consists of a *question* and the *scene* (state_description) of the image corresponding to the question.

**rn_input** = [**scenes_input**, **questions_input**], **rn_output** = [**one_hot_answers**]

**scenes_input** === *(n, max_number_of_objects_in_scene, object_features_dim)*

**questions_input** === *(n, max_question_length)*

**one_hot_answers** === *(n, answer_vocabulary_length)*

In [56]:
# Train samples
# rn_input = [scenes_input_train, questions_input_train]
# scenes_input === n x max_number_of_objects_in_scene x object_features_dim
# questions_input === n x max_question_length
questions_input_train = questions_word_indices_train
# Gather the scene of every question in one vectorized step: indexing with the
# integer array of image indices returns a new (n, objects, features) array,
# replacing the Python-level row-by-row copy loop.
scenes_input_train = state_description_matrix_train[questions_image_indices_train]

100%|██████████| 699989/699989 [00:01<00:00, 613537.53it/s]


In [57]:
# Val samples
questions_input_val = questions_word_indices_val
# Gather the scene of every question in one vectorized step: indexing with the
# integer array of image indices returns a new (n, objects, features) array,
# replacing the Python-level row-by-row copy loop.
scenes_input_val = state_description_matrix_val[questions_image_indices_val]

100%|██████████| 149991/149991 [00:00<00:00, 535444.20it/s]


In [58]:
# Test samples
questions_input_test = questions_word_indices_test
# NOTE(review): no scenes_input_test is built. Only train and val scene files are
# read in this notebook, so state_description_matrix_test does not exist in the
# visible code; the lines below are kept disabled for that reason (presumably).
# scenes_input_test = np.zeros((len(questions_input_test),
#                                state_description_matrix_test.shape[1],
#                                state_description_matrix_test.shape[2]))
# for i, image_index in enumerate(tqdm.tqdm(questions_image_indices_test)):
#     scenes_input_test[i] = state_description_matrix_test[image_index]

## 3.1 Example val batch

In [59]:
# Example validation batch of 50 samples
# The same random row indices are applied to questions, scenes and answers so
# that the three arrays stay aligned.
example_val_idx = np.random.choice(len(questions_input_val), 50)
(questions_input_val_example_batch,
 scenes_input_val_example_batch,
 one_hot_answers_val_example_batch) = (
    split[example_val_idx]
    for split in (questions_input_val, scenes_input_val, one_hot_answers_val))

# 4 Making the Relation Network

We shall first look at individual functions to understand the process.

In [60]:
def relation_network(max_number_of_objects_in_scene=10, object_features_dim=13, max_question_length=43,
                     WORD_EMBEDDING_DIM=32, LSTM_UNITS=256,
                     G_FC1=512, G_FC2=512, G_FC3=512, G_FC4=512,
                     F_FC1=512, F_FC2=1024, F_DROPOUT2=0.02, answers_vocabulary_length=29):
    """Assemble the Relation Network as a single Keras Model.

    Model inputs:
        scenes_input    === (n, max_number_of_objects_in_scene, object_features_dim)
        questions_input === (n, max_question_length)

    Three sub-models are chained:
        1. process_scenes_and_questions: builds every object pair of a scene and
           appends the question features to each pair
           -> (n, max_objects * max_objects, 2*object_features_dim + LSTM_UNITS)
        2. g_model: applies g_theta to each pair+question vector -> (n, G_FC4)
        3. f_phi (defined elsewhere in the file): maps that to the answer output,
           sized by answers_vocabulary_length

    Returns the Model mapping [scenes_input, questions_input] to the f_phi output.
    """
    # Inputs
    scenes_in = Input(shape=(max_number_of_objects_in_scene, object_features_dim,))
    questions_in = Input(shape=(max_question_length,))

    # Object pairs + question features: the input to g_theta
    pair_builder = process_scenes_and_questions(
        max_number_of_objects_in_scene=max_number_of_objects_in_scene,
        object_features_dim=object_features_dim,
        max_question_length=max_question_length,
        WORD_EMBEDDING_DIM=WORD_EMBEDDING_DIM,
        LSTM_UNITS=LSTM_UNITS)
    pairs_and_question = pair_builder([scenes_in, questions_in])

    # G: the question features have LSTM_UNITS dimensions
    g_net = g_model(
        max_number_of_objects_in_scene=max_number_of_objects_in_scene,
        object_features_dim=object_features_dim,
        question_features_dim=LSTM_UNITS,
        G_FC1=G_FC1, G_FC2=G_FC2, G_FC3=G_FC3, G_FC4=G_FC4)
    g_result = g_net(pairs_and_question)

    # F: its last layer size is the answer vocabulary length
    f_net = f_phi(
        input_dim=G_FC4,
        F_FC1=F_FC1, F_FC2=F_FC2, F_DROPOUT2=F_DROPOUT2, F_FC3=answers_vocabulary_length)
    f_result = f_net(g_result)

    return Model(inputs=[scenes_in, questions_in], outputs=[f_result])

In the following sections, I shall create the individual models.

## 4.1 *process_scenes_and_questions*

1) Convert the word_indices in each question to an embedding and pass through an LSTM to make question_features

2) Make all object pairs in each scene and concatenate with question_features

Every object_pair+question_features is the input to g_theta.

In [61]:
def process_scenes_and_questions(max_number_of_objects_in_scene=10,
                                 object_features_dim=13,
                                 max_question_length=43,
                                 WORD_EMBEDDING_DIM=32,
                                 LSTM_UNITS=256,
                                 question_vocabulary_size=80):
    '''
    Build the model that turns (scenes, questions) into the input of g_theta.

    Inputs of the returned model:
        scenes_input === n x max_number_of_objects_in_scene x object_features_dim
        questions_input === n x max_question_length (word indices, 0 = padding)

    Arguments:
        max_number_of_objects_in_scene: number of object slots per scene
        object_features_dim: number of features per object
        max_question_length: number of word-index slots per question
        WORD_EMBEDDING_DIM: size of each word embedding vector
        LSTM_UNITS: size of the question feature vector produced by the LSTM
        question_vocabulary_size: number of distinct question words; word
            indices are expected in 1..question_vocabulary_size

    Returns:
        Keras Model mapping [scenes_input, questions_input] to g_input, i.e. all
        object pairs of each scene with the question features appended
        (built by MakeGInput).
    '''
    # Inputs to model
    scenes_input = Input(shape=(max_number_of_objects_in_scene, object_features_dim,))
    questions_input = Input(shape=(max_question_length,))

    # Make question_features using Embedding+LSTM.
    # Embedding's first argument is the number of distinct indices
    # (largest index + 1), NOT the sequence length: indices run from 0 (the
    # masked padding value) up to question_vocabulary_size.
    question_embeddings = Embedding(question_vocabulary_size + 1,
                                    WORD_EMBEDDING_DIM,
                                    mask_zero=True)(questions_input)
    question_features = LSTM(LSTM_UNITS)(question_embeddings)

    # Make all object pairs and concatenate question_features to each pair
    g_input = MakeGInput()([scenes_input, question_features])

    return Model(inputs=[scenes_input, questions_input], outputs=[g_input])

Since MakeGInput requires a lot of tensor operations, I made it a Keras Layer:

In [62]:
class MakeGInput(Layer):
    '''
    Keras layer that builds the input of g_theta.

    Takes a list of two tensors:
        inputs[0] = scenes_input === (n, max_number_of_objects_in_scene, object_features_dim)
        inputs[1] = question_features === (n, questions_LSTM_dim)
    and returns every ordered pair of object slots of a scene, with the question
    features appended to each pair:
        g_input === (n,
                     max_number_of_objects_in_scene*max_number_of_objects_in_scene,
                     2*object_features_dim + questions_LSTM_dim)
    '''

    def __init__(self, **kwargs):
        # Forward the standard Layer keyword arguments (e.g. name)
        super(MakeGInput, self).__init__(**kwargs)

    def build(self, input_shape):
        self.shape = input_shape
        super(MakeGInput, self).build(input_shape)

    def call(self, inputs, **kwargs):
        scenes_input = inputs[0]
        question_features = inputs[1]

        # Static number of object slots as a plain Python int
        number_of_objects = K.int_shape(scenes_input)[1]

        # MAKE ALL OBJECT PAIRS, AND CONCATENATE QUESTION

        # First of the pair: (n, 1, objects, features), repeated along axis 1
        # -> (n, objects, objects, features)
        first_of_pair = K.repeat_elements(K.expand_dims(scenes_input, axis=1),
                                          rep=number_of_objects, axis=1)

        # Second of the pair: (n, objects, 1, features)
        second_of_pair = K.expand_dims(scenes_input, axis=2)

        # Question features: (n, q) -> (n, 1, q) -> (n, objects, q) -> (n, objects, 1, q)
        question_per_object = K.repeat_elements(K.expand_dims(question_features, axis=1),
                                                rep=number_of_objects, axis=1)
        question_per_object = K.expand_dims(question_per_object, axis=2)

        # Append the question to the second of the pair, then repeat along axis 2
        # -> (n, objects, objects, features + q)
        second_and_question = K.concatenate([second_of_pair, question_per_object], axis=-1)
        second_and_question = K.repeat_elements(second_and_question,
                                                rep=number_of_objects, axis=2)

        # Concatenate all: (n, objects, objects, 2*features + q)
        all_pairs = K.concatenate([first_of_pair, second_and_question], axis=-1)

        # Reshape to have all object pairs, i.e. 10*10=100 elements per
        # scene-question sample. K.int_shape yields plain ints, avoiding the
        # TF1-only Dimension.value attribute.
        pair_features_dim = K.int_shape(all_pairs)[-1]
        return K.reshape(all_pairs,
                         (-1,
                          number_of_objects * number_of_objects,
                          pair_features_dim))

    def compute_output_shape(self, input_shape):
        # This layer changes the shape of its input, so the shape must be
        # declared for Keras' static shape inference.
        scenes_shape, question_shape = input_shape
        number_of_objects = scenes_shape[1]
        number_of_pairs = (None if number_of_objects is None
                           else number_of_objects * number_of_objects)
        if scenes_shape[2] is None or question_shape[1] is None:
            pair_features_dim = None
        else:
            pair_features_dim = 2 * scenes_shape[2] + question_shape[1]
        return (scenes_shape[0], number_of_pairs, pair_features_dim)

## 4.2 *g_model*

*g_model* runs *g_theta* on each object_pair+question_feature in each scene+question sample, and sums up the g_theta_outputs of each scene+question sample.

In [63]:
def g_model(max_number_of_objects_in_scene=10, object_features_dim=13, question_features_dim=256,
            G_FC1=512, G_FC2=512, G_FC3=512, G_FC4=512):
    '''
    Build the model that applies g_theta to every object pair of a
    scene+question sample and sums the results per sample.

    Input  === (n, number_of_object_pairs, features_dim), where
        number_of_object_pairs = max_number_of_objects_in_scene^2 and
        features_dim = 2*object_features_dim + question_features_dim
    Output === the g_theta outputs summed over axis 1 (the object-pair axis)

    G_FC1..G_FC4 are the widths of the four Dense layers inside g_theta.
    '''
    # Sizes derived from the arguments
    pairs_per_sample = max_number_of_objects_in_scene * max_number_of_objects_in_scene
    pair_features_dim = 2 * object_features_dim + question_features_dim

    pairs_in = Input(shape=(pairs_per_sample, pair_features_dim))

    # Fold the object-pair axis into the batch axis:
    # (n, pairs_per_sample, pair_features_dim) -> (n*pairs_per_sample, pair_features_dim),
    # so that g_theta sees one object pair (+ question) per row
    fold_pairs_into_batch = Lambda(condense_batch_size_and_number_of_object_pairs)
    flat_pairs = fold_pairs_into_batch(pairs_in)

    # Run g_theta on every row
    g_theta_net = g_theta(features_dim=pair_features_dim,
                          G_FC1=G_FC1, G_FC2=G_FC2, G_FC3=G_FC3, G_FC4=G_FC4)
    flat_relations = g_theta_net(flat_pairs)

    # Unfold again: (n*pairs_per_sample, G_FC4) -> (n, pairs_per_sample, G_FC4),
    # so that the relations belonging to one sample can be summed together
    unfold_pairs_from_batch = Lambda(expand_batch_size_and_number_of_object_pairs,
                                     arguments={'number_of_object_pairs': pairs_per_sample})
    relations = unfold_pairs_from_batch(flat_relations)

    # Sum over the object pairs of each sample
    relations_sum = Lambda(sum_g_theta_outputs)(relations)

    return Model(inputs=[pairs_in], outputs=[relations_sum])

*g_theta* is a multi-layer perceptron made of 4 Dense layers, each followed by a ReLU activation.

In [64]:
def g_theta(features_dim=282, G_FC1=512, G_FC2=512, G_FC3=512, G_FC4=512):
    '''
    Build g_theta: four stacked Dense layers with relu activation.

    Input  === (n, features_dim)
    Output === (n, G_FC4)

    G_FC1..G_FC4 are the numbers of units of the successive Dense layers.
    '''
    pair_features = Input(shape=(features_dim,))

    # Chain the Dense layers in order; every layer uses relu
    hidden = pair_features
    for units in (G_FC1, G_FC2, G_FC3, G_FC4):
        hidden = Dense(units, activation='relu')(hidden)

    return Model(inputs=[pair_features], outputs=[hidden])

In [65]:
def condense_batch_size_and_number_of_object_pairs(x):
    '''
    Merge all leading axes of x into one, keeping the last axis:
    x === (..., features_dim) -> (-1, features_dim)

    Used in g_model to turn (n, number_of_object_pairs, features_dim) into
    (n*number_of_object_pairs, features_dim).
    '''
    # K.int_shape returns the static shape as plain Python ints, so this does
    # not depend on the TensorFlow-1-only `Dimension.value` attribute
    features_dim = K.int_shape(x)[-1]
    return K.reshape(x, (-1, features_dim))

In [66]:
def expand_batch_size_and_number_of_object_pairs(x, number_of_object_pairs=100):
    '''
    Split the first axis of x back into (samples, object pairs):
    x === (n*number_of_object_pairs, dim) -> (n, number_of_object_pairs, dim)

    number_of_object_pairs: size of the restored object-pair axis.
    '''
    # K.int_shape returns the static shape as plain Python ints, so this does
    # not depend on the TensorFlow-1-only `Dimension.value` attribute
    last_dim = K.int_shape(x)[-1]
    return K.reshape(x, (-1, number_of_object_pairs, last_dim))

*sum_g_theta_outputs* sums the g_theta outputs over all object pairs of each scene+question sample.

In [67]:
def sum_g_theta_outputs(g_theta_outputs):
    '''
    Sum g_theta_outputs along axis 1 (the object-pair axis in g_model),
    which removes that axis from the result.
    '''
    summed_over_pairs = K.sum(g_theta_outputs, axis=1)
    return summed_over_pairs

## 4.3 *f_phi*

In [68]:
def f_phi(input_dim=512, F_FC1=512, F_FC2=1024, F_DROPOUT2=0.02, F_FC3=29):
    '''
    Build f_phi: Dense(relu) -> Dense(relu) -> Dropout -> Dense(softmax).

    Input  === (n, input_dim)
    Output === (n, F_FC3), softmax-activated

    F_FC1, F_FC2, F_FC3 are the units of the Dense layers;
    F_DROPOUT2 is the dropout rate applied after the second Dense layer.
    '''
    f_input = Input(shape=(input_dim,))

    # Layers in the order they are applied
    layer_stack = [
        Dense(F_FC1, activation='relu'),
        Dense(F_FC2, activation='relu'),
        Dropout(F_DROPOUT2),
        Dense(F_FC3, activation='softmax'),
    ]

    signal = f_input
    for layer in layer_stack:
        signal = layer(signal)

    return Model(inputs=[f_input], outputs=[signal])

## 4.4 Checking relation_network using example val batch

In [69]:
# Sanity check: build a relation network with its default arguments and run a
# forward pass on the example validation batch (scenes + questions).
# Only the shape of the prediction is inspected, not its values.
rn = relation_network()
b = rn.predict([scenes_input_val_example_batch, questions_input_val_example_batch])
print(b.shape)

(50, 29)


# 5 Training

In [None]:
# Make the relation network used for training, with the data-derived sizes
# and the hyper-parameters defined in the RELATION NETWORK PARAMS cell
# NOTE(review): F_FC3 is not forwarded here although F_FC1, F_FC2 and F_DROPOUT2 are;
# presumably the output width comes from answers_vocabulary_length - confirm against relation_network
rn = relation_network(max_number_of_objects_in_scene=max_number_of_objects_in_scene,
                      object_features_dim=object_features_dim,
                      max_question_length=max_question_length,
                      answers_vocabulary_length=len(answers_vocabulary),
                      WORD_EMBEDDING_DIM=WORD_EMBEDDING_DIM, LSTM_UNITS=LSTM_UNITS,
                      G_FC1=G_FC1, G_FC2=G_FC2, G_FC3=G_FC3, G_FC4=G_FC4,
                      F_FC1=F_FC1, F_FC2=F_FC2, F_DROPOUT2=F_DROPOUT2)

In [None]:
# Compile with the optimizer defined in the RELATION NETWORK PARAMS cell (Adam, lr=1e-4),
# categorical cross-entropy loss, and accuracy reported as a metric
rn.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training params
# NOTE: this overrides the batch_size = 64 set in the RELATION NETWORK PARAMS cell;
# epochs is re-assigned to the same value (100)
batch_size = 256
epochs = 100

In [None]:
# Fit on the training scenes + questions against the one-hot answers, reporting
# loss/metrics on the validation set; no callbacks and no class weighting are used.
# Note that y is passed as a one-element list while the validation targets are
# passed as a bare array.
rn.fit(x=[scenes_input_train, questions_input_train], y=[one_hot_answers_train],
       batch_size=batch_size, epochs=epochs, callbacks=None,
       validation_data=([scenes_input_val, questions_input_val], one_hot_answers_val),
       shuffle=True, class_weight=None)

# 6 Testing

In [None]:
# Predict answer probabilities for the test questions.
# NOTE(review): the scenes input here is scenes_input_train while the questions input is
# questions_input_test - this looks like a mismatch (the scenes do not belong to the test
# questions). Confirm which scenes array should be paired with the test questions.
rn_preds_test = rn.predict([scenes_input_train, questions_input_test], batch_size=batch_size, verbose=True)

In [None]:
# Write the predictions into txt file: one line per prediction, holding the
# answers_vocabulary entry at the index of the highest softmax score
with open("rn_pred_test.txt", 'w') as predictions_file:
    predictions_file.writelines(answers_vocabulary[np.argmax(scores)] + '\n'
                                for scores in rn_preds_test)