
# Description of the procedure that we have followed

We started from a simple model that used VGG as the convolutional image-processing network and a stack of LSTM layers for the text-processing part; we then merged their outputs with a Concatenate layer and fed the result to a series of dense layers to perform the classification. However, this model did not reach the performance we wanted, so we decided to change it. 

We replaced the RNN part with BERT, which gave better performance. We also noticed that the dataset was heavily imbalanced, since roughly half of the samples have "yes" or "no" as their answer, so we completely changed our approach to the problem. As shown in this notebook, we trained a meta-learner that classifies the answer type into two classes based on the input question (class 1 if the answer is expected to be "yes" or "no", class 0 otherwise). The benefit of this approach is that it automatically balances the dataset. We then trained two different models (see the other two notebooks), one for each side of this split. 

In particular, in these models we kept the BERT+VGG architecture but changed the way the two branches interact, using the following paper as a reference: [https://arxiv.org/abs/1606.01455](https://arxiv.org/abs/1606.01455) (the diagram of the model's structure is on page 2). At the end of the other two notebooks we saved the weights of the models so that we could load them in this one (we had a problem with BERT that prevented us from using `model.save`, so we had to save and reload the weights instead) and make the predictions on the test set based on the outcome of the meta-learner. With this approach we reached an accuracy of 0.54. We then tried GloVe instead of BERT, but the performance did not improve, so we kept BERT.



In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import json

from tensorflow.keras.preprocessing.image import ImageDataGenerator


# --- Reproducibility: seed both NumPy and TensorFlow with the same value ---
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Image size, batch size and number of answer classes ---
img_w = img_h = 256
bs = 32
num_classes = 58

# --- Dataset locations (Kaggle input directory layout) ---
dataset_dir = '../input/anndl-2020-vqa/VQA_Dataset'
images_path = '../input/anndl-2020-vqa/VQA_Dataset/Images'


In [2]:
# Module-level alias of the image directory; CustomDatasetTest reads this
# global when it opens image files.
path = images_path

# Data preprocessing

The annotated data are stored in `train_questions_annotations.json`, which has the following structure:

>{
>  '1': 
>  {'question': ..., 
>   'image_id': ...,
>   'answer': ...},
>  '2':
>  {'question': ..., 
>   'image_id': ...,
>   'answer': ...},
>  ...
>}


All the possible answers are listed below

In [3]:
# Maps every possible answer string to its integer class id.
# The id of an answer is simply its position in the ordered vocabulary below.
labels_dict = {
    answer: class_id
    for class_id, answer in enumerate((
        '0', '1', '2', '3', '4', '5',
        'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket',
        'blue', 'bone', 'book', 'boy', 'brown',
        'cat', 'chair', 'couch',
        'dog',
        'floor', 'food', 'football',
        'girl', 'grass', 'gray', 'green',
        'left', 'log',
        'man', 'monkey bars',
        'no', 'nothing',
        'orange',
        'pie', 'plant', 'playing',
        'red', 'right', 'rug',
        'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel', 'standing',
        'stool', 'sunny',
        'table', 'tree',
        'watermelon', 'white', 'wine', 'woman',
        'yellow', 'yes',
    ))
}

Retrieve the dataset as dataframe

In [4]:
# Read the raw annotations: a JSON object keyed by sample id.
annotations_path = os.path.join(dataset_dir, "train_questions_annotations.json")
with open(annotations_path) as f:
    dic = json.load(f)

# One row per (id, annotation dict) pair, with readable column names.
dataframe = pd.DataFrame(dic.items()).rename(columns={0: 'data_id', 1: 'data'})

dataframe.head()

Unnamed: 0,data_id,data
0,117792,"{'question': 'Who looks happier?', 'image_id':..."
1,117790,"{'question': 'Where is the woman sitting?', 'i..."
2,117791,"{'question': 'Where is the man sitting?', 'ima..."
3,55360,"{'question': 'Is this man hungry?', 'image_id'..."
4,169490,"{'question': 'Who is holding the football?', '..."


In [5]:
# Expand the per-sample annotation dicts into columns and rename them.
data_ = pd.json_normalize(dataframe['data']).rename(
    columns={'image_id': 'filename', 'answer': 'class'}
)

# Image ids become file names; answer strings become integer class ids.
data_['filename'] = data_['filename'].map(lambda image_id: str(image_id) + '.png')
data_['class'] = data_['class'].map(lambda answer: labels_dict[answer])
data_.head()

def createDict():
    """Return the mapping from the 58 answer class ids to the binary answer type.

    Ids 33 and 57 map to 1 (yes/no answers); every other id in 0..57
    maps to 0 (other answers).
    """
    yes_no_ids = (33, 57)
    return {class_id: int(class_id in yes_no_ids) for class_id in range(58)}

# Question-only view of the data for the meta-learner: the image file name is
# not needed, and the 58-way class id is collapsed to the binary answer type.
data_light = data_.drop(columns=["filename"])

# Assign the replaced column back instead of calling
# `data_light["class"].replace(..., inplace=True)`: in-place mutation through
# a column selection is chained assignment and does not update the frame
# when pandas Copy-on-Write is active.
data_light["class"] = data_light["class"].replace(createDict())

data_light.head()

Unnamed: 0,question,class
0,Who looks happier?,0
1,Where is the woman sitting?,0
2,Where is the man sitting?,0
3,Is this man hungry?,1
4,Who is holding the football?,0


In [6]:
# Random split: each row goes to the training set with probability 0.8,
# otherwise to the validation set.
msk = np.random.rand(len(data_light)) < 0.8
train_df, validation_df = data_light[msk], data_light[~msk]

In [7]:
import re
# Count words and punctuation marks in every question.
# NOTE(review): this regex only approximates the BERT word-piece tokenizer;
# a question split into more pieces than counted here would exceed SEQ_LEN.
seqlen = data_light['question'].map(
    lambda text: len(re.findall(r'[^\w\s]|\w+', text))
)
# Longest question plus two slots for the special start/end tokens.
SEQ_LEN = max(seqlen.tolist()) + 2

In [8]:
from transformers import AutoTokenizer
# Tokenizer matching the 'bert-base-uncased' checkpoint used by the models below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # everything to lower case

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Custom dataset

In [9]:

from PIL import Image
from tensorflow.keras.utils import to_categorical


class CustomDataset(tf.keras.utils.Sequence):
    """Keras Sequence yielding batches of tokenized questions with one-hot labels.

    Args:
        max_len_question: length every tokenized question is padded to.
        num_classes: width of the one-hot label vectors.
        tokenizer: object exposing `encode_plus` (a Hugging Face tokenizer).
        question: sequence of question strings.
        answer: optional sequence of integer class ids; when omitted every
            sample gets label 0 (used at prediction time).
        bs: batch size.
    """

    def __init__(self, max_len_question, num_classes, tokenizer, question, answer=None, bs=1):
        self.max_len = max_len_question
        self.num_classes = num_classes
        self.tokenizer = tokenizer
        self.question = np.array(question)
        # `is None` instead of `== None`: equality on a list/array argument
        # is not a reliable identity test.
        if answer is None:
            answer = [0] * len(question)
        self.labels = np.array(answer)
        self.bs = bs

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.question) // self.bs

    def __getitem__(self, index):
        """Return batch number `index` as `({'input_ids', 'attention_mask'}, one_hot_labels)`.

        Raises:
            IndexError: if `index` is not a valid batch number.
        """
        if not 0 <= index < len(self):
            raise IndexError("batch index %d out of range" % index)

        # `index` is a batch number: convert it to a sample offset. Using it
        # directly as the offset made consecutive batches overlap and left
        # most samples unvisited whenever bs > 1.
        start = index * self.bs
        stop = start + self.bs

        # one hot encoding of the answer
        batch_labels = self.labels[start:stop]
        ys = np.zeros((batch_labels.size, self.num_classes))
        ys[np.arange(batch_labels.size), batch_labels] = 1

        Xids = np.zeros((self.bs, self.max_len), dtype='int32')
        Xmasks = np.zeros((self.bs, self.max_len), dtype='int32')

        for row, question in enumerate(self.question[start:stop]):
            # truncation=True keeps every encoding at exactly max_len tokens,
            # so a question longer than expected cannot break the assignment
            # into the fixed-width arrays below.
            tokens = self.tokenizer.encode_plus(question, max_length=self.max_len,
                                                truncation=True, padding="max_length",
                                                add_special_tokens=True, return_token_type_ids=False,
                                                return_attention_mask=True, return_tensors='tf')

            Xids[row, :], Xmasks[row, :] = tokens['input_ids'], tokens['attention_mask']

        xs = {'input_ids': Xids, 'attention_mask': Xmasks}

        return xs, ys

In [10]:
class CustomDatasetTest(tf.keras.utils.Sequence):
    """Keras Sequence yielding batches of (tokenized question, image) with one-hot labels.

    Images are opened from the module-level `path` directory.

    Args:
        max_len_question: length every tokenized question is padded to.
        num_classes: width of the one-hot label vectors.
        tokenizer: object exposing `encode_plus` (a Hugging Face tokenizer).
        filename: sequence of image file names, aligned with `question`.
        question: sequence of question strings.
        answer: optional sequence of integer class ids; when omitted every
            sample gets label 0 (used at prediction time).
        bs: batch size.
        preprocessing_function: optional callable applied to each image array.
        out_shape: size the images are resized to.
    """

    def __init__(self, max_len_question, num_classes, tokenizer, filename, question, answer=None, bs=1, preprocessing_function=None, out_shape=[256, 256]):
        self.max_len = max_len_question
        self.num_classes = num_classes
        self.tokenizer = tokenizer
        self.subset_filenames = filename
        self.question = np.array(question)
        # `is None` instead of `== None`: equality on a list/array argument
        # is not a reliable identity test.
        if answer is None:
            answer = [0] * len(filename)
        self.labels = np.array(answer)
        self.out_shape = out_shape
        self.preprocessing_function = preprocessing_function
        self.bs = bs

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.subset_filenames) // self.bs

    def _load_image(self, curr_filename):
        """Open, resize and convert one image to a 3-channel array."""
        # The context manager closes the underlying file handle; resize()
        # returns a new, fully loaded image that outlives it.
        with Image.open(os.path.join(path, curr_filename)) as raw_img:
            img = raw_img.resize(tuple(self.out_shape))

        # Keep exactly the three colour channels. For RGBA input this drops
        # the alpha channel, like the previous `[:, :, :-1]` slice, but an
        # image that is already RGB no longer loses its blue channel.
        # NOTE(review): presumably the dataset PNGs are RGBA — confirm.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        img_arr = np.asarray(img)
        if self.preprocessing_function is not None:
            img_arr = self.preprocessing_function(img_arr)
        return img_arr

    def __getitem__(self, index):
        """Return batch number `index` as `({'input_ids', 'attention_mask', 'images'}, one_hot_labels)`.

        Raises:
            IndexError: if `index` is not a valid batch number.
        """
        if not 0 <= index < len(self):
            raise IndexError("batch index %d out of range" % index)

        # `index` is a batch number: convert it to a sample offset. Using it
        # directly as the offset made consecutive batches overlap whenever
        # bs > 1.
        start = index * self.bs
        stop = start + self.bs

        # one hot encoding of the answer
        batch_labels = self.labels[start:stop]
        ys = np.zeros((batch_labels.size, self.num_classes))
        ys[np.arange(batch_labels.size), batch_labels] = 1

        imgs = []
        Xids = np.zeros((self.bs, self.max_len), dtype='int32')
        Xmasks = np.zeros((self.bs, self.max_len), dtype='int32')

        for row, sample in enumerate(range(start, stop)):
            imgs.append(self._load_image(self.subset_filenames[sample]))

            # truncation=True keeps every encoding at exactly max_len tokens,
            # so a question longer than expected cannot break the assignment
            # into the fixed-width arrays below.
            tokens = self.tokenizer.encode_plus(self.question[sample], max_length=self.max_len,
                                                truncation=True, padding="max_length",
                                                add_special_tokens=True, return_token_type_ids=False,
                                                return_attention_mask=True, return_tensors='tf')

            Xids[row, :], Xmasks[row, :] = tokens['input_ids'], tokens['attention_mask']

        xs = {
            'input_ids': Xids,
            'attention_mask': Xmasks,
            'images': np.array(imgs),
        }

        return xs, ys

# Model structure

In [11]:
from tensorflow.keras.applications.vgg16 import VGG16
from keras.models import Sequential
from tensorflow.keras.layers import Flatten 
from keras.models import Model

In [12]:
from transformers import TFAutoModel

# Pretrained BERT encoder used as the text backbone of the meta-learner.
bert = TFAutoModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:

# Meta-learner: predicts from the question alone whether the answer is
# yes/no (class 1) or something else (class 0).
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,),
                                  name='input_ids',
                                  dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,),
                             name='attention_mask',
                             dtype='int32')

# First element of the BERT output: the per-token embeddings.
embeddings = bert(input_ids, attention_mask=mask)[0]

maxpool = tf.keras.layers.GlobalMaxPool1D()(embeddings)
dense = tf.keras.layers.Dense(256, activation="relu")(maxpool)
# Softmax (not sigmoid): the labels are one-hot over two mutually exclusive
# classes and the model is trained with categorical cross-entropy, so the
# outputs must form a probability distribution.
y = tf.keras.layers.Dense(2, activation="softmax")(dense)

model_question = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# freeze bert (by reference rather than by positional layer index, which
# silently targets another layer if the graph changes)
bert.trainable = False
model_question.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 26)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 26)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 768)          0           tf_bert_model[0][0]          

# Model training

In [15]:
# Early Stopping
# --------------
# Stop after 5 epochs without validation-loss improvement and roll back to
# the best weights.
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

# Lower the learning rate after 3 epochs without validation-loss improvement.
reduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=3)

callbacks = [es_callback, reduceLR]
metrics = ['accuracy']

In [16]:
from tensorflow.keras.optimizers import Adam
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
opt = Adam(learning_rate=3e-5)
model_question.compile(loss='categorical_crossentropy', optimizer=opt, metrics=metrics)

In [17]:
from tensorflow.keras.applications.vgg16 import preprocess_input



# Training batches for the meta-learner (binary answer-type labels).
dataset = CustomDataset(
    max_len_question=SEQ_LEN,
    num_classes=2,
    tokenizer=tokenizer,
    question=train_df['question'].tolist(),
    answer=train_df['class'].tolist(),
    bs=bs,
)

# Validation batches, built the same way from the held-out rows.
dataset_valid = CustomDataset(
    max_len_question=SEQ_LEN,
    num_classes=2,
    tokenizer=tokenizer,
    question=validation_df['question'].tolist(),
    answer=validation_df['class'].tolist(),
    bs=bs,
)


# Model to predict yes/no

In [18]:
from tensorflow.keras.applications.vgg16 import VGG16
from keras.models import Sequential
from tensorflow.keras.layers import Flatten 

# Image branch of the yes/no model: a frozen VGG16 without its classifier
# head, globally average-pooled.
input_images_first = tf.keras.layers.Input(shape=[img_w, img_h, 3],
                                           name='images',
                                           dtype='int32')
vgg_first = tf.keras.applications.VGG16(include_top=False, pooling='avg')
vgg_first.trainable = False

model_image_first = Sequential([input_images_first, vgg_first])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [19]:
from transformers import TFAutoModel

# Question branch of the yes/no model: BERT token embeddings, max-pooled
# over the sequence axis.
bert_first = TFAutoModel.from_pretrained('bert-base-uncased')

input_ids_first = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype='int32', name='input_ids')
mask_first = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype='int32', name='attention_mask')

embeddings_first = bert_first(input_ids_first, attention_mask=mask_first)[0]
y_first = tf.keras.layers.GlobalMaxPool1D()(embeddings_first)

model_question_first = tf.keras.Model(
    inputs=[input_ids_first, mask_first],
    outputs=y_first,
)

#freeze bert (it is the third layer of the model, after the two inputs)
bert_first.trainable = False

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [20]:
from tensorflow.keras.layers import Dense, Concatenate, Dropout, Multiply, Add

# Width of the fusion layers and dropout rates of the yes/no model.
units = 128
input_dropout = 0.3
dropout = 0.65

def create_visual_block(vgg_output, num_units, dropout):
    """Project the image features through four Dense+Dropout layers.

    The features are flattened and passed through an input dropout (rate
    taken from the module-level `input_dropout`), then through four Dense
    layers of `num_units` units, alternating no activation and tanh, each
    followed by dropout.

    Args:
        vgg_output: tensor produced by the image branch.
        num_units: number of units of every Dense layer.
        dropout: dropout rate applied after every Dense layer.

    Returns:
        The output tensor of the last Dropout layer.
    """
    visual_block = Flatten()(vgg_output)
    visual_block = Dropout(input_dropout)(visual_block)
    # Every Dense layer honours `num_units`; the first one previously read
    # the module-level `units`, silently ignoring the argument.
    for activation in (None, "tanh", None, "tanh"):
        visual_block = Dense(num_units, activation=activation)(visual_block)
        visual_block = Dropout(dropout)(visual_block)
    return visual_block

def create_question_block(question_block, num_units, dropout): #should be already flattened
    """Project the question features through two Dense+Dropout layers.

    Applies the module-level `input_dropout` first, then a linear Dense
    layer and a tanh Dense layer of `num_units` units, each followed by
    dropout with rate `dropout`.
    """
    features = Dropout(input_dropout)(question_block)
    for activation in (None, "tanh"):
        features = Dense(num_units, activation=activation)(features)
        features = Dropout(dropout)(features)
    return features

# Fusion network of the yes/no model: three stacked stages. In each stage the
# question features and the image features go through their own blocks and
# are multiplied element-wise; the product is then added to a Dense
# projection of the stage input (`residual_first`, rebound at every stage).
# NOTE: layers are created in this exact order and the weights are loaded
# from a file later in the notebook, so the statement order is left untouched.

# Stage 1: input is the flattened output of the question branch.
initial_question_output_first = Flatten()(model_question_first.output)
q1_first = create_question_block(initial_question_output_first, units, dropout)
v1_first = create_visual_block(model_image_first.output, units, dropout)
q1_first = Multiply()([q1_first, v1_first])
residual_first = Dense(units)(initial_question_output_first)
H1_first = Add()([residual_first, q1_first])

# Stage 2: input is the output of stage 1.
q2_first = create_question_block(H1_first, units, dropout)
v2_first = create_visual_block(model_image_first.output, units, dropout)
q2_first = Multiply()([q2_first, v2_first])
residual_first = Dense(units)(H1_first)
H2_first = Add()([residual_first, q2_first])

# Stage 3: input is the output of stage 2.
q3_first = create_question_block(H2_first, units, dropout)
v3_first = create_visual_block(model_image_first.output, units, dropout)
q3_first = Multiply()([q3_first, v3_first])
residual_first = Dense(units)(H2_first)
H3_first = Add()([residual_first, q3_first])

# Classification head: tanh Dense layer followed by a softmax over all classes.
mergedOut_first = Dense(units, activation="tanh")(H3_first)
mergedOut_first = Dense(num_classes, activation='softmax')(mergedOut_first)

In [21]:
# Full yes/no model: inputs are the two question tensors followed by the image.
model_first = Model(
    inputs=[*model_question_first.input, model_image_first.input],
    outputs=mergedOut_first,
)

model_first.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
images (InputLayer)             [(None, 256, 256, 3) 0                                            
__________________________________________________________________________________________________
vgg16 (Functional)              (None, 512)          14714688    images[0][0]                     
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 512)          0           vgg16[0][0]                      
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 26)]         0                                            
____________________________________________________________________________________________

In [24]:
# Restore the weights of the yes/no model; per the introduction they were
# saved at the end of a separate training notebook.
model_first.load_weights('../input/model-binary-weights/model_binary.h5')

# Model for other labels

In [25]:
# Image branch of the "other answers" model: a frozen VGG16 without its
# classifier head, globally average-pooled.
input_images_second = tf.keras.layers.Input(shape=[img_w, img_h, 3],
                                            name='images',
                                            dtype='int32')
vgg_second = tf.keras.applications.VGG16(include_top=False, pooling='avg')
vgg_second.trainable = False

model_image_second = Sequential([input_images_second, vgg_second])

In [26]:
# Question branch of the "other answers" model: BERT token embeddings,
# max-pooled over the sequence axis.
bert_second = TFAutoModel.from_pretrained('bert-base-uncased')

input_ids_second = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype='int32', name='input_ids')
mask_second = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype='int32', name='attention_mask')

embeddings_second = bert_second(input_ids_second, attention_mask=mask_second)[0]
y_second = tf.keras.layers.GlobalMaxPool1D()(embeddings_second)

model_question_second = tf.keras.Model(
    inputs=[input_ids_second, mask_second],
    outputs=y_second,
)

#freeze bert (it is the third layer of the model, after the two inputs)
bert_second.trainable = False

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [27]:
from tensorflow.keras.layers import Dense, Concatenate, Dropout, Multiply, Add

# Width of the fusion layers and dropout rate of the "other answers" model.
units = 128
dropout = 0.5

def create_visual_block_second(vgg_output, num_units, dropout):
    """Project the image features through four Dense+Dropout layers.

    The flattened features go through four Dense layers of `num_units`
    units, alternating no activation and tanh, each followed by dropout
    with rate `dropout`.
    """
    features = Flatten()(vgg_output)
    for activation in (None, "tanh", None, "tanh"):
        features = Dense(num_units, activation=activation)(features)
        features = Dropout(dropout)(features)
    return features

def create_question_block_second(question_block, num_units, dropout): #should be already flattened
    """Project the question features through two Dense+Dropout layers.

    A linear Dense layer and a tanh Dense layer of `num_units` units, each
    followed by dropout with rate `dropout`.
    """
    features = question_block
    for activation in (None, "tanh"):
        features = Dense(num_units, activation=activation)(features)
        features = Dropout(dropout)(features)
    return features

# Fusion network of the "other answers" model: three stacked stages. In each
# stage the question features and the image features go through their own
# blocks and are multiplied element-wise; the product is then added to a
# Dense projection of the stage input (`residual_second`, rebound at every
# stage).
# NOTE: layers are created in this exact order and the weights are loaded
# from a file later in the notebook, so the statement order is left untouched.

# Stage 1: input is the flattened output of the question branch.
initial_question_output_second = Flatten()(model_question_second.output)
q1_second = create_question_block_second(initial_question_output_second, units, dropout)
v1_second = create_visual_block_second(model_image_second.output, units, dropout)
q1_second = Multiply()([q1_second, v1_second])
residual_second = Dense(units)(initial_question_output_second)
H1_second = Add()([residual_second, q1_second])

# Stage 2: input is the output of stage 1.
q2_second = create_question_block_second(H1_second, units, dropout)
v2_second = create_visual_block_second(model_image_second.output, units, dropout)
q2_second = Multiply()([q2_second, v2_second])
residual_second = Dense(units)(H1_second)
H2_second = Add()([residual_second, q2_second])

# Stage 3: input is the output of stage 2.
q3_second = create_question_block_second(H2_second, units, dropout)
v3_second = create_visual_block_second(model_image_second.output, units, dropout)
q3_second = Multiply()([q3_second, v3_second])
residual_second = Dense(units)(H2_second)
H3_second = Add()([residual_second, q3_second])

# Classification head: a Dense layer with no activation followed by a
# softmax over all classes.
mergedOut_second = Dense(units)(H3_second)
mergedOut_second = Dense(num_classes, activation='softmax')(mergedOut_second)

In [28]:
# Full "other answers" model: inputs are the two question tensors followed
# by the image.
model_second = Model(
    inputs=[*model_question_second.input, model_image_second.input],
    outputs=mergedOut_second,
)

model_second.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
images (InputLayer)             [(None, 256, 256, 3) 0                                            
__________________________________________________________________________________________________
vgg16 (Functional)              (None, 512)          14714688    images[0][0]                     
__________________________________________________________________________________________________
flatten_5 (Flatten)             (None, 512)          0           vgg16[0][0]                      
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 26)]         0                                            
____________________________________________________________________________________________

In [30]:
# Restore the weights of the "other answers" model; per the introduction they
# were saved at the end of a separate training notebook.
model_second.load_weights('../input/model-multiclass-weights/model_multiclass.h5')

# Training the meta-learner

In [31]:
# One pass over each Sequence per epoch.
train_steps = len(dataset)
val_steps = len(dataset_valid)

# `Model.fit` accepts `keras.utils.Sequence` objects directly; the
# `fit_generator` alias is deprecated. The early-stopping and
# learning-rate callbacks prepared in the "Model training" section are
# passed here, otherwise they are never used.
model_question.fit(dataset,
                   validation_data=dataset_valid,
                   steps_per_epoch=train_steps,
                   validation_steps=val_steps,
                   epochs=3,
                   callbacks=callbacks,
                   verbose=1)



Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fb7cc042490>

# Prediction

In [32]:
# Read the test questions: a JSON object keyed by sample id.
test_questions_path = os.path.join(dataset_dir, "test_questions.json")
with open(test_questions_path) as f:
    dic = json.load(f)

# One row per (id, question dict) pair, with readable column names.
dataframe_test = pd.DataFrame(dic.items()).rename(columns={0: 'data_id', 1: 'data'})

dataframe_test.head()

Unnamed: 0,data_id,data
0,169491,"{'question': 'How is the weather?', 'image_id'..."
1,33711,"{'question': 'What is the woman holding?', 'im..."
2,100051,"{'question': 'How many pillows?', 'image_id': ..."
3,15271,"{'question': 'Is the man falling?', 'image_id'..."
4,13291,"{'question': 'How many logs on the grass?', 'i..."


In [33]:
# Expand the per-sample dicts into columns; image ids become file names.
data_test = pd.json_normalize(dataframe_test['data']).rename(
    columns={'image_id': 'filename'}
)
data_test['filename'] = data_test['filename'].map(lambda image_id: str(image_id) + '.png')
data_test.head()

Unnamed: 0,question,filename
0,How is the weather?,16949.png
1,What is the woman holding?,3371.png
2,How many pillows?,10005.png
3,Is the man falling?,1527.png
4,How many logs on the grass?,1329.png


In [34]:
# Question-only copy of the test data (the file name column is removed).
data_test_light = data_test.drop(columns=["filename"])

data_test_light.head()

Unnamed: 0,question
0,How is the weather?
1,What is the woman holding?
2,How many pillows?
3,Is the man falling?
4,How many logs on the grass?


In [35]:
# Raw question strings of the test set; despite the name they are not encoded
# yet — tokenization happens inside the dataset classes.
test_encoded_question = data_test['question'].tolist()

In [36]:
# Question-only batches (one sample each) for the meta-learner.
test_generator_light = CustomDataset(
    max_len_question=SEQ_LEN,
    num_classes=2,
    tokenizer=tokenizer,
    question=test_encoded_question,
    bs=1,
)

# Question + image batches (one sample each) for the two answer models.
test_generator = CustomDatasetTest(
    max_len_question=SEQ_LEN,
    num_classes=num_classes,
    tokenizer=tokenizer,
    filename=data_test['filename'].tolist(),
    question=test_encoded_question,
    bs=1,
)


In [37]:
import datetime 
nb_samples = len(data_test['filename'].tolist())
# Answer-type scores for every test question. `Model.predict` accepts a
# `keras.utils.Sequence` directly; the `predict_generator` alias is deprecated.
predict = model_question.predict(test_generator_light, steps=len(test_generator_light))



In [38]:
# Map every test sample position to its predicted answer type
# (1 = yes/no, 0 = other), i.e. the arg-max of the meta-learner scores.
array_type_answer = {
    position: scores.argmax(axis=-1)
    for position, scores in enumerate(predict)
}
i = len(array_type_answer)

# Submission

In [39]:
import os
from datetime import datetime

def create_csv(array_type_answer, model_first, model_second, test_generator, data_id_list, results_dir='./'):
    """Predict every test sample with the model chosen by the meta-learner and write a CSV.

    The file is named `results_<timestamp>.csv`, is created inside
    `results_dir`, and contains an `Id,Category` header followed by one
    line per sample.

    Args:
        array_type_answer: dict whose values are the predicted answer types,
            in sample order (1 -> `model_first`, 0 -> `model_second`).
        model_first: model used for samples of answer type 1.
        model_second: model used for samples of answer type 0.
        test_generator: indexable object; `test_generator[i][0]` is the
            model input of the i-th sample.
        data_id_list: sample ids, aligned with `test_generator`.
        results_dir: directory the CSV is written to.

    Raises:
        ValueError: if an answer type is neither 0 nor 1.
    """
    csv_fname = 'results_' + datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    # Materialise the ids so that the lookup is positional for both a plain
    # list and a pandas Series.
    data_ids = list(data_id_list)

    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')
        for i, value in enumerate(array_type_answer.values()):
            if value == 1:
                model = model_first
            elif value == 0:
                model = model_second
            else:
                # Previously an unexpected value reused the prediction of
                # the preceding sample (or hit an unbound variable).
                raise ValueError('unexpected answer type %r for sample %d' % (value, i))

            # The input is a dict of arrays, not a generator, so it is fed
            # to `predict` rather than the deprecated `predict_generator`.
            prediction = model.predict(test_generator[i][0])
            to_write = prediction.argmax(axis=-1)[0]
            f.write(str(data_ids[i]) + ',' + str(to_write) + '\n')

In [40]:
# Write the submission file into the Kaggle working directory, one line per
# test sample id.
data_id_list = dataframe_test['data_id']
create_csv(
    array_type_answer,
    model_first,
    model_second,
    test_generator,
    data_id_list,
    results_dir='/kaggle/working/',
)