### Imports and setup

USER NOTE: If you intend to train the NLG model, make sure the Colab runtime is using a GPU. You can check this under Edit -> Notebook Settings -> Hardware accelerator.

### NLG Component

In [22]:
# Import drive with text
import functions as f

import tensorflow
import keras

from Text import *
from LSTM_class import *

from keras import layers, models, optimizers

import re
import time

#### preprocessing

In [23]:
# Root directory that the data and output paths are built from.
content_path = '.'

# Location of the training corpus, followed by its contents as returned by the
# helper module's reader.
path_train = f'{content_path}/data/train.txt'
input_train = f.read_txt(path_train)

In [24]:
# Build two training sets from the same corpus: one containing every word of
# the corpus in the order it was written, and another containing all of the
# words of the corpus in reverse order.

# max_len is the number of tokens per training sequence (the summary printed
# below reports "sequences of length 4").
# step is handed to Sequences together with max_len; presumably it is the
# offset between the starts of consecutive sequences (the sample sequences
# shown further down overlap by one token) -- confirm in Sequences.
max_len = 4
step = 3

# Same input text wrapped twice, differing only in the `reverse` flag.
text_train_forward = Text(input_train, reverse=False)
text_train_reverse = Text(input_train, reverse=True)
# Prints the total and distinct token counts of the forward corpus.
text_train_forward.tokens_info()

# One sequence set per reading direction, built with identical settings.
seq_train_forward = Sequences(text_train_forward, max_len, step)
seq_train_reverse = Sequences(text_train_reverse, max_len, step)
# Prints how many sequences of length max_len were produced.
seq_train_forward.sequences_info()

total tokens: 1428900, distinct tokens: 42415
number of sequences of length 4: 476299


In [25]:
# Keyword list checked against the corpus vocabulary below, laid out ten
# entries per row so that an entry's position can be read off the row comment.
# Every purely numeric string equals its own position in the list; these look
# like placeholders for ids that have no named class -- confirm against the
# vision model's label map.
classes = [
  # 0-9
  '0', 'person', 'bicycle', 'car', 'motorcycle',
  'airplane', 'bus', 'train', 'truck', 'boat',
  # 10-19
  'traffic light', 'fire hydrant', '12', 'stop sign', 'parking meter',
  'bench', 'bird', 'cat', 'dog', 'horse',
  # 20-29
  'sheep', 'cow', 'elephant', 'bear', 'zebra',
  'giraffe', '26', 'backpack', 'umbrella', '29',
  # 30-39
  '30', 'handbag', 'tie', 'suitcase', 'frisbee',
  'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
  # 40-49
  'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
  '45', 'wine glass', 'cup', 'fork', 'knife',
  # 50-59
  'spoon', 'bowl', 'banana', 'apple', 'sandwich',
  'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
  # 60-69
  'donut', 'cake', 'chair', 'couch', 'potted plant',
  'bed', '66', 'dining table', '68', '69',
  # 70-79
  'toilet', '71', 'tv', 'laptop', 'mouse',
  'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
  # 80-89
  'toaster', 'sink', 'refrigerator', '83', 'book',
  'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
  # 90
  'toothbrush',
]

def keyword_in_corpus(keyword, corpus=text_train_forward):
  """Report whether every space-separated part of `keyword` is a corpus token.

  Some COCO keywords are made of two words (e.g. 'traffic light'); such a
  keyword only counts as present when each of its words is a key of
  `corpus.token2ind`. Returns True or False.
  """
  # TODO: two digit numbers should also be considered as two individual digits?
  vocabulary = corpus.token2ind.keys()
  return all(part in vocabulary for part in keyword.split(' '))

def validate_corpus(corpus):
  """Split the `classes` keywords by whether `corpus` contains them.

  Returns a dict with two lists, each in the order of `classes`:
  'missing' holds the keywords that might be detected in an image by the
  vision model but are not in the vocabulary of this corpus, and 'present'
  holds the rest. Ideally, the 'missing' list should only contain the
  number 0.
  """
  found, absent = [], []
  for label in classes:
    if keyword_in_corpus(label, corpus):
      found.append(label)
    else:
      absent.append(label)
  return {'missing': absent, 'present': found}

In [26]:
attendance = validate_corpus(text_train_forward)

# Share of the keywords found in the corpus, as a percentage truncated (not
# rounded) to two decimal places.
present_count = len(attendance['present'])
attendance_pct = int(10000*(present_count/len(classes)))/100
print("corpus contains", present_count, "MSCOCO keywords, out of", len(classes), "--- this is about", attendance_pct, "% attendance")

corpus contains 70 MSCOCO keywords, out of 91 --- this is about 76.92 % attendance


In [27]:
# Keywords for which at least one word is absent from the corpus vocabulary.
print(attendance['missing'])

['motorcycle', 'airplane', 'fire hydrant', 'zebra', 'giraffe', 'backpack', 'frisbee', 'skis', 'snowboard', 'kite', 'skateboard', 'surfboard', 'broccoli', 'pizza', 'donut', 'tv', 'laptop', 'keyboard', 'microwave', 'toaster', 'teddy bear']


In [28]:
# Preview of the forward corpus: its first ten tokens, their vocabulary
# indices, and the first three training sequences (shown as the cell result).
print(text_train_forward.tokens[:10])
print(text_train_forward.tokens_ind[:10], '\n')
np.array(seq_train_forward.sequences[:3])

['I', 'entered', 'this', 'incarnation', 'on', 'March', 'the', 'twenty', '-', 'ninth']
[13025, 20355, 19524, 35998, 35687, 25002, 40649, 35065, 41940, 31725] 



array([[13025, 20355, 19524, 35998],
       [35998, 35687, 25002, 40649],
       [40649, 35065, 41940, 31725]])

The reverse sequences are not necessarily exact reverses of the forward sequences because the total number of tokens in the corpus doesn't necessarily divide evenly into 4-word subsequences, so one to three words may be left off the end.

In [29]:
# Preview of the reversed corpus: its last ten tokens (the opening words of the
# original text, backwards), their vocabulary indices, and the last three
# training sequences (shown as the cell result).
print(text_train_reverse.tokens[-10:])
print(text_train_reverse.tokens_ind[-10:], '\n')
np.array(seq_train_reverse.sequences[-3:])

['ninth', '-', 'twenty', 'the', 'March', 'on', 'incarnation', 'this', 'entered', 'I']
[31725, 41940, 35065, 40649, 25001, 35687, 35998, 19522, 20353, 13026] 



array([[ 7072,  5312, 31725, 41940],
       [41940, 35065, 40649, 25001],
       [25001, 35687, 35998, 19522]])

#### model setup

In [30]:
batch_size = 4096

# Keyword arguments shared by both data generators.
params = dict(
  sequence_length=max_len,
  vocab_size=len(text_train_forward),
  batch_size=batch_size,
  shuffle=True,
  embedding=True,
)

# One generator per reading direction, each fed that direction's sequences and
# the word following each sequence.
train_generator_forward = TextDataGenerator(
  seq_train_forward.sequences,
  seq_train_forward.next_words,
  **params,
)
train_generator_reverse = TextDataGenerator(
  seq_train_reverse.sequences,
  seq_train_reverse.next_words,
  **params,
)

In [31]:
def lstm_model(sequence_length, vocab_size, layer_size, embedding=False):
  """Assemble an uncompiled sequential LSTM model ending in a vocabulary softmax.

  With `embedding` set, the stack opens with an Embedding layer
  (vocab_size -> layer_size) feeding the LSTM, and `sequence_length` is not
  used. Otherwise the LSTM is the first layer and declares an input shape of
  (sequence_length, vocab_size). In both cases the LSTM has `layer_size`
  units and is followed by Dropout(0.3) and a Dense softmax layer with
  `vocab_size` units.
  """
  if embedding:
    recurrent_part = [
      layers.Embedding(vocab_size, layer_size),
      layers.LSTM(layer_size),
    ]
  else:
    recurrent_part = [
      layers.LSTM(layer_size, input_shape=(sequence_length, vocab_size)),
    ]
  output_part = [
    layers.Dropout(0.3),
    layers.Dense(vocab_size, activation='softmax'),
  ]

  model = models.Sequential()
  for layer in recurrent_part + output_part:
    model.add(layer)
  return model

#### model training (with embedding layer)

In [32]:
import json
import os

In [33]:
# Training hyperparameters used for both the forward and the reverse model.
# `learning_rate` is the supported argument name; the older `lr` spelling is a
# deprecated alias that newer Keras releases no longer accept.
optimizer = optimizers.RMSprop(learning_rate=0.01)
epochs = 40

In [34]:
# Forward-direction model: the embedding variant with 512-unit layers, sized to
# the forward corpus vocabulary.
model_nlg_forward = lstm_model(
  sequence_length=max_len,
  vocab_size=len(text_train_forward),
  layer_size=512,
  embedding=True,
)
model_nlg_forward.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [35]:
# Use the current time in nanoseconds as a simple version label for this run.
nanotime = str(time.time_ns())
# Keep the trailing slash: later cells append file names directly to this path.
version_path = content_path + '/out/model_' + nanotime + "/"
# makedirs also creates the parent `out` directory when it does not exist yet,
# where os.mkdir would raise FileNotFoundError on a fresh checkout.
os.makedirs(version_path, exist_ok=True)

# Store the token -> index mapping of each corpus next to the models that will
# be saved in this directory; the forward and reverse mappings are written
# separately because each Text object carries its own `token2ind`.
with open(version_path + 'word_mapping_forward.json', 'w') as word_map_file:
  json.dump(text_train_forward.token2ind, word_map_file)
with open(version_path + 'word_mapping_reverse.json', 'w') as word_map_file:
  json.dump(text_train_reverse.token2ind, word_map_file)

In [36]:
# Train the forward model on the forward generator for `epochs` epochs; each
# epoch runs len(train_generator_forward) steps. The returned History object
# is shown as the cell result.
model_nlg_forward.fit(train_generator_forward,
              steps_per_epoch=len(train_generator_forward),
              epochs=epochs,
              verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x228c0449510>

In [37]:
# Write the trained forward model into this run's version directory.
model_nlg_forward.save(version_path + 'model_nlg_forward')



INFO:tensorflow:Assets written to: ./out/model_1670852439439527500/model_nlg_forward\assets


INFO:tensorflow:Assets written to: ./out/model_1670852439439527500/model_nlg_forward\assets


In [38]:
# Reverse-direction model: same architecture as the forward model, sized to
# the reverse corpus vocabulary.
model_nlg_reverse = lstm_model(max_len, len(text_train_reverse), 512, embedding=True)
# Give this model its own optimizer instead of reusing the instance that has
# already trained the forward model: a used optimizer carries its step counter
# and per-variable state, and newer Keras optimizers refuse variables from a
# second model. Rebuilding from the config keeps the same hyperparameters.
optimizer_reverse = type(optimizer).from_config(optimizer.get_config())
model_nlg_reverse.compile(loss='categorical_crossentropy', optimizer=optimizer_reverse)

In [39]:
# Train the reverse model on the reverse generator for `epochs` epochs; each
# epoch runs len(train_generator_reverse) steps. The returned History object
# is shown as the cell result.
model_nlg_reverse.fit(train_generator_reverse,
              steps_per_epoch=len(train_generator_reverse),
              epochs=epochs,
              verbose=1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x228a7d5f430>

In [40]:
# Write the trained reverse model into this run's version directory.
model_nlg_reverse.save(version_path + 'model_nlg_reverse')



INFO:tensorflow:Assets written to: ./out/model_1670852439439527500/model_nlg_reverse\assets


INFO:tensorflow:Assets written to: ./out/model_1670852439439527500/model_nlg_reverse\assets


In [41]:
# Print the devices TensorFlow can see, to confirm a GPU is available (the
# recorded output lists one CPU and one GPU device).
# NOTE(review): this check runs after both models have been trained; it would
# be more useful near the top of the notebook, next to the GPU note.
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12139450098239292644
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6277824512
locality {
  bus_id: 1
  links {
  }
}
incarnation: 4768076950934225043
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2070 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]
