# Boilerplate

In [1]:
# files
TRAINING_DIRECTORY = 'cnn/stories/'
EXTENSION = '.story'
MAX_FILES = 1

# tokenization
FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'  # default minus >, <
TARGET_BEGIN_CHAR = '<target-begin>'
END_CHAR = '<end>'
OOV_CHAR = '<unk>'
TARGET_BEGIN_TOKEN = 1
END_TOKEN = 2
OOV_TOKEN = 3
NUM_WORDS = 600

# MODEL_PARAMS
MAX_SEQUENCE_LEN = 100

# Read in files

In [2]:
import glob

In [3]:
FILES = glob.glob('%s/*%s' % (TRAINING_DIRECTORY, EXTENSION))
print(len(FILES))
FILES

92579


['cnn/stories/0001d1afc246a7964130f43ae940af6bc6c57f01.story',
 'cnn/stories/0002095e55fcbd3a2f366d9bf92a95433dc305ef.story',
 'cnn/stories/00027e965c8264c35cc1bc55556db388da82b07f.story',
 'cnn/stories/0002c17436637c4fe1837c935c04de47adb18e9a.story',
 'cnn/stories/0003ad6ef0c37534f80b55b4235108024b407f0b.story',
 'cnn/stories/0004306354494f090ee2d7bc5ddbf80b63e80de6.story',
 'cnn/stories/0005d61497d21ff37a17751829bd7e3b6e4a7c5c.story',
 'cnn/stories/0006021f772fad0aa78a977ce4a31b3faa6e6fe5.story',
 'cnn/stories/00083697263e215e5e7eda753070f08aa374dd45.story',
 'cnn/stories/000940f2bb357ac04a236a232156d8b9b18d1667.story',
 'cnn/stories/0009ebb1967511741629926ef9f5faea2bb6be24.story',
 'cnn/stories/000c835555db62e319854d9f8912061cdca1893e.story',
 'cnn/stories/000ca3fc9d877f8d4bb2ebd1d6858c69be571fd8.story',
 'cnn/stories/000cd1ee0098c4d510a03ddc97d11764448ebac2.story',
 'cnn/stories/000e009f6b1d954d827c9a550f3f24a5474ee82b.story',
 'cnn/stories/001097a19e2c96de11276b3cce11566ccfed0030.

In [4]:
FILES = FILES[:MAX_FILES]

# Define method for generating text from files

In [5]:
def preprocessor(text):
    table = {ord(c): None for c in '<>'}
    text = text.translate(table)
    return text

In [6]:
def text_generator(files, preprocessor=None):
    for f in files:
        text = open(f).read()
        if preprocessor is not None:
            text = preprocessor(text)
        # remove highlights
        body, highlight1, *_ = text.split('@highlight')
        yield body, highlight1

In [7]:
def tokenize(input_text, target_text, tokenizer, maxlen, target_begin_token, end_token):
    input_tokens = tokenizer([input_text])[0]
    target_tokens = tokenizer([target_text])[0]
    input_tokens = input_tokens[:maxlen-len(target_tokens)-2]
    return [input_tokens + [target_begin_token] + target_tokens + [end_token]]

In [8]:
next(text_generator(FILES, preprocessor=preprocessor))

('It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria.\n\nObama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.\n\nThe proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction."\n\nIt\'s a step that is set to turn an international crisis into a fierce domestic political battle.\n\nThere are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react?\n\nIn a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but

# Initialize tokenizer

In [9]:
from keras.preprocessing.text import text_to_word_sequence, Tokenizer as _Tokenizer

class Tokenizer(_Tokenizer):
    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.
        In the case where texts contains lists, we assume each entry of the lists
        to be a token.
        Required before using `texts_to_sequences` or `texts_to_matrix`.
        # Arguments
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of list of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                seq = text
            else:
                seq = text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                if w in self.word_docs:
                    self.word_docs[w] += 1
                else:
                    self.word_docs[w] = 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]
        # note that index 0, 1, 2 is reserved, never assigned to an existing word
        self.word_index = dict(list(zip(sorted_voc, list(range(4, len(sorted_voc) + 4)))))
        self.word_index[self.oov_token] = 3

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c

    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` in a sequence of integers.
        Each item in texts can also be a list, in which case we assume each item of that list
        to be a token.
        Only top "num_words" most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.
        # Arguments
            texts: A list of texts (strings).
        # Yields
            Yields individual sequences.
        """
        num_words = self.num_words
        for text in texts:
            if self.char_level or isinstance(text, list):
                seq = text
            else:
                seq = text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None and (self.num_words and i < self.num_words):
                    vect.append(i)
                elif self.oov_token is not None:
                    i = self.word_index.get(self.oov_token)
                    if i is not None:
                        vect.append(i)
            yield vect

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
TOKENIZER = Tokenizer(
    num_words=NUM_WORDS,
    filters=FILTERS,  # no newline
    oov_token=OOV_CHAR)

In [11]:
gen = text_generator(FILES, preprocessor=preprocessor)

In [12]:
%%time
TOKENIZER.fit_on_texts(text for train_pair in gen for text in train_pair)

CPU times: user 2.51 ms, sys: 705 µs, total: 3.22 ms
Wall time: 3.5 ms


In [13]:
TOKENIZER.num_words

600

In [14]:
TOKENIZER.document_count

2

In [15]:
len(TOKENIZER.word_index), TOKENIZER.word_index

(604,
 {"'disaster'": 601,
  "'this": 336,
  '000': 588,
  '1': 209,
  '160': 469,
  '21': 154,
  '350': 587,
  '400': 595,
  '429': 594,
  '5': 418,
  '63': 472,
  '9': 260,
  '<unk>': 3,
  'a': 6,
  'able': 326,
  'according': 554,
  'accountable': 412,
  'act': 389,
  'action': 23,
  'actions': 246,
  'address': 135,
  'administration': 387,
  'advisers': 338,
  'affairs': 501,
  'after': 72,
  'against': 54,
  'aggression': 560,
  'agreed': 144,
  'aim': 280,
  'al': 116,
  'alexei': 498,
  'all': 202,
  'alleged': 129,
  'allied': 381,
  'allies': 173,
  'alternative': 331,
  'ambassador': 577,
  'americans': 510,
  'amid': 343,
  'among': 194,
  'an': 28,
  'analyst': 526,
  'analyze': 165,
  'anchor': 558,
  'and': 9,
  'angeles': 521,
  'announcement': 428,
  'announcing': 214,
  'anti': 505,
  'any': 117,
  'appeared': 201,
  'applauded': 442,
  'approve': 219,
  'arabs': 567,
  'are': 42,
  'argued': 371,
  "army's": 538,
  'around': 180,
  'as': 143,
  'ascertain': 286,
  'a

In [16]:
index_to_word = {v: k for k, v in TOKENIZER.word_index.items()}
index_to_word[0] = '<pad>'
index_to_word[TARGET_BEGIN_TOKEN] = TARGET_BEGIN_CHAR
index_to_word[END_TOKEN] = END_CHAR

In [17]:
sorted(index_to_word.items(), key=lambda x: x[0])

[(0, '<pad>'),
 (1, '<target-begin>'),
 (2, '<end>'),
 (3, '<unk>'),
 (4, 'the'),
 (5, 'to'),
 (6, 'a'),
 (7, 'in'),
 (8, 'of'),
 (9, 'and'),
 (10, 'said'),
 (11, 'on'),
 (12, 'u'),
 (13, 'syria'),
 (14, 'that'),
 (15, 'obama'),
 (16, 'military'),
 (17, 'is'),
 (18, 'syrian'),
 (19, 'for'),
 (20, 'weapons'),
 (21, 's'),
 (22, 'he'),
 (23, 'action'),
 (24, "obama's"),
 (25, 'saturday'),
 (26, 'chemical'),
 (27, 'congress'),
 (28, 'an'),
 (29, 'what'),
 (30, 'be'),
 (31, 'president'),
 (32, 'n'),
 (33, 'will'),
 (34, 'would'),
 (35, 'has'),
 (36, 'have'),
 (37, 'this'),
 (38, 'by'),
 (39, 'with'),
 (40, 'over'),
 (41, 'from'),
 (42, 'are'),
 (43, 'his'),
 (44, 'who'),
 (45, 'was'),
 (46, 'use'),
 (47, 'take'),
 (48, 'inspectors'),
 (49, 'attack'),
 (50, 'it'),
 (51, 'united'),
 (52, 'state'),
 (53, 'lawmakers'),
 (54, 'against'),
 (55, 'or'),
 (56, 'political'),
 (57, 'debate'),
 (58, 'no'),
 (59, 'government'),
 (60, 'not'),
 (61, 'i'),
 (62, 'more'),
 (63, 'top'),
 (64, 'used'),
 (65, 

In [18]:
TOKENIZER.num_words = min(len(TOKENIZER.word_index)+1, TOKENIZER.num_words)

In [19]:
gen = text_generator(FILES)
x, y = next(gen)

In [20]:
len(x)

9442

In [21]:
x

'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria.\n\nObama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons.\n\nThe proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction."\n\nIt\'s a step that is set to turn an international crisis into a fierce domestic political battle.\n\nThere are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react?\n\nIn a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -- but 

In [22]:
seq = tokenize(
    x,
    y,
    TOKENIZER.texts_to_sequences,
    maxlen=MAX_SEQUENCE_LEN,
    target_begin_token=TARGET_BEGIN_TOKEN,
    end_token=END_TOKEN)
seq

[[93,
  68,
  12,
  21,
  31,
  124,
  15,
  69,
  53,
  5,
  125,
  7,
  11,
  94,
  5,
  46,
  16,
  70,
  7,
  13,
  15,
  210,
  6,
  211,
  5,
  4,
  212,
  8,
  4,
  71,
  9,
  126,
  11,
  25,
  95,
  213,
  72,
  214,
  14,
  22,
  215,
  16,
  23,
  54,
  18,
  216,
  17,
  4,
  127,
  128,
  5,
  47,
  40,
  4,
  129,
  46,
  8,
  26,
  20,
  4,
  130,
  217,
  41,
  15,
  218,
  27,
  5,
  219,
  4,
  46,
  8,
  16,
  70,
  5,
  220,
  221,
  222,
  9,
  223,
  4,
  224,
  19,
  1,
  18,
  68,
  15,
  205,
  5,
  4,
  63,
  8,
  4,
  206,
  3,
  101,
  77,
  5,
  92,
  207,
  2]]

In [23]:
[[index_to_word[i] for i in L] for L in seq]

[["it's",
  'official',
  'u',
  's',
  'president',
  'barack',
  'obama',
  'wants',
  'lawmakers',
  'to',
  'weigh',
  'in',
  'on',
  'whether',
  'to',
  'use',
  'military',
  'force',
  'in',
  'syria',
  'obama',
  'sent',
  'a',
  'letter',
  'to',
  'the',
  'heads',
  'of',
  'the',
  'house',
  'and',
  'senate',
  'on',
  'saturday',
  'night',
  'hours',
  'after',
  'announcing',
  'that',
  'he',
  'believes',
  'military',
  'action',
  'against',
  'syrian',
  'targets',
  'is',
  'the',
  'right',
  'step',
  'to',
  'take',
  'over',
  'the',
  'alleged',
  'use',
  'of',
  'chemical',
  'weapons',
  'the',
  'proposed',
  'legislation',
  'from',
  'obama',
  'asks',
  'congress',
  'to',
  'approve',
  'the',
  'use',
  'of',
  'military',
  'force',
  'to',
  'deter',
  'disrupt',
  'prevent',
  'and',
  'degrade',
  'the',
  'potential',
  'for',
  '<target-begin>',
  'syrian',
  'official',
  'obama',
  'climbed',
  'to',
  'the',
  'top',
  'of',
  'the',
  '

In [24]:
len(seq), len(seq[0])

(1, 100)

In [25]:
from keras.preprocessing.sequence import pad_sequences
pad_sequences(seq)

array([[ 93,  68,  12,  21,  31, 124,  15,  69,  53,   5, 125,   7,  11,
         94,   5,  46,  16,  70,   7,  13,  15, 210,   6, 211,   5,   4,
        212,   8,   4,  71,   9, 126,  11,  25,  95, 213,  72, 214,  14,
         22, 215,  16,  23,  54,  18, 216,  17,   4, 127, 128,   5,  47,
         40,   4, 129,  46,   8,  26,  20,   4, 130, 217,  41,  15, 218,
         27,   5, 219,   4,  46,   8,  16,  70,   5, 220, 221, 222,   9,
        223,   4, 224,  19,   1,  18,  68,  15, 205,   5,   4,  63,   8,
          4, 206,   3, 101,  77,   5,  92, 207,   2]], dtype=int32)

In [26]:
[[index_to_word[i] for i in x] for x in pad_sequences(seq)]

[["it's",
  'official',
  'u',
  's',
  'president',
  'barack',
  'obama',
  'wants',
  'lawmakers',
  'to',
  'weigh',
  'in',
  'on',
  'whether',
  'to',
  'use',
  'military',
  'force',
  'in',
  'syria',
  'obama',
  'sent',
  'a',
  'letter',
  'to',
  'the',
  'heads',
  'of',
  'the',
  'house',
  'and',
  'senate',
  'on',
  'saturday',
  'night',
  'hours',
  'after',
  'announcing',
  'that',
  'he',
  'believes',
  'military',
  'action',
  'against',
  'syrian',
  'targets',
  'is',
  'the',
  'right',
  'step',
  'to',
  'take',
  'over',
  'the',
  'alleged',
  'use',
  'of',
  'chemical',
  'weapons',
  'the',
  'proposed',
  'legislation',
  'from',
  'obama',
  'asks',
  'congress',
  'to',
  'approve',
  'the',
  'use',
  'of',
  'military',
  'force',
  'to',
  'deter',
  'disrupt',
  'prevent',
  'and',
  'degrade',
  'the',
  'potential',
  'for',
  '<target-begin>',
  'syrian',
  'official',
  'obama',
  'climbed',
  'to',
  'the',
  'top',
  'of',
  'the',
  '

In [27]:
s = seq[0]
s

[93,
 68,
 12,
 21,
 31,
 124,
 15,
 69,
 53,
 5,
 125,
 7,
 11,
 94,
 5,
 46,
 16,
 70,
 7,
 13,
 15,
 210,
 6,
 211,
 5,
 4,
 212,
 8,
 4,
 71,
 9,
 126,
 11,
 25,
 95,
 213,
 72,
 214,
 14,
 22,
 215,
 16,
 23,
 54,
 18,
 216,
 17,
 4,
 127,
 128,
 5,
 47,
 40,
 4,
 129,
 46,
 8,
 26,
 20,
 4,
 130,
 217,
 41,
 15,
 218,
 27,
 5,
 219,
 4,
 46,
 8,
 16,
 70,
 5,
 220,
 221,
 222,
 9,
 223,
 4,
 224,
 19,
 1,
 18,
 68,
 15,
 205,
 5,
 4,
 63,
 8,
 4,
 206,
 3,
 101,
 77,
 5,
 92,
 207,
 2]

In [28]:
one_hot = TOKENIZER.sequences_to_matrix([[i] for i in s])

In [29]:
one_hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [30]:
# only one per row
import numpy as np
np.argwhere(one_hot == 1)[:50]

array([[  0,  93],
       [  1,  68],
       [  2,  12],
       [  3,  21],
       [  4,  31],
       [  5, 124],
       [  6,  15],
       [  7,  69],
       [  8,  53],
       [  9,   5],
       [ 10, 125],
       [ 11,   7],
       [ 12,  11],
       [ 13,  94],
       [ 14,   5],
       [ 15,  46],
       [ 16,  16],
       [ 17,  70],
       [ 18,   7],
       [ 19,  13],
       [ 20,  15],
       [ 21, 210],
       [ 22,   6],
       [ 23, 211],
       [ 24,   5],
       [ 25,   4],
       [ 26, 212],
       [ 27,   8],
       [ 28,   4],
       [ 29,  71],
       [ 30,   9],
       [ 31, 126],
       [ 32,  11],
       [ 33,  25],
       [ 34,  95],
       [ 35, 213],
       [ 36,  72],
       [ 37, 214],
       [ 38,  14],
       [ 39,  22],
       [ 40, 215],
       [ 41,  16],
       [ 42,  23],
       [ 43,  54],
       [ 44,  18],
       [ 45, 216],
       [ 46,  17],
       [ 47,   4],
       [ 48, 127],
       [ 49, 128]])

# Define batch generator

In [31]:
def sequencer(tokens):
    return [tokens[:i] for i in range(1, len(tokens)+1)]

In [32]:
list(sequencer('a quick brown fox'))

['a',
 'a ',
 'a q',
 'a qu',
 'a qui',
 'a quic',
 'a quick',
 'a quick ',
 'a quick b',
 'a quick br',
 'a quick bro',
 'a quick brow',
 'a quick brown',
 'a quick brown ',
 'a quick brown f',
 'a quick brown fo',
 'a quick brown fox']

In [33]:
import random
import numpy as np

class BatchGenerator:
    def __init__(self, files, tokenizer, maxlen, batch_size, target_begin_token, end_token, epoch_end=None):
        self.files = files
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.batch_size = batch_size
        self.target_begin_token = target_begin_token
        self.end_token = end_token
        self.epoch_end = epoch_end
        
    def generate(self):
        steps = []
        while True:
            random.shuffle(self.files)
            for input_text, target_text in self.iter_files(self.files):
                tokens = self.tokenize(input_text, target_text)

                for seq_tokens in self.sequence(tokens):
                    steps.append(seq_tokens)
                    
                while len(steps) >= self.batch_size:
                    batch = steps[:self.batch_size]
                    X = pad_sequences(batch, maxlen=self.maxlen)
                    y = self.tokenizer.sequences_to_matrix([[i] for s in X for i in s])
                    y = y.reshape((self.batch_size, self.maxlen, self.tokenizer.num_words))
                    
                    # offset
                    X = X[:-1]
                    y = y[1:]
                    yield X, y
                    
                    # reset
                    steps = steps[self.batch_size:]
            yield self.epoch_end
        
    def preprocess(self, text):
        # replace all occurences of multiple newlines and replace them
        # with a single newline padded with spaces so it is treated as a
        # token
        text = ' \n '.join(t for t in text.split('\n') if t)
        table = {ord(c): None for c in '<>'}
        text = text.translate(table)
        return text

    def iter_files(self, files):
        for f in files:
            text = open(f).read()
            text = self.preprocess(text)
            # remove highlights
            body, highlight1, *_ = text.split('@highlight')
            yield body, highlight1
        
    def tokenize(self, input_text, target_text):
        input_tokens = self.tokenizer.texts_to_sequences([input_text])[0]
        target_tokens = self.tokenizer.texts_to_sequences([target_text])[0]
        input_tokens = input_tokens[:self.maxlen-len(target_tokens)-2]
        return input_tokens + [self.target_begin_token] + target_tokens + [self.end_token]

    def sequence(self, tokens):
        return [tokens[:i] for i in range(1, len(tokens)+1)] 

In [34]:
batch_gen = BatchGenerator(
    files=FILES,
    tokenizer=TOKENIZER,
    maxlen=MAX_SEQUENCE_LEN,
    batch_size=32,
    target_begin_token=TARGET_BEGIN_TOKEN,
    end_token=END_TOKEN).generate()

In [35]:
X, y = next(batch_gen)

In [36]:
X.shape, y.shape

((31, 100), (31, 100, 600))

In [37]:
X

array([[  0,   0,   0, ...,   0,   0,  93],
       [  0,   0,   0, ...,   0,  93,  68],
       [  0,   0,   0, ...,  93,  68,  12],
       ...,
       [  0,   0,   0, ..., 212,   8,   4],
       [  0,   0,   0, ...,   8,   4,  71],
       [  0,   0,   0, ...,   4,  71,   9]], dtype=int32)

In [38]:
y

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [39]:
[' '.join([index_to_word.get(i, '<pad>') for i in x]) for x in X]

["<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> it's",
 "<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [40]:
# only one per row
import numpy as np
ys = np.argwhere(y[0] == 1)

In [41]:
import numpy as np
for j in range(0, len(y), 5):
    ys = np.argwhere(y[j] == 1)
    assert len(ys) == len({row for row, idx in ys})
    print(' '.join(index_to_word[idx] for row, idx in ys))

<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> it's official
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

# Training

In [42]:
N_HEADS = 8
N_LAYERS = 6
D_MODEL = 64*N_HEADS
VOCAB_SIZE = TOKENIZER.num_words
WARMUP_STEPS = 200

In [43]:
batch_gen = BatchGenerator(
    files=FILES,
    tokenizer=TOKENIZER,
    maxlen=MAX_SEQUENCE_LEN,
    batch_size=32,
    target_begin_token=TARGET_BEGIN_TOKEN,
    end_token=END_TOKEN
).generate()

In [44]:
# loop over batch generator until we hit the end of the epoch
# to calculate number of batches in epoch and compute some
# stats along the way
steps_per_epoch = 0
for batch in batch_gen:
    if batch is None:
        break
    steps_per_epoch += 1

In [45]:
print('steps per epoch', steps_per_epoch)

steps per epoch 3


In [46]:
train_gen = (X for X in batch_gen if not X == 0)

In [47]:
from keras.callbacks import TerminateOnNaN
callbacks = [TerminateOnNaN()]

In [48]:
from model_decoder import TransformerDecoder
model = TransformerDecoder(
        n_heads=N_HEADS, decoder_layers=N_LAYERS,
        d_model=D_MODEL, vocab_size=VOCAB_SIZE, sequence_len=MAX_SEQUENCE_LEN,
        layer_normalization=True, dropout=True,
        residual_connections=True)

In [49]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 512)     307200      input[0][0]                      
__________________________________________________________________________________________________
positional_encoding_1 (Position (None, 100, 512)     0           embedding[0][0]                  
__________________________________________________________________________________________________
embedding_scalar (Scalar)       (None, 100, 512)     0           positional_encoding_1[0][0]      
__________________________________________________________________________________________________
dropout_1 

In [50]:
# import keras.backend as K
# def loss(y_true, y_pred):
#    return K.categorical_crossentropy(y_true[:,-1:,:], y_pred[:,-1:,:])

In [51]:
loss = 'categorical_crossentropy'

In [52]:
class LRScheduler:
    def __init__(self, d_model, warmup_steps):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.epoch = 1

    def lr(self, epoch):
        lr = self.d_model**-.5 * min(self.epoch**-.5, epoch*(self.warmup_steps**-1.5))
        self.epoch += 1
        return lr
lr_scheduler = LRScheduler(D_MODEL, WARMUP_STEPS)

In [53]:
from keras.callbacks import LearningRateScheduler
# callbacks.append(LearningRateScheduler(lr_scheduler.lr))

In [54]:
from keras.optimizers import adam
model.compile(loss=loss, optimizer=adam(lr=1e-4))

In [None]:
# from keras import backend as K
# old_lr = K.get_value(model.optimizer.lr)
# K.set_value(model.optimizer.lr, 1e-4)

In [None]:
n_epochs = 1000
model.fit_generator(
    train_gen, steps_per_epoch=steps_per_epoch,
    epochs=n_epochs, callbacks=callbacks)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000