# Captal Letters Recognition

In [1]:
import os

import nltk
nltk.download('semcor')
nltk.download('punkt')
nltk.download('perluniprops')
from nltk.corpus import semcor
from nltk.tokenize.moses import MosesDetokenizer
from keras.layers import Embedding, LSTM, GRU, Conv1D, Dense, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import numpy as np

# detokenization: turnig tokens back into sentences
MDETOK = MosesDetokenizer()

[nltk_data] Downloading package semcor to /Users/aloosley/nltk_data...
[nltk_data]   Package semcor is already up-to-date!
[nltk_data] Downloading package punkt to /Users/aloosley/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/aloosley/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Load Brown / Semcor Corpus, select sentences with lots of capitalized words

In [2]:
def filter_titled_sents(sentences, u_case_min_nb=3):
    '''select only those sentences with at least u_case_min_nb number of words beginning with capital letter'''
    filtered_sents = []
    for sent in sentences:
        nb_titles = 0
        for token in sent:
            if token.istitle():
                nb_titles += 1
        if nb_titles >= u_case_min_nb:
            filtered_sents.append(sent)
    return filtered_sents

In [3]:
sents = semcor.sents()  # loading tokenized sentences from Semcor corpus
print("number of sentences: %s" % len(sents))
sents = filter_titled_sents(sents, u_case_min_nb=3)
print("number of sentences after filtering: %s" % len(sents))
print("sample sentence:")
print(sents[10000])

number of sentences: 37176
number of sentences after filtering: 10031
sample sentence:
['Albert', 'Einstein', 'was', 'quoted', 'as', 'saying', ':', '``', 'The', 'workings', 'of', 'the', 'woman', "'s", 'mind', 'amaze', 'me', "''", '.']


In [4]:
# we are going to keep only shorter sentences
MAX_SEQUENCE_LENGTH = 35
sents = [sent for sent in sents if len(sent) <= MAX_SEQUENCE_LENGTH]
print("number of sentences after filtering: %s" % len(sents))

# and clean the dataset a bit removing tokens like `` first
filter_out_toks = {'``', '\'\''}
sents =[[word for word in sent if word not in filter_out_toks] for sent in sents]

number of sentences after filtering: 7144


## Word Level Models

In this task we are going to use pre-trained "GloVe" word embeddings that can be downloaded from https://nlp.stanford.edu/data/glove.6B.zip

You can have a look at the visualization of pre-trained Word2Vec word embeddings, words with similar meaning are clustered together: http://projector.tensorflow.org/

They used PCA or T-SNE to reduce the dimensionality of the vectors.

In [5]:
# make sentences lowercase
sents_lower =[[word.lower() for word in sent] for sent in sents]
# annotate words in sentences based on their first letter case
capitalization_sent_tags = [[word.istitle() for word in sent] for sent in sents]
print("tokens anotated based on their first letter case:")
print(zip(sents_lower[-500], capitalization_sent_tags[-500]))

tokens anotated based on their first letter case:
<zip object at 0x11086b488>


In [6]:
# create a vocabulary of all words in our dataset
words = set([])
for sent in sents_lower:
    words.update(sent)
print("vocabulary size: %s" % len(words))

vocabulary size: 18482


In [7]:
# create a dictionary, an index for each word
dictionary = dict()
for i, word in enumerate(words):
    dictionary[word] = i
print("index of `hello`: %s" % dictionary["hello"])

# a mapping for indexes back into words
idx2word = {}
for word, i in dictionary.items():
    idx2word[i] = word

index of `hello`: 12954


In [8]:
# convert sentences into sequences of word indexes
sequences = [[dictionary[word] for word in sent] for sent in sents_lower]
print("sequence of word indexes for each sentence: %s" % sequences[0][:10])

sequence of word indexes for each sentence: [2238, 9282, 4107, 4046, 6600, 7949, 4079, 17655, 14533, 16476]


In [9]:
# pad sequences with zeros to make them same length: we need it for vectorized computations
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
data[0]

array([ 2238,  9282,  4107,  4046,  6600,  7949,  4079, 17655, 14533,
       16476,  6267,  1646,  5989, 14612,  6025, 17220,  7450,  1410,
         939, 17078,  1327, 13214,  8863, 14514,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [10]:
# labels will be converted to categories: first indicates the probability of a capitalized word, second a lowercased word 
labels = pad_sequences(capitalization_sent_tags, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = to_categorical(labels)
labels[0][:10]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [11]:
# our dataset will be split into a traing part and a validation part,
# where we measure our model's performance during training,
# this can be done automatically during the Keras model training

# we will further keep a testing part to evaluate predictions 
TEST_SPLIT = .1
nb_test_samples = int(TEST_SPLIT * data.shape[0])
print("number of test samples: %s" % nb_test_samples)

number of test samples: 714


In [12]:
# we want to shuffle the data a bit to split the dataset uniformly
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:-nb_test_samples]
y_train = labels[:-nb_test_samples]
x_test = data[-nb_test_samples:]
y_test = labels[-nb_test_samples:]

In [14]:
# let's create a dictionary of embeddings from each word embedding vector in the pre-trained GloVe embeddings file
GLOVE_DIR = "./glove.6B/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
print("embedding for the word `word`:")
print(embeddings_index.get("word"))

Found 400000 word vectors.
embedding for the word `word`:
[ 0.1233    0.55741   0.74203  -0.06547  -0.33485   0.81541  -0.16384
 -1.0327    0.41834  -0.012764 -0.60695   0.30146   0.35976   0.41161
  0.03381  -0.091115  0.35077  -0.24798  -0.13128   0.19869   0.046961
  0.014633 -0.39851  -0.11829  -0.27432  -0.032518 -0.23637  -0.072372
 -0.04237  -0.11159   0.12129   0.64011  -0.50275  -0.21584   0.30097
 -0.041772 -0.47972  -0.12897   0.6964   -0.27594  -0.29149   0.088033
  0.12874  -0.15249  -0.20548   0.029435  0.055133 -0.12994  -0.33869
 -0.61891   0.4743    0.60288   1.0209    0.48663  -1.0587   -1.9711
 -0.41751   0.12457   1.304     0.26925   0.28003   0.91141  -0.62217
 -0.70356   1.0379   -0.095316  0.54085  -0.36123  -0.10311  -0.31059
 -0.61454   0.63799   0.18329  -0.49599   0.3607    0.70414  -0.28096
  0.1062   -0.64866  -0.28698  -0.26623  -1.4502   -0.69456  -0.48722
 -1.6753    0.40353  -0.085219 -0.85528   0.65113   0.019457 -0.20924
  0.18864  -0.12794   0.41757 

In [15]:
# let's try to extract the GloVe embeddings for each word from our dataset vocabulary
from tqdm import tqdm
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(dictionary) + 1, EMBEDDING_DIM))
for word, i in tqdm(dictionary.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
del embeddings_index

100%|██████████| 18482/18482 [00:00<00:00, 103529.04it/s]


Checking how many words have no pre-trained GloVe word embeddings:

In [16]:
oov_percentage = 100. * np.count_nonzero(np.all(embedding_matrix == 0, axis=1)) / len(dictionary)  # OOV portion
print("percentage of words out of vocabulary: %s percent" % oov_percentage)
outta_vocab_idxs = set(np.where(np.all(embedding_matrix == 0, axis=1))[0])
outta_vocab_words = [word for word, i in dictionary.items() if i in outta_vocab_idxs]
print("examples of words without pre-trained GloVe embeddings:")
print(outta_vocab_words[:15])

percentage of words out of vocabulary: 6.709230602748621 percent
examples of words without pre-trained GloVe embeddings:
['', '37000', 'shoettle', 'esnards', 'future-day', 'hoijer', 'prosopopoeia', 'lagerlo', 'troubie', 'bullyboys', '300000', 'torrid-breeze', 'snobbishly', 'gaafer', 'schraffts']


In [17]:
def print_predictions(x_test, y_pred, idx2word):
    """
    print and detokenize the results of our model's predictions
    y_pred has two output channels, first one giving the probability of the lowercased word,
    the second channel the probability of the capitalized word.
    x_test are word indices of lowercased sentences
    """
    for seq, preds in zip(x_test, y_pred):
        sentence = []
        for word_id, pred in zip(seq, preds):
            if pred[0] > pred[1]:
                sentence.append(idx2word[word_id])
            else:
                sentence.append(idx2word[word_id].capitalize())
        print(MDETOK.detokenize(sentence, return_str=True).strip() + "\n")

### Simple baseline: A single fully connected layer
A Dense/fully connected layer won't remember the order of the words, but we might get some words right.

Embeddings layer will be using the weights from the pre-trained GloVe vectors. We don't want to change them so we set `trainable=False`  

We use softmax acivation, all outputs (2) of our model will sum up to one.

20 % of the dataset will be used for validation.

Number of epochs tells us how many times we want our model to look at all the traing samples in our traing set.

If we set a number of epochs too high and our model is very complex, it will start overfitting the training data and the validation error will start rising.

In [18]:
BATCH_SIZE = 32
model = Sequential()
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))  # Note the embedding_matrix above gives the glove vecs
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_split=0.2, epochs=30, batch_size=BATCH_SIZE)
score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

Train on 5144 samples, validate on 1286 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test accuracy: 0.9188075200850222


In [20]:
# let's see what our model predicts
y_pred = model.predict(x_test)
print_predictions(x_test[:50], y_pred[:50], idx2word)

he came by and repeated, Po 'Chavis!

one of the proteases has ph optimum of about 3.7 and another of about 5.7 ( mcquillan, Stanley and trikojus, 1954; alpers, Robbins and rall, 1955).

there, gregorio said, huff Wrote a complete statement of his offense.

the season will open at the new Hall of flowers in Golden gate Park on november 20 at 8: 30 P. M. with a concert by the Mills Chamber players.

thornburg added in a lower voice but Andy overheard, they act more like a jury than an audience.

have you investigated the possibility of moving midweek holidays forward to monday or back to friday in order to have an uninterrupted work week?

three were doubles, Brooks Robinson getting a pair and marv breeding one.

a week later the daily journal had discovered the initial plans of some Providence citizens to hold a meeting honoring John Brown on the day of his execution.

what Hume calls sensation is what Whitehead calls perception in the mode of presentational immediacy which is a sophis

### RNN Model
add a Bidirectional layer of LSTM cells or GRU cells after the Embedding layer: 

`Bidirectional(cells, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))`

you can try experimenting with different `cells` (LSTM, RNN,...):

Try `LSTM(HIDDEN_SIZE_LSTM, return_sequences=...)` or `GRU(HIDDEN_SIZE_LSTM, return_sequences=...)`, can you see any difference in training or accuracy when using different cells?

if we set `return_sequences` True, we will get the output of the cells in each timestep of the sequence, that's what we want:)

you can change the complexity of the model by setting `HIDDEN_SIZE_LSTM` which changes the number of `units`.


In [30]:
HIDDEN_SIZE_LSTM = 100
BATCH_SIZE = 32
model = Sequential()
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Bidirectional(LSTM(HIDDEN_SIZE_LSTM, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=BATCH_SIZE)
score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

Train on 5144 samples, validate on 1286 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9729491859590974


In [31]:
# let's see what our model predicts
y_pred = model.predict(x_test)
print_predictions(x_test[:50], y_pred[:50], idx2word)

He came by and repeated, Po 'Chavis!

One of the proteases has ph optimum of about 3.7 and another of about 5.7 ( Mcquillan, Stanley and Trikojus, 1954; Alpers, Robbins and Rall, 1955).

There, Gregorio said, Huff wrote a complete statement of his offense.

The season will open at the new hall of flowers in Golden Gate Park on November 20 at 8: 30 p. m. with a concert by the Mills Chamber players.

Thornburg added in a lower voice but Andy overheard, they act more like a jury than an audience.

Have you investigated the possibility of moving midweek holidays forward to Monday or back to Friday in order to have an uninterrupted work week?

Three were doubles, Brooks Robinson getting a pair and Marv breeding one.

A week later the daily Journal had discovered the initial plans of some Providence citizens to hold a meeting honoring John Brown on the day of his execution.

What Hume calls sensation is what Whitehead calls perception in the mode of presentational immediacy which is a sophis

### CNN Model

Use Conv1d instead of RNN layers:

`Conv1D(filters=EMBEDDING_DIM, kernel_size=..., activation='relu', padding=...)`

We need to preserve the sequence legth, when going from one layer to another, so we set padding='same'

kernel_size (window size) is a parameter setting the scope of view for our convolutional filter, how many words we look at.

We want a filter for each index of our word embedding vector

Try setting padding='causal'. This will make our window (kernel) wider, but we will look only at every other word in a sequence inside the window. This is also called a dilated convolution.




In [34]:
BATCH_SIZE = 32
WINDOW_SIZES = [3, 3]
model = Sequential()
model.add(Embedding(input_dim=len(dictionary) + 1, output_dim=EMBEDDING_DIM, weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH, trainable=False))
# TODO: Try experimenting...adding some 1D Convolutinal layers here.
model.add(Conv1D(filters=EMBEDDING_DIM, kernel_size=5, activation='relu', padding='same'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_split=0.2, epochs=10, batch_size=BATCH_SIZE)
score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('Test accuracy:', acc)

Train on 5144 samples, validate on 1286 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9690676318163297


In [35]:
# let's see what our model predicts
y_pred = model.predict(x_test)
print_predictions(x_test[:50], y_pred[:50], idx2word)

He came by and repeated, Po 'Chavis!

One of the proteases has ph optimum of about 3.7 and another of about 5.7 ( Mcquillan, Stanley and Trikojus, 1954; Alpers, Robbins and Rall, 1955).

There, Gregorio said, Huff wrote a complete statement of his offense.

The season will open at the new hall of flowers in Golden Gate Park on November 20 at 8: 30 p. m. with a concert by the Mills Chamber players.

Thornburg added in a lower voice but Andy overheard, they act more like a jury than an audience.

Have you investigated the possibility of moving midweek holidays forward to Monday or back to Friday in order to have an uninterrupted work week?

Three were doubles, Brooks Robinson getting a pair and Marv breeding one.

A week later the daily Journal had discovered the initial plans of some Providence citizens to hold a meeting honoring John Brown on the day of his execution.

What Hume calls sensation is what Whitehead calls perception in the mode of presentational immediacy which is a sophis

## Character Level Models

We are now going to work with models that look at each letter of the text, deciding whether it should be big or small.
The strategy can be the same as with the words, but now we have no pre-trained vector embeddings, so we initialize them randomly and allow their training by setting trainable=True.

In [36]:
HIDDEN_SIZE_LSTM = 100
EMBEDDING_DIM = 20
BATCH_SIZE = 32
WIN_SIZE = 30

In [37]:
# our dataset will be split into a traing part and a validation part, where we measure our model's performance
VALIDATION_SPLIT = .2
nb_validation_samples = int(VALIDATION_SPLIT * len(sents))
indices = np.arange(len(sents))
np.random.shuffle(indices)
train_indices = indices[:-nb_validation_samples]
val_indices = indices[-nb_validation_samples:]


whole_sents_train = [MDETOK.detokenize(sents[index], return_str=True) for index in train_indices]
whole_sents_val = [MDETOK.detokenize(sents[index], return_str=True) for index in val_indices]
whole_sents_lower_train = [sent.lower() for sent in whole_sents_train]
whole_sents_lower_val = [sent.lower() for sent in whole_sents_val]

whole_text_train = " ".join(whole_sents_train)
whole_text_val = " ".join(whole_sents_val)
whole_text_lower_train = whole_text_train.lower()
whole_text_lower_val = whole_text_val.lower()

# create a vocabulary
chars_vocab = set(whole_text_lower_train).union(whole_text_lower_val)
print("vocabulary size: %s" % len(chars_vocab))

# create a dictionary, an index for each character
chars_dictionary = dict()
for i, char in enumerate(chars_vocab):
    chars_dictionary[char] = i

# a mapping for indexes back into chars
idx2char = {}
for char, i in chars_dictionary.items():
    idx2char[i] = char

vocabulary size: 59


### Looking at the whole sentence characters in a single prediction

In [38]:
capitalization_char_tags_train = [[char.isupper() for char in sent] for sent in whole_sents_train]
capitalization_char_tags_val = [[char.isupper() for char in sent] for sent in whole_sents_val]

# convert sentences into sequences of character indexes
sequences_train = [[chars_dictionary[char] for char in sent] for sent in whole_sents_lower_train]
sequences_val = [[chars_dictionary[char] for char in sent] for sent in whole_sents_lower_val]

In [39]:
MAX_SEQUENCE_LENGTH = 200

data_train = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=chars_dictionary[" "])
data_val = pad_sequences(sequences_val, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=chars_dictionary[" "])

labels_train = pad_sequences(capitalization_char_tags_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=chars_dictionary[" "])[:,:,np.newaxis]
labels_val = pad_sequences(capitalization_char_tags_val, maxlen=MAX_SEQUENCE_LENGTH, padding='post', value=chars_dictionary[" "])[:,:,np.newaxis]

#### RNN Model

In [None]:
HIDDEN_SIZE_LSTM = 100
BATCH_SIZE = 32
model = Sequential()
model.add(Embedding(input_dim=len(chars_dictionary) + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model.add(Bidirectional(LSTM(HIDDEN_SIZE_LSTM, return_sequences=True), input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(data_train, labels_train, validation_data=(data_val, labels_val), epochs=10, batch_size=BATCH_SIZE)

Train on 5716 samples, validate on 1428 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

### CNN model
let's try to build a similar model like the one we used for words above, only keeping allowing the Embeddings to be trainable this time :).

In [None]:
# TODO implement a character level CNN model, notice that now we output a single value as our prediction
# before our output was categorical, one output for lowercase, one for uppercase

In [None]:
def print_char_predictions(x_test, y_pred, idx2char):
    """
    print the results of our model's predictions
    y_pred is a number between zero and one, a probability of the letter being Uppercase.
    x_test are char indices of lowercased sentences
    """
    for seq, preds in zip(x_test, y_pred):
        sentence = []
        for char_id, pred in zip(seq, preds):
            if pred > .5:
                sentence.append(idx2char[char_id].upper())
            else:
                sentence.append(idx2char[char_id])
        print("".join(sentence) + "\n")

In [None]:
y_pred = model.predict(data_val)
print_char_predictions(data_val, y_pred, idx2char)

### Using a smaller sliding window on a continuous text
up to now, the sequences of characters were quite long (hundreds of characters)...We can have a look at continuous text and create a window, that will limit our scope to say 30 characters. We are going to slide the window one character forwards and make predictions in each of them.

What to do with the excess predictions from overlapping windows?

Let's just take the prediction from the middle of the window. So that we cover enough context from both sides.

In [None]:
capitalization_char_tags_train = [char.isupper() for char in whole_text_train]
capitalization_char_tags_val = [char.isupper() for char in whole_text_val]

# convert sentences into sequences of character indexes
sequence_train = [chars_dictionary[char] for char in whole_text_lower_train]
sequence_val = [chars_dictionary[char] for char in whole_text_lower_val]

In [None]:
def window_batch_generator(sequence, labels, win_size=30, batch_size=32):
    """yield batches of sequences and lables in windows, each window sliding one step-ahead"""
    x_batch, y_batch = [], []
    while True:
        for i in range(0, len(sequence) - win_size):
            if len(x_batch) == batch_size:
                yield np.array(x_batch), np.array(y_batch, dtype="float64")[:,:,np.newaxis]
                x_batch, y_batch = [], []
            x_batch.append(sequence[i:i + win_size])
            y_batch.append(labels[i:i + win_size])
        if len(x_batch) != 0:
            yield np.array(x_batch), np.array(y_batch, dtype="float64")[:,:,np.newaxis]

#### RNN Model

In [None]:
data_gen_train = window_batch_generator(sequence_train, capitalization_char_tags_train)
data_gen_val = window_batch_generator(sequence_val, capitalization_char_tags_val)

we will now provide our Keras model with a Generator that is going to collect samples batch by batch

the generator is "bottom-less" repeating it's iteration after we go through the whole text (EPOCH)
we will calculate the epoch size to let our model know when to go to next epoch:

In [None]:
steps_per_epoch_train = (len(sequence_train) - WIN_SIZE) / BATCH_SIZE
steps_per_epoch_val = (len(sequence_val) - WIN_SIZE) / BATCH_SIZE

In [None]:
HIDDEN_SIZE_GRU = 10
EMBEDDING_DIM = 20
BATCH_SIZE = 32
WIN_SIZE = 30
NB_EPOCHS = 1
model = Sequential()
model.add(Embedding(input_dim=len(chars_dictionary) + 1, output_dim=EMBEDDING_DIM, input_length=WIN_SIZE, trainable=True))
model.add(Bidirectional(GRU(HIDDEN_SIZE_GRU, return_sequences=True), input_shape=(WIN_SIZE, EMBEDDING_DIM)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(data_gen_train, validation_data=data_gen_val,
                    epochs=NB_EPOCHS, steps_per_epoch=steps_per_epoch_train, validation_steps=steps_per_epoch_val)

*We can take each overlapping window prediction and extract only its middle part to make sure we cover the neighboring characters from both sides:*

In [None]:
def print_mid_window_predictions(batch_samples, model, idx2char):
    """
    print the results of our model's predictions
    take only the prediction from the middle of a window.
    """
    win_size = test_samples[0].shape[1]
    mid_win_idx = win_size / 2
    extacted_text = []
    for batch in test_samples:
        predictions = model.predict_on_batch(batch)
        for chars, preds in zip(batch, predictions):
            if preds[mid_win_idx] > .5:
                extacted_text.append(idx2char[chars[mid_win_idx]].upper())
            else:
                extacted_text.append(idx2char[chars[mid_win_idx]])
    print("".join(extacted_text))

In [None]:
steps = 50
test_samples = [data_gen_val.next()[0] for _ in range(steps)]  # we extract some test samples from our validation set 

In [None]:
print_mid_window_predictions(test_samples, model, idx2char)

####  Predicting the first letter in a window only by learning to read backwards :D...we won't be able to recognize the beginnings of sentences, but perheaps we might succeed at catching some names, lets give it a try:

In [None]:
def window_batch_generator_first_letter_out(sequence, labels, win_size=30, batch_size=32):
    """this generator only outputs a single label for each window, a label of our first character"""
    x_batch, y_batch = [], []
    while True:
        for i in range(0, len(sequence) - win_size):
            if len(x_batch) == batch_size:
                yield np.array(x_batch), np.array(y_batch, dtype="float64")[:, np.newaxis]
                x_batch, y_batch = [], []
            x_batch.append(sequence[i:i + win_size])
            y_batch.append(labels[i])
        if len(x_batch) != 0:
            yield np.array(x_batch), np.array(y_batch, dtype="float64")[:, np.newaxis]

In [None]:
data_gen_train = window_batch_generator_first_letter_out(sequence_train, capitalization_char_tags_train)
data_gen_val = window_batch_generator_first_letter_out(sequence_val, capitalization_char_tags_val)

Now we are only making a prediction from the final RNN cell's output (last timestep) by setting return_sequences=False.

Setting go_backwards=True is going to revert the order of the sequence when passing it to RNN.

When training, we are giving only a single example (label) for the whole sequence of characters. This is similar to the task of IMDB movie sentiment. In the models before, we had labels for each letter in the sequence. Could it more difficult to train the model now as it sees less information?

In [None]:
HIDDEN_SIZE_GRU = 10
EMBEDDING_DIM = 20
BATCH_SIZE = 32
WIN_SIZE = 30
NB_EPOCHS = 3
model = Sequential()
model.add(Embedding(input_dim=len(chars_dictionary) + 1, output_dim=EMBEDDING_DIM, input_length=WIN_SIZE, trainable=True))
model.add(GRU(HIDDEN_SIZE_GRU, return_sequences=False, go_backwards=True, input_shape=(WIN_SIZE, EMBEDDING_DIM)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(data_gen_train, validation_data=data_gen_val,
                    epochs=NB_EPOCHS, steps_per_epoch=steps_per_epoch_train, validation_steps=steps_per_epoch_val)

In [None]:
def print_first_letter_predictions(batch_samples, model, idx2char):
    """
    print the results of our model's predictions
    take only the first letter prediction
    """
    extacted_text = []
    for batch in test_samples:
        predictions = model.predict_on_batch(batch)
        for chars, pred in zip(batch, predictions):
            if pred > .5:
                extacted_text.append(idx2char[chars[0]].upper())
            else:
                extacted_text.append(idx2char[chars[0]])
    print("".join(extacted_text))

In [None]:
steps = 50
test_samples = [data_gen_val.next()[0] for _ in range(steps)]  # we extract some test samples from our validation set
print_first_letter_predictions(test_samples, model, idx2char)