In [149]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

import re
import string

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
nltk.download('book')

[nltk_data] Downloading collection u'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |     /home/eric/nltk_data...
[nltk_data]    |   Unzipping corpora/dependen

True

***

In [67]:
#Total number of vocabulary slots; one slot is taken by the unknown token,
#so only the (vocabulary_size - 1) most frequent words are kept.
vocabulary_size = 1500
#Placeholder that replaces every word outside the vocabulary
unknown_token = 'UNKNOWN_TOKEN'
#Marker strings wrapped around each sentence before tokenization
sentence_start_token = 'SENTENCE_START'
sentence_end_token = 'SENTENCE_END'

In [30]:
#Read the corpus into `text` as a list of lines, keeping only the book body.
#The body is everything between the first and the second line that starts
#with '*'; lines outside that span are Project Gutenberg boilerplate.
text = []
ast_count = 0
#`with` closes the file even when reading raises part-way through
with open('wizOfOz.txt', 'r') as f:
    for line in f:
        #startswith() is safe on an empty string, unlike line[0]
        if ast_count == 0: #Gutenberg text
            if line.startswith('*'):
                ast_count += 1
        elif ast_count == 1: #Book text
            if line.startswith('*'): #Gutenberg text
                ast_count += 1
            else:
                text.append(line)

#Form corpus by joining list of lines
corpus = ''.join(text)

#Remove line breaks, carriage returns and backslashes
corpus = corpus.replace('\n', ' ')
corpus = corpus.replace('\r', '')
corpus = corpus.replace('\\', '')

#Start from the introduction, the second occurrence of the word Introduction.
#The match iterator is consumed inline so the builtin `iter` is not shadowed.
intro_indices = [m.start(0) for m in re.finditer(r'Introduction', corpus)]
corp = corpus[intro_indices[1]:]

In [40]:
#Split the corpus into sentences: decode the text as UTF-8, lower-case it,
#then let NLTK's sentence tokenizer find the sentence boundaries.
sentences = nltk.sent_tokenize(corp.decode('utf-8').lower())

In [44]:
#Wrap every sentence as "<start token> <sentence> <end token>" so the
#start and end markers become ordinary tokens of the sentence.
sentences = ['%s %s %s' % (sentence_start_token, x, sentence_end_token) for x in sentences]

In [47]:
print 'Parsed %d sentences' % (len(sentences))

Parsed 2223 sentences


In [49]:
#Tokenize each sentence into a list of word tokens with NLTK;
#the result is a list of token lists, one per sentence.
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [53]:
#Tally how often each token occurs over the flattened list of all sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print('Found %d unique word tokens' % len(word_freq))

Found 2959 unique word tokens


In [68]:
#Keep the (vocabulary_size - 1) most frequent words and reserve the last
#index for the unknown token; then build the inverse word -> index lookup.
vocab = word_freq.most_common(vocabulary_size - 1)
idx2word = [entry[0] for entry in vocab] + [unknown_token]
word2idx = {word: index for index, word in enumerate(idx2word)}
print('Using vocabulary size %d' % vocabulary_size)
print('The least frequent word in our vocabulary is "%s" appearing %d times' % (vocab[-1][0], vocab[-1][1]))

Using vocabulary size 1500
The least frequent word in our vocabulary is "hurry" appearing 2 times


In [75]:
#Swap every out-of-vocabulary word for the unknown token.
#Slice assignment rewrites the existing list in place, sentence by sentence.
tokenized_sentences[:] = [
    [token if token in word2idx else unknown_token for token in sentence]
    for sentence in tokenized_sentences
]

In [76]:
temp = np.random.randint(0, len(sentences))
print 'Example sentence: "%s"\nExample sentence after pre-processing "%s"' % (sentences[temp], tokenized_sentences[temp])

Example sentence: "SENTENCE_START she was dressed in green silk gauze and wore upon her flowing green locks a crown of jewels. SENTENCE_END"
Example sentence after pre-processing "[u'SENTENCE_START', u'she', u'was', u'dressed', u'in', u'green', u'silk', 'UNKNOWN_TOKEN', u'and', u'wore', u'upon', u'her', u'flowing', u'green', 'UNKNOWN_TOKEN', u'a', 'UNKNOWN_TOKEN', u'of', u'jewels', u'.', u'SENTENCE_END']"


***

In [84]:
#Create the training data.
#Inputs are every token index except the last; targets are the same sentence
#shifted left by one, so y_train[i][t] is the word that follows X_train[i][t].
#NOTE(review): the per-sentence lists presumably differ in length; recent NumPy
#releases may require dtype=object for such ragged input - confirm against the
#installed version.
X_train = np.asarray([[word2idx[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word2idx[w] for w in sent[1:]] for sent in tokenized_sentences])

In [88]:
#Training data example
temp = np.random.randint(0, len(sentences))
x_example, y_example = X_train[temp], y_train[temp]
print 'x:\n%s\n%s' % (' '.join([idx2word[x] for x in x_example]), x_example)
print 'y:\n%s\n%s' % (' '.join([idx2word[x] for x in y_example]), y_example)

x:
SENTENCE_START but he at once unlocked their spectacles , which he put back into the green box , and gave them many good wishes to carry with them .
[3, 26, 15, 31, 131, 1152, 73, 492, 1, 112, 15, 183, 74, 97, 0, 72, 806, 1, 5, 195, 33, 138, 101, 693, 6, 221, 28, 33, 4]
y:
but he at once unlocked their spectacles , which he put back into the green box , and gave them many good wishes to carry with them . SENTENCE_END
[26, 15, 31, 131, 1152, 73, 492, 1, 112, 15, 183, 74, 97, 0, 72, 806, 1, 5, 195, 33, 138, 101, 693, 6, 221, 28, 33, 4, 2]


$s_t = \tanh(Ux_t + Ws_{t-1})$  
$o_t = \mathrm{softmax}(Vs_t)$  
Let our hidden layer have size $H = 100$  
$x_t \in \mathbb{R}^{1500}$  
$o_t \in \mathbb{R}^{1500}$  
$s_t \in \mathbb{R}^{100}$  
$U \in \mathbb{R}^{100 \times 1500}$  
$V \in \mathbb{R}^{1500 \times 100}$  
$W \in \mathbb{R}^{100 \times 100}$

#### Initialization

In [89]:
class RNNNumpy:
    """Plain-NumPy recurrent network holding the weight matrices U, V and W.

    word_dim: size of the vocabulary (input and output dimension)
    hidden_dim: size of the hidden state
    bptt_truncate: stored on the instance for use by back-propagation code
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        #Each matrix is drawn uniformly from [-1/sqrt(n), 1/sqrt(n)], where n is
        #word_dim for U and hidden_dim for V and W.
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        #Draw order (U, V, W) is kept so a seeded RNG yields the same weights
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

#### Forward Propagation

In [105]:
def np_softmax(x):
    """Return the softmax of x, normalised over all elements of x.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but stops np.exp from overflowing to inf (and the
    quotient from becoming nan) when the inputs are large.

    x: array-like of scores
    returns: ndarray of the same shape whose entries sum to 1
    """
    x = np.asarray(x)
    if x.size == 0:
        #Nothing to normalise; np.max would raise on an empty array
        return np.exp(x)
    exps = np.exp(x - np.max(x))
    return exps / exps.sum()

In [106]:
def forward_propagation(self, x):
    """Run the network over the index sequence x.

    x: sequence of word indices, one per time step
    returns: [outputs, hidden] where outputs has one row of word probabilities
        per time step and hidden has one row per time step plus one extra
        trailing row that stays zero.
    """
    num_steps = len(x)
    #The extra last row is never written; step 0 reads hidden[-1], so that
    #all-zero row acts as the initial hidden state.
    hidden = np.zeros((num_steps + 1, self.hidden_dim))
    outputs = np.zeros((num_steps, self.word_dim))
    for step, word_index in enumerate(x):
        #Selecting column word_index of U equals multiplying U by a one-hot vector
        previous_state = hidden[step - 1]
        hidden[step] = np.tanh(self.U[:, word_index] + self.W.dot(previous_state))
        outputs[step] = np_softmax(self.V.dot(hidden[step]))
    return [outputs, hidden]

RNNNumpy.forward_propagation = forward_propagation

In [107]:
def predict(self, x):
    """Return, for every time step of x, the index of the most probable word."""
    probabilities, _hidden = self.forward_propagation(x)
    #argmax along axis 1 picks the best-scoring word within each time step
    return np.asarray(probabilities).argmax(axis=1)

RNNNumpy.predict = predict

In [109]:
#Example output
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print o.shape
print o

(50, 1500)
[[ 0.00067235  0.00066777  0.00066659 ...,  0.00066377  0.00066365
   0.00067008]
 [ 0.00067458  0.00067966  0.00066873 ...,  0.00068018  0.00065385
   0.00066934]
 [ 0.00066685  0.00065744  0.00067244 ...,  0.00067043  0.0006741
   0.00064967]
 ..., 
 [ 0.00066946  0.00067601  0.00066425 ...,  0.00066204  0.00067731
   0.0006514 ]
 [ 0.00067488  0.00066734  0.00066883 ...,  0.00067339  0.00065754
   0.00066401]
 [ 0.00066567  0.00066278  0.00066482 ...,  0.00066243  0.0006694
   0.0006709 ]]


In [110]:
predictions = model.predict(X_train[10])
print predictions.shape
print predictions

(50,)
[ 979  397  529  296 1398  156  203  769  764 1160  740 1428 1123 1042  219
  822  506  660  691 1071 1270  475 1042 1155  957  974  381  608 1157  813
  726   46  827  225  830 1096  294  972 1412  769 1133  465  241  228  199
 1398 1374  865  132 1257]


In [112]:
' '.join([idx2word[i] for i in predictions])

u"mishap dear strong each usually went journey almost gently bounded children oats hungry path three stretched covered rolled 've suggested gingham without path calmly dreadful pleasant kill bottom strip pulled lonely do seems cried chop managed bed beyond troubles almost dangers best small sure next usually dotted presently thought loss"

#### Calculating Loss

In [114]:
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    x: sequence of input index sequences
    y: sequence of target index sequences, aligned with x
    returns: the summed negative log-probability assigned to the target words
    """
    total = 0
    for idx in range(len(y)):
        targets = y[idx]
        probabilities, _hidden = self.forward_propagation(x[idx])
        #Pick, at each time step, the probability given to the correct word
        picked = probabilities[np.arange(len(targets)), targets]
        total -= np.sum(np.log(picked))
    return total

def calculate_loss(self, x, y):
    """Return the average loss per target word.

    x: sequence of input index sequences
    y: sequence of target index sequences, aligned with x
    returns: calculate_total_loss(x, y) divided by the total number of target words
    """
    #The builtin sum is used because passing a generator to np.sum is deprecated
    N = sum(len(y_i) for y_i in y)
    #float() keeps this a true division regardless of the operand types
    return self.calculate_total_loss(x, y) / float(N)

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [122]:
#Loss for the first 1000 examples.
#The reference value printed first is log(vocabulary_size), i.e. the per-word
#cross-entropy of a uniform guess over the vocabulary; the second line is the
#average per-word loss reported by the model.
print 'Expected loss for random predictions: %f' % np.log(vocabulary_size)
print 'Actual loss: %f' % model.calculate_loss(X_train[:1000], y_train[:1000])

Expected loss for random predictions: 7.313220
Actual loss: 7.313597


#### Training the RNN through SGD and BPTT

In [124]:
def bptt(self, x, y):
    """Compute loss gradients for one sequence by back-propagation through time.

    x: sequence of input word indices
    y: sequence of target word indices, aligned with x
    returns: [dLdU, dLdV, dLdW], each shaped like the matching weight matrix
    """
    T = len(y)
    #Perform forward propagation
    o, s = self.forward_propagation(x)
    #Gradient accumulators
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    #delta_o is an alias of o (no copy), so the subtraction below modifies the
    #array returned by forward_propagation in place: 1 is subtracted from the
    #entry of the target word at each time step.
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1
    #For each output, backwards ...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        #Initial delta; (1 - s[t] ** 2) is the tanh derivative, assuming s holds
        #tanh activations
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        #Backpropagation through time, visiting steps t, t-1, ... down to
        #max(0, t - self.bptt_truncate)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            #For bptt_step == 0 the index bptt_step-1 is -1, i.e. the last row of s
            dLdW += np.outer(delta_t, s[bptt_step-1])
            #Only the column of U belonging to the input word receives gradient
            dLdU[:,x[bptt_step]] += delta_t
            #Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1]**2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

#### Gradient Checking

In [125]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    """Compare back-propagation gradients against central finite differences.

    Every element of U, V and W is perturbed by +h and -h in turn; the
    resulting change in total loss gives a numerical gradient estimate that is
    compared with the matching entry returned by self.bptt. Progress and the
    first failure (if any) are printed; checking stops at the first failure.

    x, y: one input sequence and its target sequence
    h: perturbation size
    error_threshold: largest tolerated relative error
    returns: None
    """
    #All calls go through self rather than a global `model`, so the check
    #always inspects the instance it was invoked on.
    bptt_gradients = self.bptt(x, y)
    #Parameters to check
    model_parameters = ['U', 'V', 'W']
    for pidx, pname in enumerate(model_parameters):
        #Get actual parameter value from the model
        parameter = getattr(self, pname)
        print('Performing gradient check for parameter %s with size %d' % (pname, np.prod(parameter.shape)))
        #Iterate over each element of the parameter matrix
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            #Save original value to be reset to later
            original_value = parameter[ix]
            #Estimate the gradient as (f(x+h) - f(x-h)) / (2*h)
            parameter[ix] = original_value + h
            gradplus = self.calculate_total_loss([x], [y])
            parameter[ix] = original_value - h
            gradminus = self.calculate_total_loss([x], [y])
            estimated_gradient = (gradplus - gradminus) / (2*h)
            #Reset parameter to original value
            parameter[ix] = original_value
            #Gradient for this element as computed by backprop
            backprop_gradient = bptt_gradients[pidx][ix]
            #Relative error (|x - y| / (|x| + |y|)); two exact zeros agree
            #perfectly, so avoid the 0/0 that would otherwise produce nan
            denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if denominator == 0:
                relative_error = 0.0
            else:
                relative_error = np.abs(backprop_gradient - estimated_gradient) / denominator
            #If the error is too large, then fail the gradient check
            if relative_error > error_threshold:
                print('Gradient check ERROR: parameter=%s ix=%s' % (pname, ix))
                print('+h loss: %f' % gradplus)
                print('-h loss: %f' % gradminus)
                print('Estimated gradient: %f' % estimated_gradient)
                print('Backpropagation gradient: %f' % backprop_gradient)
                print('Relative Error: %f' % relative_error)
                return
            it.iternext()
        print('Gradient check for parameter %s passed' % pname)
        
RNNNumpy.gradient_check = gradient_check

In [127]:
#Check with a smaller vocabulary size: the check evaluates the loss twice per
#parameter element, so a 100-word vocabulary and 10 hidden units keep it short.
#bptt_truncate=1000 far exceeds the 4-step sequence, so nothing is truncated.
grad_check_vocab_size = 100
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000
Gradient check for parameter U passed
Performing gradient check for parameter V with size 1000
Gradient check for parameter V passed
Performing gradient check for parameter W with size 100
Gradient check for parameter W passed


#### SGD Implementation

Two step implementation:  
1. `sgd_step` calculates the gradients and performs updates for one batch  
2. Outer loop that iterates through the training set and adjusts the learning rate

In [129]:
def numpy_sgd_step(self, x, y, learning_rate):
    """Apply one gradient-descent update to U, V and W for a single example.

    x, y: one input sequence and its target sequence
    learning_rate: step size multiplied into each gradient
    """
    #All three gradients are computed before any weight is changed
    grad_U, grad_V, grad_W = self.bptt(x, y)
    for name, gradient in (('U', grad_U), ('V', grad_V), ('W', grad_W)):
        weights = getattr(self, name)
        #Augmented assignment keeps the update in place for ndarrays
        weights -= learning_rate * gradient
        setattr(self, name, weights)
    
RNNNumpy.sgd_step = numpy_sgd_step

In [130]:
def train_with_sdg(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Outer SGD loop.

    params:
    model: object providing calculate_loss(X, y) and sgd_step(x, y, learning_rate)
    X_train: training data set
    y_train: training data targets
    learning_rate: initial learning rate for SGD
    nepoch: number of times to iterate through the complete data set
    evaluate_loss_after: evaluate the loss after this many epochs

    returns: list of (num_examples_seen, loss) tuples, one per loss evaluation
    """
    #Keep track of losses
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        #Optionally evaluate loss
        if epoch % evaluate_loss_after == 0:
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print('%s: Loss after num_examples_seen=%d epoch=%d: %f' % (timestamp, num_examples_seen, epoch, loss))
            #Halve the learning rate if the loss went up since the last evaluation
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print('Setting learning rate to %f' % learning_rate)
            #Flush so progress shows up immediately during long runs
            sys.stdout.flush()
        #For each training example ...
        for i in range(len(y_train)):
            #One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    #Callers assign the result (losses = train_with_sdg(...)), so hand the history back
    return losses

In [131]:
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

10 loops, best of 3: 38.5 ms per loop


In [135]:
#Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = train_with_sdg(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

2018-12-26 16:05:20: Loss after num_examples_seen=0 epoch=0: 7.313384
2018-12-26 16:05:22: Loss after num_examples_seen=100 epoch=1: 7.296847
2018-12-26 16:05:25: Loss after num_examples_seen=200 epoch=2: 6.163766
2018-12-26 16:05:27: Loss after num_examples_seen=300 epoch=3: 5.445909
2018-12-26 16:05:29: Loss after num_examples_seen=400 epoch=4: 5.312528
2018-12-26 16:05:31: Loss after num_examples_seen=500 epoch=5: 5.231207
2018-12-26 16:05:33: Loss after num_examples_seen=600 epoch=6: 5.155260
2018-12-26 16:05:35: Loss after num_examples_seen=700 epoch=7: 5.094630
2018-12-26 16:05:37: Loss after num_examples_seen=800 epoch=8: 5.057536
2018-12-26 16:05:39: Loss after num_examples_seen=900 epoch=9: 5.033748


#### Training the Network with Theano and the GPU

In [140]:
import theano as theano
import theano.tensor as T

In [141]:
class RNNTheano:
    """Recurrent network whose forward pass, loss, gradients and SGD update are
    compiled Theano functions operating on shared variables U, V and W.

    After construction the instance exposes forward_propagation(x), predict(x),
    ce_error(x, y), bptt(x, y) and sgd_step(x, y, learning_rate) as compiled
    functions, plus the Python helpers calculate_total_loss and calculate_loss.
    """
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Initialise random weights, wrap them as shared variables and build the graph.

        word_dim: vocabulary size
        hidden_dim: size of the hidden state
        bptt_truncate: passed to theano.scan as truncate_gradient when the graph is built
        """
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        # Theano: Created shared variables
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))      
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()
    
    def __theano_build__(self):
        """Build the symbolic graph and compile the functions stored on the instance.

        NOTE(review): self.bptt_truncate is read here, at build time; changing the
        attribute afterwards presumably does not affect the already compiled
        functions unless this method is called again - confirm.
        """
        U, V, W = self.U, self.V, self.W
        # Symbolic int vectors for the input and target word indices
        x = T.ivector('x')
        y = T.ivector('y')
        def forward_prop_step(x_t, s_t_prev, U, V, W):
            # One time step: new hidden state and the output distribution
            s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
            o_t = T.nnet.softmax(V.dot(s_t))
            # Only row 0 of the softmax result is returned
            return [o_t[0], s_t]
        # The output has no recurrence (None); the hidden state starts at zeros
        [o,s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True)
        
        prediction = T.argmax(o, axis=1)
        # Total cross-entropy over all time steps of the sequence
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        
        # Gradients
        dU = T.grad(o_error, U)
        dV = T.grad(o_error, V)
        dW = T.grad(o_error, W)
        
        # Assign functions
        self.forward_propagation = theano.function([x], o)
        self.predict = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], o_error)
        self.bptt = theano.function([x, y], [dU, dV, dW])
        
        # SGD: the compiled step returns nothing and updates U, V, W via `updates`
        learning_rate = T.scalar('learning_rate')
        self.sgd_step = theano.function([x,y,learning_rate], [], 
                      updates=[(self.U, self.U - learning_rate * dU),
                              (self.V, self.V - learning_rate * dV),
                              (self.W, self.W - learning_rate * dW)])
    
    def calculate_total_loss(self, X, Y):
        """Return the sum of ce_error over all (x, y) pairs of X and Y."""
        return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
    
    def calculate_loss(self, X, Y):
        """Return the total loss divided by the total number of target words."""
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X,Y)/float(num_words)   


def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
    """Compare a Theano model's gradients against central finite differences.

    Every element of the shared variables U, V and W is perturbed by +h and -h
    in turn; the change in total loss gives a numerical gradient estimate that
    is compared with the matching entry returned by model.bptt. Progress and
    the first failure (if any) are printed; checking stops at the first failure.

    The model's truncation setting is read when its graph is built, so the
    graph is rebuilt with a large bptt_truncate for the check and rebuilt again
    with the original value afterwards.

    model: object exposing shared variables U, V, W (get_value/set_value),
        bptt(x, y), calculate_total_loss(X, Y) and bptt_truncate
    x, y: one input sequence and its target sequence
    h: perturbation size
    error_threshold: largest tolerated relative error
    returns: None
    """
    # List of all parameters we want to check
    model_parameters = ['U', 'V', 'W']
    rebuild = getattr(model, '__theano_build__', None)
    original_truncate = model.bptt_truncate
    # We need to backpropagate all the way to get the correct gradient. Setting
    # the attribute alone does not touch the compiled functions, so rebuild.
    model.bptt_truncate = 1000
    if callable(rebuild):
        rebuild()
    try:
        # Calculate the gradients using backprop
        bptt_gradients = model.bptt(x, y)
        # Gradient check for each parameter
        for pidx, pname in enumerate(model_parameters):
            # Get the actual parameter value from the model, e.g. model.W
            parameter_T = getattr(model, pname)
            parameter = parameter_T.get_value()
            print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
            # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
            it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index
                # Save the original value so we can reset it later
                original_value = parameter[ix]
                # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
                parameter[ix] = original_value + h
                parameter_T.set_value(parameter)
                gradplus = model.calculate_total_loss([x], [y])
                parameter[ix] = original_value - h
                parameter_T.set_value(parameter)
                gradminus = model.calculate_total_loss([x], [y])
                estimated_gradient = (gradplus - gradminus)/(2*h)
                parameter[ix] = original_value
                parameter_T.set_value(parameter)
                # The gradient for this parameter calculated using backpropagation
                backprop_gradient = bptt_gradients[pidx][ix]
                # Relative error (|x - y|/(|x| + |y|)); two exact zeros agree
                # perfectly, so avoid the 0/0 that would otherwise produce nan
                denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
                if denominator == 0:
                    relative_error = 0.0
                else:
                    relative_error = np.abs(backprop_gradient - estimated_gradient)/denominator
                # If the error is too large, fail the gradient check
                if relative_error > error_threshold:
                    print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                    print("+h Loss: %f" % gradplus)
                    print("-h Loss: %f" % gradminus)
                    print("Estimated_gradient: %f" % estimated_gradient)
                    print("Backpropagation gradient: %f" % backprop_gradient)
                    print("Relative Error: %f" % relative_error)
                    return
                it.iternext()
            print("Gradient check for parameter %s passed." % (pname))
    finally:
        # Leave the model with the truncation it was constructed with
        model.bptt_truncate = original_truncate
        if callable(rebuild):
            rebuild()

In [142]:
#Gradient-check the Theano model on a tiny network (5 words, 10 hidden units);
#the largest target index used below is 4, which still fits a 5-word vocabulary.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 50.
Gradient check for parameter U passed.
Performing gradient check for parameter V with size 50.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [143]:
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

10 loops, best of 3: 17.5 ms per loop


***

In [148]:
model = RNNTheano(vocabulary_size, hidden_dim=50)
losses = train_with_sdg(model, X_train, y_train, nepoch=100, evaluate_loss_after=10)

2018-12-26 16:18:58: Loss after num_examples_seen=0 epoch=0: 7.313176
2018-12-26 16:20:33: Loss after num_examples_seen=22230 epoch=10: 5.964751
2018-12-26 16:22:08: Loss after num_examples_seen=44460 epoch=20: 5.781374
2018-12-26 16:23:42: Loss after num_examples_seen=66690 epoch=30: 5.751034
2018-12-26 16:25:16: Loss after num_examples_seen=88920 epoch=40: 5.843756
Setting learning rate to 0.002500
2018-12-26 16:26:50: Loss after num_examples_seen=111150 epoch=50: 5.720955
2018-12-26 16:28:25: Loss after num_examples_seen=133380 epoch=60: 5.755570
Setting learning rate to 0.001250
2018-12-26 16:30:00: Loss after num_examples_seen=155610 epoch=70: 5.741841
2018-12-26 16:31:35: Loss after num_examples_seen=177840 epoch=80: 5.756512
Setting learning rate to 0.000625
2018-12-26 16:33:10: Loss after num_examples_seen=200070 epoch=90: 5.773105
Setting learning rate to 0.000313


Training takes about 15 minutes in total, but the loss had already reached about 5.7 after 50 epochs (roughly 8 minutes).

In [164]:
save_model_parameters_theano('data/trained_model_theano.npz', model)
#load_model_parameters_theano('data/trained_model_theano.npz', model)

Saved model parameters to data/trained_model_theano.npz.


#### Generating Text

In [167]:
def generate_sentence(model, max_length=None):
    """Sample one sentence from the model, word by word.

    model: object whose forward_propagation(indices) returns one row of word
        probabilities per time step (the last row is used for sampling)
    max_length: optional cap on the number of sampled words; None (the
        default) keeps sampling until the end token is drawn
    returns: list of words without the start token and without the end token
    """
    start_index = word2idx[sentence_start_token]
    end_index = word2idx[sentence_end_token]
    unknown_index = word2idx[unknown_token]
    #Begin with the start token
    new_sentence = [start_index]
    #Repeat until we get an end token (or hit the optional length cap)
    while new_sentence[-1] != end_index:
        if max_length is not None and len(new_sentence) - 1 >= max_length:
            break
        next_word_probs = model.forward_propagation(new_sentence)
        #np.random.multinomial checks the probabilities in float64 and raises if
        #they sum to more than 1, which rounding in lower-precision output can
        #cause; cast and renormalise first.
        probs = np.asarray(next_word_probs[-1], dtype=np.float64)
        probs = probs / probs.sum()
        sampled_word = unknown_index
        #We don't want to sample unknown words
        while sampled_word == unknown_index:
            samples = np.random.multinomial(1, probs)
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    #Drop the start token, and the end token when one was actually sampled
    word_indices = new_sentence[1:]
    if word_indices and word_indices[-1] == end_index:
        word_indices = word_indices[:-1]
    sentence_str = [idx2word[x] for x in word_indices]
    return sentence_str

In [168]:
#Print num_sentences generated sentences, each with at least
#sentence_min_length tokens.
num_sentences = 10
sentence_min_length = 7

for i in range(num_sentences):
    sent = []
    #We want long sentences: keep regenerating until the minimum length is met
    while len(sent) < sentence_min_length:
        sent = generate_sentence(model)
    print ' '.join(sent)

asked dorothy saved the tin woodman .
good-bye the scarecrow woodman the north . ''
asked the lion at that woodman watched .
'' said the little happen wife .
asked the tin woodman belonged him close she not both .
asked it crops gorgeous it to head his frighten .
asked dorothy 's dare quite weep and a little wizard .
`` show to you groan ? ''
'' cried same gratefully how grieved surely her place . ''
cried dorothy been their wall munchkin wishes him and many country .


Pros:  
* Learned to end sentences with punctuation  
* Learned some adjective–noun pairs that often appear together, e.g. tin woodman, little wizard  

Cons:  
* Quotation marks are not always closed and do not make contextual sense
* Did not learn commas (*note*: they do show up, but infrequently)

In [170]:
#Same generation loop as before with stricter settings: 20 sentences of at
#least 10 tokens each.
num_sentences = 20
sentence_min_length = 10

for i in range(num_sentences):
    sent = []
    #We want long sentences: keep regenerating until the minimum length is met
    while len(sent) < sentence_min_length:
        sent = generate_sentence(model)
    print ' '.join(sent)

asked the scarecrow tin today water once were farther .
dorothy everything , dear him with a be witch .
'' replied timidly fastened quiet dear seeing step reason picked guardian horses lady dried higher easy words wept straight experience loved sigh reach woman willingly modern child guardian crossed meet beyond killing greatly led princess now growing yes arose suppose now flock doorway gates offered animal returned certainly pulled among if fresh disappeared rubies touched dropped follow welcome discovered ashamed man joyfully bark battle bit fairy gravely will wink simply word maiden yet angrily say forced spectacles wipe forward brook understand mind n't simply allow willing handsome happened answer exclaimed without finally fly indeed thanking winkies munchkin scampered can marched mark bears mind respect yes fight yes blocks pick safely scampered ruler clean manner hear awakened awake ropes exclaimed amuse chattering strangers belonged quick might uncle uncomfortable perhaps show

There is one enormous sentence!