In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

import re
import string

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#nltk.download('book')

***

In [4]:
#Size of the vocabulary, including one slot reserved for unknown_token
vocabulary_size = 1500
#Placeholder that stands in for every token outside the vocabulary
unknown_token = 'UNKNOWN_TOKEN'
#Markers placed at the beginning and end of every sentence
sentence_start_token = 'SENTENCE_START'
sentence_end_token = 'SENTENCE_END'

In [5]:
#Read corpus into text as list of lines.
#Only the lines between the first two lines that start with '*' are kept;
#everything before and after those markers is Gutenberg text.
text = []
ast_count = 0
#The with-block closes the file even if reading fails part-way through
with open('wizOfOz.txt', 'r') as f:
    for line in f:
        if ast_count == 0: #Gutenberg text
            if line.startswith('*'):
                ast_count += 1
        elif ast_count == 1: #Book text
            if line.startswith('*'): #Gutenberg text
                ast_count += 1
            else:
                text.append(line)
        else:
            #Second marker already seen: nothing after it is kept
            break

#Form corpus by joining list of lines
corpus = ''.join(text)

#Remove line breaks, returns and backslashes
corpus = corpus.replace('\n', ' ')
corpus = corpus.replace('\r', '')
corpus = corpus.replace('\\', '')

#Start from the introduction, the second occurrence of the word Introduction
#(the match iterator is consumed inline so the builtin iter() is not shadowed)
intro_indices = [m.start(0) for m in re.finditer(r'Introduction', corpus)]
if len(intro_indices) < 2:
    raise ValueError('Expected at least two occurrences of "Introduction" in the corpus, found %d'
                     % len(intro_indices))
corp = corpus[intro_indices[1]:]

In [6]:
#Split corpus into sentences
#The text is decoded from UTF-8 and lower-cased first, so every sentence is lower-case
#NOTE(review): .decode() on the result of a text-mode read only works on Python 2 str objects
sentences = nltk.sent_tokenize(corp.decode('utf-8').lower())

In [7]:
#Wrap every sentence as "<SENTENCE_START> sentence <SENTENCE_END>"
sentences = [
    '%s %s %s' % (sentence_start_token, sentence_text, sentence_end_token)
    for sentence_text in sentences
]

In [8]:
print 'Parsed %d sentences' % (len(sentences))

Parsed 2223 sentences


In [9]:
#Break every sentence into a list of word tokens
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [10]:
#Count how often each token occurs over all tokenized sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print('Found %d unique word tokens' % len(word_freq))

Found 2959 unique word tokens


In [11]:
#Keep the (vocabulary_size - 1) most frequent tokens; the last index is reserved for unknown_token
vocab = word_freq.most_common(vocabulary_size - 1)
idx2word = [word for word, _count in vocab] + [unknown_token]
#Inverse lookup: token -> position in idx2word
word2idx = {word: index for index, word in enumerate(idx2word)}
print('Using vocabulary size %d' % vocabulary_size)
least_word, least_count = vocab[-1]
print('The least frequent word in our vocabulary is "%s" appearing %d times' % (least_word, least_count))

Using vocabulary size 1500
The least frequent word in our vocabulary is "hurry" appearing 2 times


In [12]:
#Substitute unknown_token for every token that has no entry in word2idx.
#Slice assignment updates the existing list object in place.
tokenized_sentences[:] = [
    [token if token in word2idx else unknown_token for token in sentence_tokens]
    for sentence_tokens in tokenized_sentences
]

In [13]:
#Show one randomly chosen sentence before and after pre-processing
temp = np.random.randint(0, len(sentences))
example_message = 'Example sentence: "%s"\nExample sentence after pre-processing "%s"' % (
    sentences[temp], tokenized_sentences[temp])
print(example_message)

Example sentence: "SENTENCE_START and if he is the great head, he will be at my mercy; for i will roll this head all about the room until he promises to give us what we desire. SENTENCE_END"
Example sentence after pre-processing "[u'SENTENCE_START', u'and', u'if', u'he', u'is', u'the', u'great', u'head', u',', u'he', u'will', u'be', u'at', u'my', 'UNKNOWN_TOKEN', u';', u'for', u'i', u'will', u'roll', u'this', u'head', u'all', u'about', u'the', u'room', u'until', u'he', u'promises', u'to', u'give', u'us', u'what', u'we', u'desire', u'.', u'SENTENCE_END']"


***

In [14]:
#Create the training data.
#Inputs are every token but the last; targets are the same sentence shifted by one.
#Sentences differ in length, so the containers are built as explicit 1-D object
#arrays (one list of word indices per element) instead of relying on np.asarray
#to infer an object array from ragged nested lists.
X_train = np.empty(len(tokenized_sentences), dtype=object)
y_train = np.empty(len(tokenized_sentences), dtype=object)
for i, sent in enumerate(tokenized_sentences):
    X_train[i] = [word2idx[w] for w in sent[:-1]]
    y_train[i] = [word2idx[w] for w in sent[1:]]

In [15]:
#Print one random training pair, first as words and then as vocabulary indices
temp = np.random.randint(0, len(sentences))
x_example = X_train[temp]
y_example = y_train[temp]
x_words = ' '.join([idx2word[index] for index in x_example])
y_words = ' '.join([idx2word[index] for index in y_example])
print('x:\n%s\n%s' % (x_words, x_example))
print('y:\n%s\n%s' % (y_words, y_example))

x:
SENTENCE_START `` this funny tin man , '' she answered , `` killed the wildcat and saved my life .
[3, 7, 52, 992, 55, 100, 1, 8, 19, 98, 1, 7, 327, 0, 774, 5, 648, 40, 337, 4]
y:
`` this funny tin man , '' she answered , `` killed the wildcat and saved my life . SENTENCE_END
[7, 52, 992, 55, 100, 1, 8, 19, 98, 1, 7, 327, 0, 774, 5, 648, 40, 337, 4, 2]


$s_t = \tanh(Ux_t + Ws_{t-1})$  
$o_t = \mathrm{softmax}(Vs_t)$  
Let our hidden layer have size $H = 100$  
$x_t \in \mathbb{R}^{1500}$  
$o_t \in \mathbb{R}^{1500}$  
$s_t \in \mathbb{R}^{100}$  
$U \in \mathbb{R}^{100 \times 1500}$  
$V \in \mathbb{R}^{1500 \times 100}$  
$W \in \mathbb{R}^{100 \times 100}$

#### Initialization

In [16]:
class RNNNumpy:
    """Recurrent network whose parameters U, V and W are plain NumPy arrays."""

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the layer sizes and draw random initial parameters.

        params:
        word_dim: size of the vocabulary (input and output dimension)
        hidden_dim: size of the hidden state
        bptt_truncate: stored on the instance as self.bptt_truncate
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        #Each matrix is sampled uniformly from [-1/sqrt(n), 1/sqrt(n)];
        #n is word_dim for U and hidden_dim for V and W
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

#### Forward Propagation

In [17]:
def np_softmax(x):
    """Return the softmax of x, computed over all of its elements.

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (and the result from becoming nan) when the scores are large.
    """
    scores = np.asarray(x)
    if scores.size == 0:
        #Nothing to normalise; return an empty float array
        return np.exp(scores)
    exp_scores = np.exp(scores - scores.max())
    return exp_scores / exp_scores.sum()

In [18]:
def forward_propagation(self, x):
    """Run the network over the word-index sequence x.

    returns:
    [o, s] where o has one row of output probabilities per time step and s has
    one row of hidden state per time step plus a final all-zero row that serves
    as the initial hidden state.
    """
    num_steps = len(x)
    #The extra last row stays zero; at step 0, index step - 1 == -1 reads it as the initial state
    hidden_states = np.zeros((num_steps + 1, self.hidden_dim))
    outputs = np.zeros((num_steps, self.word_dim))
    for step, word_index in enumerate(x):
        #Column word_index of U is what U times the one-hot input vector would give
        previous_state = hidden_states[step - 1]
        hidden_states[step] = np.tanh(self.U[:, word_index] + self.W.dot(previous_state))
        outputs[step] = np_softmax(self.V.dot(hidden_states[step]))
    return [outputs, hidden_states]

RNNNumpy.forward_propagation = forward_propagation

In [19]:
def predict(self, x):
    """Return, for every time step of x, the index of the most probable word."""
    probabilities, _hidden_states = self.forward_propagation(x)
    most_likely = np.argmax(probabilities, axis=1)
    return most_likely

RNNNumpy.predict = predict

In [20]:
#Example output
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print o.shape
print o

(50, 1500)
[[ 0.00067586  0.00067626  0.00067335 ...,  0.00066474  0.0006725
   0.00066804]
 [ 0.0006685   0.00066554  0.00066936 ...,  0.00066188  0.0006709
   0.00067597]
 [ 0.00066997  0.00066798  0.00065605 ...,  0.00067054  0.00065682
   0.00066726]
 ..., 
 [ 0.00067106  0.00066196  0.00066688 ...,  0.00065991  0.00066763
   0.00066349]
 [ 0.00066872  0.00065826  0.00067526 ...,  0.000666    0.00067557
   0.00066162]
 [ 0.00068957  0.00066697  0.00066813 ...,  0.00066418  0.00068742
   0.00066253]]


In [21]:
predictions = model.predict(X_train[10])
print predictions.shape
print predictions

(50,)
[ 284  786   87  399   63  924   95  999  577 1035   23  110  721 1109  212
 1228  741 1260 1273  733  875  853  104  345  683 1251  189 1273  938  107
  870 1219  332  569 1379 1415  935  741 1480 1485  741  581  175  938  605
  623 1445  746  569  144]


In [22]:
' '.join([idx2word[i] for i in predictions])

u'started paved did ground could visit now fewer edge rushed as heart quick pure along bend grandfather polished worth careful shut sweet after look known doorway friends worth save your sit flowing although whiskers rising underneath gulf grandfather building condition grandfather rusted some save against bigger ruined kindly whiskers walked'

#### Calculating Loss

In [23]:
def calculate_total_loss(self, x, y):
    """Sum the cross-entropy loss over every word of every sentence.

    params:
    x: sequence of input sentences (word indices)
    y: sequence of target sentences (word indices), aligned with x
    """
    total_loss = 0
    for sentence_idx in range(len(y)):
        targets = y[sentence_idx]
        outputs, _hidden_states = self.forward_propagation(x[sentence_idx])
        #For each time step, take the probability the model gave to the true next word
        target_probs = outputs[np.arange(len(targets)), targets]
        total_loss -= np.sum(np.log(target_probs))
    return total_loss

def calculate_loss(self, x, y):
    """Return the average cross-entropy loss per target word.

    The total loss is divided by the number of words over all target sentences
    (not by the number of sentences).
    """
    #Built-in sum: np.sum on a generator is deprecated in NumPy
    num_words = sum(len(y_i) for y_i in y)
    #float() keeps this a true division even if the total loss is an integer
    return self.calculate_total_loss(x, y) / float(num_words)

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [24]:
#Loss for 1000 examples
print 'Expected loss for random predictions: %f' % np.log(vocabulary_size)
print 'Actual loss: %f' % model.calculate_loss(X_train[:1000], y_train[:1000])

Expected loss for random predictions: 7.313220
Actual loss: 7.313423


#### Training the RNN through SGD and BPTT

In [25]:
def bptt(self, x, y):
    """Compute the loss gradients for one sentence with backpropagation through time.

    params:
    x: input word indices
    y: target word indices
    returns:
    [dLdU, dLdV, dLdW], each with the shape of the matching parameter
    """
    num_steps = len(y)
    outputs, states = self.forward_propagation(x)
    grad_U = np.zeros(self.U.shape)
    grad_V = np.zeros(self.V.shape)
    grad_W = np.zeros(self.W.shape)
    #Output error is (o - one_hot(y)); the subtraction is done in place on the forward outputs
    output_error = outputs
    output_error[np.arange(num_steps), y] -= 1
    #Walk over the outputs from the last time step to the first
    for t in reversed(range(num_steps)):
        grad_V += np.outer(output_error[t], states[t].T)
        #Error arriving at the hidden layer of step t
        delta = self.V.T.dot(output_error[t]) * (1 - (states[t] ** 2))
        #Propagate it back for at most self.bptt_truncate earlier steps
        earliest_step = max(0, t - self.bptt_truncate)
        for step in range(t, earliest_step - 1, -1):
            #At step 0, index step - 1 == -1 selects the all-zero initial state row
            grad_W += np.outer(delta, states[step - 1])
            grad_U[:, x[step]] += delta
            delta = self.W.T.dot(delta) * (1 - states[step - 1] ** 2)
    return [grad_U, grad_V, grad_W]

RNNNumpy.bptt = bptt

#### Gradient Checking

In [26]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    """Compare the backpropagation gradients with finite-difference estimates.

    Every element of U, V and W is perturbed by +h and -h, and the central
    difference of the total loss is compared with the value returned by
    self.bptt. The check prints a report and stops at the first element whose
    relative error exceeds error_threshold.

    params:
    x: input word indices of a single sentence
    y: target word indices of the same sentence
    h: perturbation used for the finite difference
    error_threshold: largest relative error that still counts as a pass
    """
    #Gradients from backpropagation, in the same order as model_parameters.
    #Everything is computed on self rather than on a global model, so the check
    #always tests the instance it was called on.
    bptt_gradients = self.bptt(x, y)
    #Parameters to check
    model_parameters = ['U', 'V', 'W']
    for pidx, pname in enumerate(model_parameters):
        #Get actual parameter value from the model
        parameter = operator.attrgetter(pname)(self)
        print('Performing gradient check for parameter %s with size %d' % (pname, np.prod(parameter.shape)))
        #Iterate over each element of the parameter matrix
        for ix in np.ndindex(*parameter.shape):
            #Save original value to be reset to later
            original_value = parameter[ix]
            try:
                #Estimate the gradients as (f(x+h) - f(x-h)) / (2*h)
                parameter[ix] = original_value + h
                gradplus = self.calculate_total_loss([x], [y])
                parameter[ix] = original_value - h
                gradminus = self.calculate_total_loss([x], [y])
            finally:
                #Reset parameter to original value, even if the loss evaluation failed
                parameter[ix] = original_value
            estimated_gradient = (gradplus - gradminus) / (2*h)
            #Calculate the gradient for this parameter with backprop
            backprop_gradient = bptt_gradients[pidx][ix]
            #Calculate the relative error (|x - y| / (|x| + |y|))
            denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if denominator > 0:
                relative_error = np.abs(backprop_gradient - estimated_gradient) / denominator
            else:
                #Both gradients are exactly zero, so they agree (avoids 0/0 = nan)
                relative_error = 0.0
            #If the error is too large, then fail the gradient check
            if relative_error > error_threshold:
                print('Gradient check ERROR: parameter=%s ix=%s' % (pname, ix))
                print('+h loss: %f' % gradplus)
                print('-h loss: %f' % gradminus)
                print('Estimated gradient: %f' % estimated_gradient)
                print('Backpropagation gradient: %f' % backprop_gradient)
                print('Relative Error: %f' % relative_error)
                return
        print('Gradient check for parameter %s passed' % pname)
        
RNNNumpy.gradient_check = gradient_check

In [27]:
#Check with smaller vocabulary size
grad_check_vocab_size = 100
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000
Gradient check for parameter U passed
Performing gradient check for parameter V with size 1000
Gradient check for parameter V passed
Performing gradient check for parameter W with size 100
Gradient check for parameter W passed


#### SGD Implementation

Two step implementation:  
1. `sgd_step` calculates the gradients and performs the parameter updates for one training example  
2. Outer loop that iterates through the training set and adjusts the learning rate

In [28]:
def numpy_sgd_step(self, x, y, learning_rate):
    """Perform one SGD update of U, V and W from a single training example.

    params:
    x: input word indices
    y: target word indices
    learning_rate: step size applied to each gradient
    """
    grad_U, grad_V, grad_W = self.bptt(x, y)
    #Move every parameter against its gradient, updating the stored value in place
    for name, gradient in (('U', grad_U), ('V', grad_V), ('W', grad_W)):
        weights = getattr(self, name)
        weights -= learning_rate * gradient
        setattr(self, name, weights)
    
RNNNumpy.sgd_step = numpy_sgd_step

In [29]:
def train_with_sdg(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Outer SGD Loop
    params:
    model: object providing calculate_loss(X, y) and sgd_step(x, y, learning_rate)
    X_train: training data set
    y_train: training data targets
    learning_rate: initial learning rate for SGD
    nepoch: number of times to iterate through the complete data set
    evaluate_loss_after: evaluate the loss after this many epochs
    returns:
    list of (num_examples_seen, loss) tuples, one per loss evaluation"""
    #Keep track of losses
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        #Optionally evaluate loss
        if epoch % evaluate_loss_after == 0:
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print('%s: Loss after num_examples_seen=%d epoch=%d: %f' % (timestamp, num_examples_seen, epoch, loss))
            #Halve the learning rate if the loss went up since the previous evaluation
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print('Setting learning rate to %f' % learning_rate)
            sys.stdout.flush()
        #For each training example ...
        for i in range(len(y_train)):
            #One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    #Callers assign the result to `losses`, so hand the recorded history back
    return losses

In [30]:
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

10 loops, best of 3: 37.8 ms per loop


In [31]:
#Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = train_with_sdg(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

2018-12-27 10:50:04: Loss after num_examples_seen=0 epoch=0: 7.313290
2018-12-27 10:50:06: Loss after num_examples_seen=100 epoch=1: 7.296705
2018-12-27 10:50:09: Loss after num_examples_seen=200 epoch=2: 7.251789
2018-12-27 10:50:11: Loss after num_examples_seen=300 epoch=3: 5.484497
2018-12-27 10:50:13: Loss after num_examples_seen=400 epoch=4: 5.328043
2018-12-27 10:50:15: Loss after num_examples_seen=500 epoch=5: 5.243652
2018-12-27 10:50:19: Loss after num_examples_seen=600 epoch=6: 5.170964
2018-12-27 10:50:22: Loss after num_examples_seen=700 epoch=7: 5.106834
2018-12-27 10:50:26: Loss after num_examples_seen=800 epoch=8: 5.065418
2018-12-27 10:50:29: Loss after num_examples_seen=900 epoch=9: 5.038469


#### Training the Network with Theano and the GPU

In [32]:
import theano as theano
import theano.tensor as T

In [33]:
class RNNTheano:
    """Recurrent network whose parameters are Theano shared variables.

    The constructor builds the Theano graph once and stores the compiled
    functions on the instance: forward_propagation, predict, ce_error, bptt
    and sgd_step.
    """
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Initialise the parameters and compile the Theano functions.

        word_dim: size of the vocabulary (input and output dimension)
        hidden_dim: size of the hidden state
        bptt_truncate: passed to theano.scan as truncate_gradient when the graph is built
        """
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        # (uniform in [-1/sqrt(n), 1/sqrt(n)], n = word_dim for U and hidden_dim for V and W)
        U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        # Theano: Created shared variables, cast to Theano's configured float type
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))      
        # We store the Theano graph here
        # NOTE(review): nothing in the visible code ever writes to this dict;
        # the compiled functions are stored as instance attributes instead
        self.theano = {}
        self.__theano_build__()
    
    def __theano_build__(self):
        """Build the symbolic graph and compile the functions stored on the instance.

        self.bptt_truncate is read here, while the graph is constructed.
        """
        U, V, W = self.U, self.V, self.W
        # Symbolic inputs: integer vectors of input and target word indices
        x = T.ivector('x')
        y = T.ivector('y')
        def forward_prop_step(x_t, s_t_prev, U, V, W):
            """One time step: new hidden state and output probabilities for word x_t."""
            s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
            o_t = T.nnet.softmax(V.dot(s_t))
            # presumably softmax returns a one-row matrix here, hence the [0] -- confirm against the Theano docs
            return [o_t[0], s_t]
        # Apply forward_prop_step to every element of x; the hidden state starts at zeros
        [o,s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True)
        
        # Most probable word per time step, and the summed cross-entropy against the targets
        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
        
        # Gradients
        dU = T.grad(o_error, U)
        dV = T.grad(o_error, V)
        dW = T.grad(o_error, W)
        
        # Assign functions
        self.forward_propagation = theano.function([x], o)
        self.predict = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], o_error)
        self.bptt = theano.function([x, y], [dU, dV, dW])
        
        # SGD
        # sgd_step returns nothing; its effect is the update of the three shared parameters
        learning_rate = T.scalar('learning_rate')
        self.sgd_step = theano.function([x,y,learning_rate], [], 
                      updates=[(self.U, self.U - learning_rate * dU),
                              (self.V, self.V - learning_rate * dV),
                              (self.W, self.W - learning_rate * dW)])
    
    def calculate_total_loss(self, X, Y):
        """Return the sum of ce_error over all (x, y) sentence pairs in X and Y."""
        return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
    
    def calculate_loss(self, X, Y):
        """Return the total loss divided by the number of target words in Y."""
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X,Y)/float(num_words)   


def gradient_check_theano(model, x, y, h=0.001, error_threshold=0.01):
    """Compare the gradients returned by model.bptt with finite-difference estimates.

    Every element of the shared parameters U, V and W is perturbed by +h and
    -h, and the central difference of model.calculate_total_loss is compared
    with the backpropagation gradient. The check prints a report and stops at
    the first element whose relative error exceeds error_threshold.

    model: object with shared-variable attributes U, V, W (get_value/set_value),
           plus bptt(x, y) and calculate_total_loss(X, Y)
    x: input word indices of a single sentence
    y: target word indices of the same sentence
    h: perturbation used for the finite difference
    error_threshold: largest relative error that still counts as a pass
    """
    # Overwrite the bptt attribute. We need to backpropagate all the way to get the correct gradient
    # NOTE(review): RNNTheano reads bptt_truncate only while building its graph in the
    # constructor, so this assignment does not change the already compiled model.bptt.
    # Construct the model with a large bptt_truncate if an untruncated check is needed.
    model.bptt_truncate = 1000
    # Calculate the gradients using backprop
    bptt_gradients = model.bptt(x, y)
    # List of all parameters we want to check
    model_parameters = ['U', 'V', 'W']
    # Gradient check for each parameter
    for pidx, pname in enumerate(model_parameters):
        # Get the actual parameter value from the model, e.g. model.W
        parameter_T = operator.attrgetter(pname)(model)
        parameter = parameter_T.get_value()
        print("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        # Iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
        for ix in np.ndindex(*parameter.shape):
            # Save the original value so we can reset it later
            original_value = parameter[ix]
            try:
                # Estimate the gradient using (f(x+h) - f(x-h))/(2*h)
                parameter[ix] = original_value + h
                parameter_T.set_value(parameter)
                gradplus = model.calculate_total_loss([x],[y])
                parameter[ix] = original_value - h
                parameter_T.set_value(parameter)
                gradminus = model.calculate_total_loss([x],[y])
            finally:
                # Always put the original value back, even if the loss evaluation failed
                parameter[ix] = original_value
                parameter_T.set_value(parameter)
            estimated_gradient = (gradplus - gradminus)/(2*h)
            # The gradient for this parameter calculated using backpropagation
            backprop_gradient = bptt_gradients[pidx][ix]
            # Calculate the relative error: (|x - y|/(|x| + |y|))
            denominator = np.abs(backprop_gradient) + np.abs(estimated_gradient)
            if denominator > 0:
                relative_error = np.abs(backprop_gradient - estimated_gradient)/denominator
            else:
                # Both gradients are exactly zero, so they agree (avoids 0/0 = nan)
                relative_error = 0.0
            # If the error is too large fail the gradient check
            if relative_error > error_threshold:
                print("Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print("+h Loss: %f" % gradplus)
                print("-h Loss: %f" % gradminus)
                print("Estimated_gradient: %f" % estimated_gradient)
                print("Backpropagation gradient: %f" % backprop_gradient)
                print("Relative Error: %f" % relative_error)
                return
        print("Gradient check for parameter %s passed." % (pname))

In [34]:
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 50.
Gradient check for parameter U passed.
Performing gradient check for parameter V with size 50.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [35]:
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

100 loops, best of 3: 15.8 ms per loop


***

In [36]:
model = RNNTheano(vocabulary_size, hidden_dim=100)
losses = train_with_sdg(model, X_train, y_train, nepoch=100, evaluate_loss_after=10)

2018-12-27 10:51:16: Loss after num_examples_seen=0 epoch=0: 7.312975
2018-12-27 10:55:24: Loss after num_examples_seen=22230 epoch=10: 6.605725
2018-12-27 10:59:37: Loss after num_examples_seen=44460 epoch=20: 5.902666
2018-12-27 11:03:38: Loss after num_examples_seen=66690 epoch=30: 5.824765
2018-12-27 11:07:31: Loss after num_examples_seen=88920 epoch=40: 6.170466
Setting learning rate to 0.002500
2018-12-27 11:11:14: Loss after num_examples_seen=111150 epoch=50: 5.699436
2018-12-27 11:14:59: Loss after num_examples_seen=133380 epoch=60: 5.737881
Setting learning rate to 0.001250
2018-12-27 11:18:45: Loss after num_examples_seen=155610 epoch=70: 5.719997
2018-12-27 11:22:26: Loss after num_examples_seen=177840 epoch=80: 5.780091
Setting learning rate to 0.000625
2018-12-27 11:26:11: Loss after num_examples_seen=200070 epoch=90: 5.824584
Setting learning rate to 0.000313


With `hidden_dim=50`: training takes about 15 minutes in total, but the loss already reached 5.7 after 50 epochs (about 8 minutes).

In [37]:
save_model_parameters_theano('data/trained_model_theano.npz', model)
#load_model_parameters_theano('data/trained_model_theano.npz', model)

Saved model parameters to data/trained_model_theano.npz.


#### Generating Text

In [38]:
def generate_sentence(model):
    """Sample one sentence from the model, word by word.

    Generation starts from the sentence start token and continues until the
    sentence end token is sampled. The unknown token is never emitted: a draw
    that lands on it is repeated.

    model: object whose forward_propagation(word_indices) returns the output
           probabilities per time step, either directly (RNNTheano) or as the
           first item of a list/tuple (RNNNumpy returns [o, s])
    returns: list of words, without the start and end tokens
    """
    start_idx = word2idx[sentence_start_token]
    end_idx = word2idx[sentence_end_token]
    unknown_idx = word2idx[unknown_token]
    #Begin with the start token
    new_sentence = [start_idx]
    #Repeat until we get an end token
    while new_sentence[-1] != end_idx:
        output = model.forward_propagation(new_sentence)
        if isinstance(output, (list, tuple)):
            #RNNNumpy-style result [o, s]: keep only the output probabilities
            output = output[0]
        #Use float64 and renormalise: probabilities computed in float32 can sum to
        #slightly more than 1, which np.random.multinomial rejects
        next_word_probs = np.asarray(output[-1], dtype=np.float64)
        next_word_probs = next_word_probs / next_word_probs.sum()
        sampled_word = unknown_idx
        #We don't want to sample unknown words
        while sampled_word == unknown_idx:
            samples = np.random.multinomial(1, next_word_probs)
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    #Drop the start and end tokens and map the indices back to words
    sentence_str = [idx2word[word_idx] for word_idx in new_sentence[1:-1]]
    return sentence_str

In [39]:
num_sentences = 10
sentence_min_length = 7

for i in range(num_sentences):
    #Keep sampling until the sentence has at least sentence_min_length tokens
    sent = generate_sentence(model)
    while len(sent) < sentence_min_length:
        sent = generate_sentence(model)
    print(' '.join(sent))

a princess china little also ? ''
she asked , his thick ! ''
she curiosity child her head ? ''
cried the girl like of do roof .
she asked , i sorrow out ! ''
asked dorothy eagerly to kansas . ''
center back to you thing back ! ''
asked dorothy lost saw dorothy witch his better .
hill , in you directions ! ''
asked the swim lion give her draw .


Pros:  
* Learned to end sentences with punctuation  
* Learned some adjectives that often appear together with a noun, e.g. "tin woodman", "little wizard"  

Cons:  
* Quotation marks are not always closed and do not make contextual sense  
* Nonsense sentences, i.e. there is no context to the word choices

In [40]:
num_sentences = 20
sentence_min_length = 10

for i in range(num_sentences):
    #Keep sampling until the sentence has at least sentence_min_length tokens
    sent = generate_sentence(model)
    while len(sent) < sentence_min_length:
        sent = generate_sentence(model)
    print(' '.join(sent))

`` asked not found unhappy green as imagined . ''
said the farmer in rusted feet perhaps form from the emerald city . ''
she asked it draw him head and broad it . ''
asked the lion prairie do and a heart . ''
tin mouse so glasses in his head green me to it .
asked the scarecrow lion woman do the wall . ''
cried dorothy will her our at do good way . ''
asked dorothy on in had cupboard before ever sadly .
asked the child woodman chop idle it happy . ''
asked the milkmaid thoughtfully , with the world of the basket .
its struck before own were as at the others side . ''
asked the scarecrow in the cat their clothes . ''
child the lion to do you would more . ''
she said dorothy knows there horses to her sorceress .
she same boq from bad still hair so even .
good-bye i angrily the clouds trick have evidently her way .
cried the little in the guarded very with him glinda .
she asked me of over my head it skins . ''
asked the farmer anxiously fill you reached and journey upon it leave together