In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *
import time

import matplotlib.pyplot as plt
%matplotlib inline

import re
import string

In [2]:
import warnings
#Install a blanket 'ignore' filter: no warning category is reported from here on,
#so deprecation notices raised by later cells will not be shown
warnings.filterwarnings('ignore')

***

In [3]:
#Vocabulary configuration and the special marker tokens used throughout the notebook
vocabulary_size = 1500 #Total vocabulary size, counting the unknown-word token
UNKNOWN_TOKEN = 'UNKNOWN_TOKEN' #Stand-in for every word that is outside the vocabulary
UNKOWN_TOKEN = UNKNOWN_TOKEN #Original misspelled name, kept as an alias for backward compatibility
SENTENCE_START_TOKEN = 'SENTENCE_START'
SENTENCE_END_TOKEN = 'SENTENCE_END'

In [4]:
#Read the corpus into text as a list of lines, keeping only the book text that
#sits between the two marker lines (lines whose first character is '*')
text = []
ast_count = 0 #Number of '*' marker lines seen so far
#'with' closes the file even if reading fails part-way through
with open('wizOfOz.txt', 'r') as f:
    for line in f:
        if line.startswith('*'):
            ast_count += 1
            if ast_count == 2: #Closing marker: nothing after it is kept
                break
        elif ast_count == 1: #Book text
            text.append(line)

#Form corpus by joining list of lines
corpus = ''.join(text)

#Remove line breaks, carriage returns and backslashes
corpus = corpus.replace('\n', ' ')
corpus = corpus.replace('\r', '')
corpus = corpus.replace('\\', '')

#Collapse every run of whitespace into a single space
corp = ' '.join(corpus.split())

#Start from the introduction, the second occurrence of the word Introduction
#iter = re.finditer(r'Introduction', corpus)
#intro_indices = [m.start(0) for m in iter]
#corp = corpus[intro_indices[1]:]

In [5]:
#Split the corpus into sentences
#The text is decoded from UTF-8 and lower-cased before it is handed to the sentence tokenizer
corp_lower = corp.decode('utf-8').lower()
sentences = nltk.sent_tokenize(corp_lower)

In [6]:
#Surround every sentence with the start and end marker tokens, separated by single spaces
sentences = [' '.join((SENTENCE_START_TOKEN, x, SENTENCE_END_TOKEN)) for x in sentences]

In [7]:
#Report how many sentences were extracted from the corpus
print('Parsed %d sentences' % len(sentences))

Parsed 2223 sentences


In [8]:
#Break each sentence into a list of word tokens, one list per sentence
tokenized_sentences = []
for sent in sentences:
    tokenized_sentences.append(nltk.word_tokenize(sent))

In [9]:
#Tally how often every token occurs across all tokenized sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print('Found %d unique word tokens' % len(word_freq))

Found 2959 unique word tokens


In [10]:
#Keep the (vocabulary_size - 1) most frequent tokens, reserve the last slot for
#the unknown-word token, and build the index->word list and word->index mapping
vocab = word_freq.most_common(vocabulary_size-1)
idx2word = [word for word, count in vocab]
idx2word.append(UNKNOWN_TOKEN)
word2idx = {word: idx for idx, word in enumerate(idx2word)}

#vocab is ordered by descending frequency, so its last entry is the rarest kept word
least_word, least_count = vocab[-1]
print('Using vocabulary size %d' % vocabulary_size)
print('The least frequent word in our vocabulary is "%s" appearing %d times' % (least_word, least_count))

Using vocabulary size 1500
The least frequent word in our vocabulary is "tremble" appearing 2 times


In [11]:
#Swap every out-of-vocabulary word for the unknown token, updating the list in place
for pos in range(len(tokenized_sentences)):
    known_only = []
    for token in tokenized_sentences[pos]:
        known_only.append(token if token in word2idx else UNKNOWN_TOKEN)
    tokenized_sentences[pos] = known_only

In [12]:
#Pick one sentence at random and show it before and after pre-processing
temp = np.random.randint(len(sentences))
print('Example sentence: "%s"\nExample sentence after pre-processing "%s"' % (sentences[temp], tokenized_sentences[temp]))

Example sentence: "SENTENCE_START if you, who are great and terrible, cannot kill her yourself, how do you expect me to do it?" SENTENCE_END"
Example sentence after pre-processing "[u'SENTENCE_START', u'if', u'you', u',', u'who', u'are', u'great', u'and', u'terrible', u',', u'can', u'not', u'kill', u'her', u'yourself', u',', u'how', u'do', u'you', u'expect', u'me', u'to', u'do', u'it', u'?', u"''", u'SENTENCE_END']"


In [13]:
#Create the training data
#X holds each sentence without its last token; y holds the same sentence without
#its first token, so y[t] is the token that follows X[t]
#Sentences differ in length, so the rows are ragged: dtype=object is requested
#explicitly because newer NumPy releases refuse to build ragged arrays implicitly
X_train = np.asarray([[word2idx[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
y_train = np.asarray([[word2idx[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)

In [14]:
#Training data example: print one random input/target pair, first as words and then as indices
temp = np.random.randint(len(sentences))
x_example = X_train[temp]
y_example = y_train[temp]
x_words = ' '.join(idx2word[i] for i in x_example)
y_words = ' '.join(idx2word[i] for i in y_example)
print('x:\n%s\n%s' % (x_words, x_example))
print('y:\n%s\n%s' % (y_words, y_example))

x:
SENTENCE_START aunt em dropped her work and came to the door .
[3, 219, 250, 812, 17, 287, 5, 70, 6, 0, 242, 4]
y:
aunt em dropped her work and came to the door . SENTENCE_END
[219, 250, 812, 17, 287, 5, 70, 6, 0, 242, 4, 2]


***

In [15]:
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip

In [18]:
#Instantiate the model with the vocabulary size and hidden_dim=128
#NOTE(review): GRUTheano is not defined in this notebook - presumably it comes from the utils star import; confirm
model = GRUTheano(vocabulary_size, hidden_dim=128)

In [None]:
#Training hyper-parameters
LEARNING_RATE = 1e-3
NEPOCH = 5

#Time a single SGD step on the first training example to gauge the cost of training
t1 = time.time()
model.sgd_step(X_train[0], y_train[0], LEARNING_RATE)
t2 = time.time()
print('SGD step time: ~%f milliseconds' % ((t2 - t1) * 1000.))

#Run the full training loop
#NOTE(review): train_with_sgd is not defined in this notebook - presumably it comes from utils; confirm what decay=0.9 controls
train_with_sgd(model, X_train, y_train, LEARNING_RATE, NEPOCH, decay=0.9)

In [16]:
#Saving is left disabled; uncomment the next line to write the current model to the .npz file
#save_model_parameters_theano_gru(model, 'data/trained_model_theano_gru.npz')
#Rebinds model to whatever the loader returns for the .npz file, replacing any model built or trained above
model = load_model_parameters_theano_gru('data/trained_model_theano_gru.npz')

Building model model from data/trained_model_theano_gru.npz with hidden_dim=128 word_dim=1500


***

In [17]:
#Sample text from the loaded model using the two vocabulary lookups
#NOTE(review): the second argument (25) is presumably the number of sentences to generate - confirm against utils
generate_sentences(model, 25, idx2word, word2idx)

too asleep , day the tall a to bed to there to be of throne , and things dorothy , other looked know finally .
`` great a taken wondering , i mind all , be how baum little wall with any while you tin then . ''
`` said , replied i it was , and very started to cap is . ''
`` do is that balloon ! ''
`` i by that had should .
`` that oz is a will will pleased of my come city , i they would a are joints she as have as been everything . ''
the once was never as she after the `` woman n't be you branches ? ''
on the lion stopped he part that bad she was only country to well as oz could fast wonderful one point the dorothy much would ; to his did they went to made her her when the forest .
`` i and 's continued , i feet to be mice , little of lion . ''
`` he saved us for the when hill the body no it .
`` i was help down of with it , my will him . ''
back the i can gloomy , do .
to his one `` my which ? ''
so the middle was him in the farther hil-lo to had been back , and courage to you a .
`` 

Sentences are not coherent. I will try to train for longer on AWS.