In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *
import time

import matplotlib.pyplot as plt
%matplotlib inline

import re
import string

In [2]:
import warnings
warnings.filterwarnings('ignore')

***

In [3]:
#Number of distinct tokens the model works with (the unknown token included)
vocabulary_size = 1500
#Placeholder that stands in for every word outside the vocabulary.
#The cells below read UNKNOWN_TOKEN, so it has to be defined under that spelling.
UNKNOWN_TOKEN = 'UNKNOWN_TOKEN'
#Original (misspelled) name kept as an alias so existing references still work
UNKOWN_TOKEN = UNKNOWN_TOKEN
#Markers placed at the beginning and end of every sentence
SENTENCE_START_TOKEN = 'SENTENCE_START'
SENTENCE_END_TOKEN = 'SENTENCE_END'

In [4]:
#Read corpus into text as list of lines.
#Only the lines between the first and the second line starting with '*' are
#kept (the book text); everything before and after is Gutenberg text.
text = []
ast_count = 0 #number of '*' marker lines seen so far
#The with-block closes the file even if reading fails part-way through
with open('wizOfOz.txt', 'r') as f:
    for line in f:
        if line.startswith('*'):
            ast_count += 1
            if ast_count == 2: #closing marker reached, nothing more to keep
                break
        elif ast_count == 1: #Book text
            text.append(line)

#Glue the kept lines into one string, turning line feeds into spaces and
#dropping carriage returns and backslashes
corpus = ''.join(text).replace('\n', ' ').replace('\r', '').replace('\\', '')

#Squeeze every run of whitespace down to a single space
corp = ' '.join(corpus.split())

#Start from the introduction, the second occurrence of the word Introduction
#iter = re.finditer(r'Introduction', corpus)
#intro_indices = [m.start(0) for m in iter]
#corp = corpus[intro_indices[1]:]

In [5]:
#Decode the corpus from UTF-8 and lower-case it, then let NLTK cut it into sentences
lowered_corpus = corp.decode('utf-8').lower()
sentences = nltk.sent_tokenize(lowered_corpus)

In [6]:
#Surround every sentence with the start and end marker tokens.
#Note: this rebinds `sentences`, so running the cell twice wraps them twice.
sentences = [' '.join((SENTENCE_START_TOKEN, sent, SENTENCE_END_TOKEN)) for sent in sentences]

In [7]:
#Report how many sentences the tokenizer produced
print('Parsed %d sentences' % len(sentences))

Parsed 2223 sentences


In [8]:
#Break each sentence into its word and punctuation tokens
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [9]:
#Tally how often every token occurs across all sentences
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print('Found %d unique word tokens' % len(word_freq))

Found 2959 unique word tokens


In [11]:
#Keep the (vocabulary_size - 1) most frequent tokens; the last index is
#reserved for the unknown token. idx2word maps index -> word, word2idx the reverse.
vocab = word_freq.most_common(vocabulary_size - 1)
idx2word = [entry[0] for entry in vocab] + [UNKNOWN_TOKEN]
word2idx = {word: idx for idx, word in enumerate(idx2word)}
print('Using vocabulary size %d' % vocabulary_size)
#vocab is ordered most-common first, so its last entry is the rarest kept word
least_word, least_count = vocab[-1]
print('The least frequent word in our vocabulary is "%s" appearing %d times' % (least_word, least_count))

Using vocabulary size 1500
The least frequent word in our vocabulary is "tremble" appearing 2 times


In [12]:
#Swap every out-of-vocabulary word for the unknown token.
#Slice assignment updates the existing list object in place.
tokenized_sentences[:] = [
    [UNKNOWN_TOKEN if token not in word2idx else token for token in tokens]
    for tokens in tokenized_sentences
]

In [13]:
#Pick one sentence at random and show it before and after pre-processing
temp = np.random.randint(len(sentences))
print('Example sentence: "%s"\n'
      'Example sentence after pre-processing "%s"'
      % (sentences[temp], tokenized_sentences[temp]))

Example sentence: "SENTENCE_START "i am now worse off than when i first met dorothy," he thought. SENTENCE_END"
Example sentence after pre-processing "[u'SENTENCE_START', u'``', u'i', u'am', u'now', u'worse', u'off', u'than', u'when', u'i', u'first', u'met', u'dorothy', u',', u"''", u'he', u'thought', u'.', u'SENTENCE_END']"


In [14]:
#Encode every token as its vocabulary index, then shift by one position:
#inputs drop the last token of each sentence, targets drop the first
encoded_sentences = [[word2idx[w] for w in sent] for sent in tokenized_sentences]
X_train = np.asarray([ids[:-1] for ids in encoded_sentences])
y_train = np.asarray([ids[1:] for ids in encoded_sentences])

In [15]:
#Show one random (input, target) pair, both as words and as vocabulary indices
temp = np.random.randint(len(sentences))
x_example = X_train[temp]
y_example = y_train[temp]
print('x:\n%s\n%s' % (' '.join(idx2word[i] for i in x_example), x_example))
print('y:\n%s\n%s' % (' '.join(idx2word[i] for i in y_example), y_example))

x:
SENTENCE_START `` UNKNOWN_TOKEN , my dear , '' he said .
[3, 7, 1499, 1, 40, 398, 1, 8, 15, 22, 4]
y:
`` UNKNOWN_TOKEN , my dear , '' he said . SENTENCE_END
[7, 1499, 1, 40, 398, 1, 8, 15, 22, 4, 2]


***

In [16]:
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip

In [18]:
#Build a new model sized to the vocabulary.
#GRUTheano is not defined in this section -- presumably it arrives via the star import; verify.
hidden_dim = 128
model = GRUTheano(vocabulary_size, hidden_dim=hidden_dim)

In [None]:
#Training hyper-parameters
LEARNING_RATE = 1e-3
NEPOCH = 5

#Time a single SGD step on the first training pair to gauge the per-step cost
t1 = time.time()
model.sgd_step(X_train[0], y_train[0], LEARNING_RATE)
t2 = time.time()
#time.time() returns seconds, hence the factor of 1000
print('SGD step time: ~%f milliseconds' % ((t2 - t1) * 1000.))

#Run the training loop; train_with_sgd is defined outside this section
train_with_sgd(model, X_train, y_train, LEARNING_RATE, NEPOCH, decay=0.9)

In [19]:
#Location of the stored model parameters
model_path = 'data/trained_model_theano_gru.npz'
#save_model_parameters_theano_gru(model, model_path)
#Rebind `model` to whatever the loader builds from that file
model = load_model_parameters_theano_gru(model_path)

Building model model from data/trained_model_theano_gru.npz with hidden_dim=128 word_dim=1500


***

In [20]:
#Produce text from the loaded model using both vocabulary lookups.
#generate_sentences is defined outside this section; the 25 is presumably the
#number of sentences to generate -- TODO confirm against its definition.
generate_sentences(model, 25, idx2word, word2idx)

were whatever was old dreadful , and forget man over miles a coming can flew of is forward .
silk are a by tree ; he think but he she this is a world to to of fearing the fun of whiskers oz , for the scarecrow beast . ''
`` what of am heels , i us sorry
the luck so i will be climbed looked woodman , and i will road in the do one of heavy you help themselves for i , i can me my back to you .
`` if you must oh to me . ''
again he oats a other scarecrow to toto , how what
said so the shoe was .
who was , being answered , `` country the cellar and she looked a part of oz . ''
`` be fell to .
so at that the so people then it scarecrow , emerald no brains .
`` i who his up to me to why again you empty contented , '' said the where .
but he told you to must seen him green .
`` who spread my again . ''
`` we can not word but you and me there . ''
`` she was make her , and streets is perhaps things , to we while i came .
she asked her i listened , and the asked and the tin woodman was with quit