In [15]:
import csv
import nltk
import numpy as np
import itertools
import timeit

import sys
sys.path.append('../../conversation-analyzer/src')

import RNNNumpy
from RNNNumpy import RNNNumpy
import util.io as mio

In [16]:
import importlib
importlib.reload(RNNNumpy)

TypeError: reload() argument must be module

In [3]:
vocabulary_size = 1000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [4]:
print("Reading conversation and load messages")
messages, senders = mio.parseMessagesFromFile("../../conversation-analyzer/src/resources/unittest/test_plotting.txt")

Reading conversation and load messages


In [6]:
# Read the data and append SENTENCE_START and SENTENCE_END tokens
# Split full message text into sentences
sentences = itertools.chain(*[nltk.sent_tokenize(m.text.lower()) for m in messages])
# Append SENTENCE_START and SENTENCE_END
sentences = ["{} {} {}".format(sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed {} sentences.".format(len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found {} unique words tokens.".format(len(word_freq.items())))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print("Using vocabulary size {}.".format(vocabulary_size))
print("The least frequent word in our vocabulary is '{}' and appeared {} times.".format(vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print("\nExample sentence: '{}'".format(sentences[0]))
print("\nExample sentence after Pre-processing: '{}'".format(tokenized_sentences[0]))

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

Parsed 2445 sentences.
Found 4042 unique words tokens.
Using vocabulary size 1000.
The least frequent word in our vocabulary is 'pursuing' and appeared 2 times.

Example sentence: 'SENTENCE_START afar prairie overhead under last all master had ? SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'afar', 'prairie', 'UNKNOWN_TOKEN', 'under', 'last', 'all', 'master', 'had', '?', 'SENTENCE_END']'


In [7]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print(o.shape)
print(o)

predictions = model.predict(X_train[10])
print(predictions.shape)
print(predictions)

print("Expected Loss for random predictions: {}".format(np.log(vocabulary_size)))
print("Actual loss: {}".format(model.calculate_loss(X_train[:1000], y_train[:1000])))

(7, 1000)
[[ 0.00099956  0.00100781  0.00098837 ...,  0.00099714  0.00099506
   0.0009948 ]
 [ 0.00100628  0.00098631  0.0009982  ...,  0.00100238  0.00101787
   0.00101364]
 [ 0.00098735  0.00101889  0.00097926 ...,  0.00100335  0.00101553
   0.00099698]
 ..., 
 [ 0.00098017  0.00100373  0.0010062  ...,  0.0009999   0.00100029
   0.00097951]
 [ 0.00102032  0.00097848  0.00099343 ...,  0.00100486  0.00100042
   0.00100318]
 [ 0.00099547  0.0009793   0.00100029 ...,  0.00100006  0.00098851
   0.00099608]]
(7,)
[978 229 732 506 995 336 898]
Expected Loss for random predictions: 6.907755278982137
Actual loss: 6.907880034402725


In [8]:
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.
Gradient check for parameter %s passed.
Performing gradient check for parameter V with size 1000.
Gradient check for parameter %s passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter %s passed.


In [19]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.numpy_sdg_step(X_train[10], y_train[10], 0.005)

100 loops, best of 3: 4.12 ms per loop


In [20]:
np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
losses = model.train_with_sgd(X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

2016-08-09 10:19:46: Loss after num_examples_seen=0 epoch=0: 6.9084991829956435
2016-08-09 10:19:47: Loss after num_examples_seen=100 epoch=1: 6.894771144381747
2016-08-09 10:19:48: Loss after num_examples_seen=200 epoch=2: 6.821016469761002
2016-08-09 10:19:48: Loss after num_examples_seen=300 epoch=3: 5.109012649166927
2016-08-09 10:19:49: Loss after num_examples_seen=400 epoch=4: 4.828119925417974
2016-08-09 10:19:49: Loss after num_examples_seen=500 epoch=5: 4.697800108071239
2016-08-09 10:19:50: Loss after num_examples_seen=600 epoch=6: 4.616120968698019
2016-08-09 10:19:51: Loss after num_examples_seen=700 epoch=7: 4.5588475587105854
2016-08-09 10:19:51: Loss after num_examples_seen=800 epoch=8: 4.515708344602935
2016-08-09 10:19:52: Loss after num_examples_seen=900 epoch=9: 4.480891876223427


In [27]:
num_sentences = 10
senten_min_length = 7

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs, _ = model.forward_propagation(new_sentence)
        #print(next_word_probs)
        #print( np.array(next_word_probs).shape)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print(" ".join(sent))

shall go of our , sits captain marriage the and , see corpse of marriage nor perhaps . by i master
globe and have open to , . are ,
2 large all a i d , in or her a journey not is of love ' - as in all are will does families full now remain as , , his the
can under -- rivers you not all , the 2 , . shall sits the the his , - so from those not one o d
art coming , ' foot children o and
poems those shining it you , , the of ,
swear ever of in , young the from pennants of faith by , the his lines ; and see -
each wealth the at , thy forth old in here
president side for forty thou bed you - others shade sons will ' - of , arms , marks proud -
from whose bend in fields and real on thousand way in lost is is face and all
