In [1]:
import numpy as np

import random
import datetime

from gensim.models import Word2Vec
from nltk import word_tokenize, sent_tokenize

import tensorflow as tf

import matplotlib.pyplot as plt

  return f(*args, **kwds)


In [2]:
np.random.seed(2018)

In [3]:
txt = open("theLordOfTheRings.txt", 'r')
data = txt.read()
data = data.lower()

In [4]:
sentences = sent_tokenize(data)
words = word_tokenize(data)

In [5]:
sentences_break_down = [word_tokenize(sentence) for sentence in sentences] 

In [6]:
emb_dim = 300 # length of each word vector

In [7]:
model = Word2Vec(sentences_break_down,
                 sg=1,
                 size=emb_dim,
                 window=3,
                 alpha=0.0005,
                 min_count=1,
                 workers=8,
                 batch_words=1500
                 )

In [8]:
vocab_size = len(list(model.wv.vocab))
print(vocab_size)

vocab = list(model.wv.vocab.keys())

9870


In [9]:
model.train(sentences_break_down, total_words=vocab_size, epochs=250)

(37465232, 54575500)

In [10]:
model.wv.most_similar(positive="merry")

[('aragorn', 0.9883159399032593),
 ('boromir', 0.9876432418823242),
 ('gimli', 0.9862103462219238),
 ('legolas', 0.9828562140464783),
 ('pippin', 0.9824807643890381),
 ('strider', 0.971879780292511),
 ('sam', 0.9714571833610535),
 ('haldir', 0.9645156264305115),
 ('tom', 0.9611355662345886),
 ('butterbur', 0.9596114158630371)]

In [11]:
idWordsDict = dict((Id, word) for Id, word in enumerate(vocab))
wordsIdDict = dict((word, Id) for Id, word in enumerate(vocab))
idVectDict = dict((Id, vect.reshape(1,len(vect))) for Id, vect in enumerate(model.wv[vocab]))   # length of the 3 dictionaries: 10883

In [21]:
section_len = 12 # group words as a section
skip = 10
sections = []
labels = []
for i in range(0, len(words)-section_len, skip):
    sections.append(words[i:i+section_len])
    labels.append(words[i+section_len])

In [22]:
nb_sections = len(labels)

history_words = np.zeros((nb_sections, section_len, emb_dim))
current_words = np.zeros((nb_sections, vocab_size))

for i in range(nb_sections):
    history_words[i,:,:] = model.wv[sections[i]]
    current_words[i,wordsIdDict[labels[i]]] = 1

In [23]:
history_words.shape

(21829, 12, 300)

In [24]:
current_words.shape

(21829, 9870)

In [25]:
# LSTM generating network settings
batch_size = 400
hidden_nodes = 2048
nb_iter = 4000    # this is NOT the nb of epochs, it is the nb of batches to run in the training process
lr = 0.002
log_every = 50
checkpoint_every = 100
checkpoint_directory = 'checkpts'

In [26]:
# create checkpoints dir.
if tf.gfile.Exists(checkpoint_directory):
    tf.gfile.DeleteRecursively(checkpoint_directory)
tf.gfile.MkDir(checkpoint_directory)    

In [27]:
# build graph

graph = tf.Graph()
with graph.as_default():
    
    global_step = tf.Variable(0)
    lstm_data = tf.placeholder(tf.float32, [batch_size, section_len, emb_dim])
    lstm_labels = tf.placeholder(tf.float32, [batch_size, vocab_size])
     
        
    ''' 
    1st LSTM layer: parameter initialization
    '''
    
    w1_ii = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_io = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_i = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Forget gate: weights for input, weights for previous output, and bias
    w1_fi = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_fo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_f = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Output gate: weights for input, weights for previous output, and bias
    w1_oi = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_oo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Memory cell: weights for input, weights for previous output, and bias
    w1_ci = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_co = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_c = tf.Variable(tf.zeros([1, hidden_nodes]))  
    
    
    ''' 
    2nd LSTM layer: parameter initialization 
    '''
    
    # for simplicity, the second layer has the hidden state of the same size as that in the first layer
    
    w2_ii = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_io = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_i = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Forget gate: weights for input, weights for previous output, and bias
    w2_fi = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_fo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_f = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Output gate: weights for input, weights for previous output, and bias
    w2_oi = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_oo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Memory cell: weights for input, weights for previous output, and bias
    w2_ci = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_co = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_c = tf.Variable(tf.zeros([1, hidden_nodes])) 
    
    
    ''' 
    LSTM cell
    '''
    
    def lstm(i, o, state, 
             w_fi, w_fo, b_f, 
             w_ii, w_io, b_i, 
             w_oi, w_oo, b_o, 
             w_ci, w_co, b_c):
        
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi)) + tf.sigmoid(tf.matmul(o, w_fo)) + b_f
        input_gate = tf.sigmoid(tf.matmul(i, w_ii)) + tf.sigmoid(tf.matmul(o, w_io)) + b_i
        output_gate = tf.sigmoid(tf.matmul(i, w_oi)) + tf.sigmoid(tf.matmul(o, w_oo)) + b_o
        cell_gate = tf.tanh(tf.matmul(i, w_ci)) + tf.tanh(tf.matmul(o, w_co)) + b_c
        
        
        state = forget_gate * state + input_gate * cell_gate
        output = output_gate * tf.tanh(state)
        
        return state, output
    
    
    '''
    many-to-one model: "the","lord","of","the" -> "rings"
    '''
    
    
    '''
    1st LSTM layer feed forward
    '''
    
    output1 = tf.zeros([batch_size, hidden_nodes])
    state1 = tf.zeros([batch_size, hidden_nodes])
    outputs_all1 = []
    
    for i in range(section_len):
        state1, output1 = lstm(lstm_data[:,i,:], output1, state1,
                               w1_fi, w1_fo, b1_f, 
                               w1_ii, w1_io, b1_i, 
                               w1_oi, w1_oo, b1_o, 
                               w1_ci, w1_co, b1_c) 
        outputs_all1.append(output1)
   
    
    '''
    2nd LSTM layer feed forward
    '''
    
    output2 = tf.zeros([batch_size, hidden_nodes])
    state2 = tf.zeros([batch_size, hidden_nodes])
    
    for i in range(section_len):
        state2, output2 = lstm(outputs_all1[i], output2, state2,
                               w2_fi, w2_fo, b2_f, 
                               w2_ii, w2_io, b2_i, 
                               w2_oi, w2_oo, b2_o, 
                               w2_ci, w2_co, b2_c)
    
    '''
    fully connected layer feed forward
    '''
    
    w = tf.Variable(tf.truncated_normal([hidden_nodes, vocab_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocab_size]))
    predictions = tf.matmul(output2, w) + b
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=lstm_labels, logits=predictions))
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)
    
    '''
    test
    '''
    
    test_data = tf.placeholder(tf.float32, shape=[section_len, emb_dim])
    
    test_output1 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    test_state1 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    
    test_output2 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    test_state2 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    
    #Reset at the beginning of each test
    reset_test_state = tf.group(test_output1.assign(tf.zeros([section_len, hidden_nodes])), 
                                test_state1.assign(tf.zeros([section_len, hidden_nodes])),
                                test_output2.assign(tf.zeros([section_len, hidden_nodes])), 
                                test_state2.assign(tf.zeros([section_len, hidden_nodes])))

    # LSTM
    test_outputs1 = []
    for i in range(section_len):
        test_state1, test_output1 = lstm(tf.reshape(test_data[i,:],[1,emb_dim]), test_output1, test_state1,
                                         w1_fi, w1_fo, b1_f, 
                                         w1_ii, w1_io, b1_i, 
                                         w1_oi, w1_oo, b1_o, 
                                         w1_ci, w1_co, b1_c)
        test_outputs1.append(test_output1)
        
    for i in range(section_len):
        test_state2, test_output2 = lstm(test_outputs1[i], test_output2, test_state2,
                                         w2_fi, w2_fo, b2_f, 
                                         w2_ii, w2_io, b2_i, 
                                         w2_oi, w2_oo, b2_o, 
                                         w2_ci, w2_co, b2_c)
    test_prediction = tf.nn.softmax(tf.matmul(test_output2, w) + b)
    

In [28]:
with tf.Session(graph=graph) as sess:

    tf.global_variables_initializer().run()
    offset = 0
    saver = tf.train.Saver()
    
    hist_len = len(history_words)
    
    losses = []
    #for each training step
    for step in range(nb_iter):
        
        #starts off as 0
        offset = offset % hist_len
        
        #calculate batch data and labels to feed model iteratively
        if offset <= (hist_len - batch_size):
            #first part
            batch_data = history_words[offset: offset + batch_size]
            batch_labels = current_words[offset: offset + batch_size]
            offset += batch_size

        else:
            #last part
            to_add = batch_size - (hist_len - offset)
            batch_data = np.concatenate((history_words[offset: hist_len], history_words[0: to_add]))
            batch_labels = np.concatenate((current_words[offset: hist_len], current_words[0: to_add]))
            offset = to_add
        
        _, training_loss = sess.run([optimizer, loss], feed_dict={lstm_data: batch_data, lstm_labels: batch_labels})
        losses.append(training_loss)
        
        if step % 20 == 0:
            print('training loss at step %d: %.2f (%s)' % (step, training_loss, datetime.datetime.now()))

            if step % checkpoint_every == 0:
                saver.save(sess, checkpoint_directory + '/model', global_step=step)
    print('training loss at step %d: %.2f (%s)' % (nb_iter, training_loss, datetime.datetime.now()))
    plt.plot(np.arange(nb_iter), losses)
    plt.title('Learning Curve')
    plt.show()

training loss at step 0: 10.63 (2018-05-17 17:35:39.299259)
training loss at step 20: 15.52 (2018-05-17 17:41:37.842664)
training loss at step 40: 11.48 (2018-05-17 17:47:30.418043)
training loss at step 60: 7.02 (2018-05-17 17:53:30.926451)
training loss at step 80: 8.02 (2018-05-17 17:59:21.807228)
training loss at step 100: 7.96 (2018-05-17 18:05:12.690009)
training loss at step 120: 7.27 (2018-05-17 18:11:08.375485)
training loss at step 140: 7.42 (2018-05-17 18:16:59.105465)
training loss at step 160: 7.37 (2018-05-17 18:22:50.009780)
training loss at step 180: 7.61 (2018-05-17 18:28:40.582688)
training loss at step 200: 7.31 (2018-05-17 18:34:31.243511)
training loss at step 220: 7.89 (2018-05-17 18:40:27.076803)
training loss at step 240: 7.64 (2018-05-17 18:46:17.610117)
training loss at step 260: 7.45 (2018-05-17 18:52:08.350600)
training loss at step 280: 7.24 (2018-05-17 18:57:58.975270)
training loss at step 300: 7.37 (2018-05-17 19:03:49.298700)
training loss at step 320: 

KeyboardInterrupt: 

In [29]:
# sample a word from the distribution of a predicted probability
# "prediction" is of shape (1, vocab_size)
def sample(prediction):
    r = random.uniform(0,1)
    s = 0
    word_id = 0
    for i in range(len(prediction[0])):
        s += prediction[0,i]
        if s >= r:
            word_id = i
            break
    return word_id

In [30]:
test_start = 'gandalf was reading the riddle written on the stone in elvish and has understood nothing but '

In [31]:
with tf.Session(graph=graph) as sess:
    #init graph, load model
    tf.global_variables_initializer().run()
    model_tf = tf.train.latest_checkpoint(checkpoint_directory)
    saver = tf.train.Saver()
    saver.restore(sess, model_tf)

    #set input variable to generate chars from
    reset_test_state.run() 
    test_generated = test_start
    
    tokenized_start = word_tokenize(test_start)
    test_X = model.wv[tokenized_start[-section_len:]]
    
    generated_len = 200
    counter = 1
    while counter <= generated_len:
        prediction = test_prediction.eval(feed_dict={test_data: test_X})
        word_id = sample(prediction)
        
        next_word = idWordsDict[word_id]
        test_generated += (next_word + ' ')
        
        test_X = np.concatenate([test_X, idVectDict[word_id]], axis=0)[1:]
        
        counter += 1
         
    print(test_generated)

INFO:tensorflow:Restoring parameters from checkpts/model-900
gandalf was reading the riddle written on the stone in elvish and has understood nothing but that the . great into the darkness , it , it do : ' quiet as it more folk . the it . if there at leave are i shall be that fatty comic to not do , the east of he . it will thinner for thatched legs matter sam chap to 's the face , loud walking scent of suddenly in ' help without their river ? as i do not a dwellings of night , but admit , frodo ( do for in it was ' frodo rose for the underhill more was it turned bree is burning , not you , the many now might never to not rose there more story out turned by said they dark to riders for but trouble you do . the said a shut . clear suggestion glad . to evening . the 'oh slabs back . frodo saw ' 'coming said that now right at the hills of nothing awakened is be not his in ' said . it to the fingers it far that pippin will hurried to what stone advice a course rose amazement to not be litt