In [1]:
import numpy as np

import random
import datetime

from gensim.models import Word2Vec
from nltk import word_tokenize, sent_tokenize

import tensorflow as tf

  return f(*args, **kwds)


In [69]:
np.random.seed(2018)

In [4]:
txt = open("theLordOfTheRings.txt", 'r')
data = txt.read()
data = data.lower()

In [5]:
sentences = sent_tokenize(data)
words = word_tokenize(data)

In [6]:
sentences_break_down = [word_tokenize(sentence) for sentence in sentences] 

In [7]:
emb_dim = 300 # length of each word vector

In [8]:
model = Word2Vec(sentences_break_down,
                 sg=1,
                 size=emb_dim,
                 window=3,
                 alpha=0.0005,
                 min_count=1,
                 workers=8,
                 batch_words=1500
                 )

In [10]:
vocab_size = len(list(model.wv.vocab))
print(vocab_size)

vocab = list(model.wv.vocab.keys())

9870


In [11]:
model.train(sentences_break_down, total_words=vocab_size, epochs=250)

(37462730, 54575500)

In [13]:
model.wv.most_similar(positive="merry")

[('aragorn', 0.9885941743850708),
 ('boromir', 0.9881454706192017),
 ('gimli', 0.9868615865707397),
 ('legolas', 0.9833399653434753),
 ('pippin', 0.982357919216156),
 ('sam', 0.9746546745300293),
 ('strider', 0.9733209609985352),
 ('haldir', 0.9658626317977905),
 ('tom', 0.9609816074371338),
 ('butterbur', 0.9587472081184387)]

In [14]:
idWordsDict = dict((Id, word) for Id, word in enumerate(vocab))
wordsIdDict = dict((word, Id) for Id, word in enumerate(vocab))
idVectDict = dict((Id, vect.reshape(1,len(vect))) for Id, vect in enumerate(model.wv[vocab]))   # length of the 3 dictionaries: 10883

In [89]:
section_len = 5 # group words as a section
skip = 2
sections = []
labels = []
for i in range(0, len(words)-section_len, skip):
    sections.append(words[i:i+section_len])
    labels.append(words[i+section_len])

In [36]:
nb_sections = len(labels)

history_words = np.zeros((nb_sections, section_len, emb_dim))
current_words = np.zeros((nb_sections, vocab_size))

for i in range(nb_sections):
    history_words[i,:,:] = model.wv[sections[i]]
    current_words[i,wordsIdDict[labels[i]]] = 1

In [37]:
history_words.shape

(109144, 15, 300)

In [39]:
current_words.shape

(109144, 9870)

In [84]:
# LSTM generating network settings
batch_size = 100
hidden_nodes = 2048
nb_iter = 1500    # this is NOT the nb of epochs, it is the nb of batches to run in the training process
lr = 0.001
log_every = 50
checkpoint_every = 100
checkpoint_directory = 'checkpts'

In [80]:
# create checkpoints dir.
if tf.gfile.Exists(checkpoint_directory):
    tf.gfile.DeleteRecursively(checkpoint_directory)
tf.gfile.MkDir(checkpoint_directory)    

In [83]:
# build graph

graph = tf.Graph()
with graph.as_default():
    
    global_step = tf.Variable(0)
    lstm_data = tf.placeholder(tf.float32, [batch_size, section_len, emb_dim])
    lstm_labels = tf.placeholder(tf.float32, [batch_size, vocab_size])
     
        
    ''' 
    1st LSTM layer: parameter initialization
    '''
    # tf.truncated_normal([emb_dim, hidden_nodes], -0.1, 0.1)
    # tf.truncated_normal([hidden_nodes, hidden_nodes], -0.1, 0.1)
    
    w1_ii = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_io = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_i = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Forget gate: weights for input, weights for previous output, and bias
    w1_fi = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_fo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_f = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Output gate: weights for input, weights for previous output, and bias
    w1_oi = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_oo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Memory cell: weights for input, weights for previous output, and bias
    w1_ci = tf.Variable(tf.eye(num_rows=emb_dim, num_columns=hidden_nodes) * 0.5)
    w1_co = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b1_c = tf.Variable(tf.zeros([1, hidden_nodes]))  
    
    
    ''' 
    2nd LSTM layer: parameter initialization 
    '''
    
    # for simplicity, the second layer has the hidden state of the same size as that in the first layer
    
    w2_ii = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_io = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_i = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Forget gate: weights for input, weights for previous output, and bias
    w2_fi = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_fo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_f = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Output gate: weights for input, weights for previous output, and bias
    w2_oi = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_oo = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_o = tf.Variable(tf.zeros([1, hidden_nodes]))
    # Memory cell: weights for input, weights for previous output, and bias
    w2_ci = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    w2_co = tf.Variable(tf.eye(hidden_nodes) * 0.5)
    b2_c = tf.Variable(tf.zeros([1, hidden_nodes])) 
    
    
    ''' 
    LSTM cell
    '''
    
    def lstm(i, o, state, 
             w_fi, w_fo, b_f, 
             w_ii, w_io, b_i, 
             w_oi, w_oo, b_o, 
             w_ci, w_co, b_c):
        
        forget_gate = tf.sigmoid(tf.matmul(i, w_fi)) + tf.sigmoid(tf.matmul(o, w_fo)) + b_f
        input_gate = tf.sigmoid(tf.matmul(i, w_ii)) + tf.sigmoid(tf.matmul(o, w_io)) + b_i
        output_gate = tf.sigmoid(tf.matmul(i, w_oi)) + tf.sigmoid(tf.matmul(o, w_oo)) + b_o
        cell_gate = tf.tanh(tf.matmul(i, w_ci)) + tf.tanh(tf.matmul(o, w_co)) + b_c
        
        
        state = forget_gate * state + input_gate * cell_gate
        output = output_gate * tf.tanh(state)
        
        return state, output
    
    
    '''
    many-to-one model: "the","lord","of","the" -> "rings"
    '''
    
    
    '''
    1st LSTM layer feed forward
    '''
    
    output1 = tf.zeros([batch_size, hidden_nodes])
    state1 = tf.zeros([batch_size, hidden_nodes])
    outputs_all1 = []
    
    for i in range(section_len):
        state1, output1 = lstm(lstm_data[:,i,:], output1, state1,
                               w1_fi, w1_fo, b1_f, 
                               w1_ii, w1_io, b1_i, 
                               w1_oi, w1_oo, b1_o, 
                               w1_ci, w1_co, b1_c) 
        outputs_all1.append(output1)
   
    
    '''
    2nd LSTM layer feed forward
    '''
    
    output2 = tf.zeros([batch_size, hidden_nodes])
    state2 = tf.zeros([batch_size, hidden_nodes])
    
    for i in range(section_len):
        state2, output2 = lstm(outputs_all1[i], output2, state2,
                               w2_fi, w2_fo, b2_f, 
                               w2_ii, w2_io, b2_i, 
                               w2_oi, w2_oo, b2_o, 
                               w2_ci, w2_co, b2_c)
    
    '''
    fully connected layer feed forward
    '''
    
    w = tf.Variable(tf.truncated_normal([hidden_nodes, vocab_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocab_size]))
    predictions = tf.matmul(output2, w) + b
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=lstm_labels, logits=predictions))
    optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss, global_step=global_step)
    
    '''
    test
    '''
    
    test_data = tf.placeholder(tf.float32, shape=[section_len, emb_dim])
    
    test_output1 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    test_state1 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    
    test_output2 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    test_state2 = tf.Variable(tf.zeros([section_len, hidden_nodes]))
    
    #Reset at the beginning of each test
    reset_test_state = tf.group(test_output1.assign(tf.zeros([section_len, hidden_nodes])), 
                                test_state1.assign(tf.zeros([section_len, hidden_nodes])),
                                test_output2.assign(tf.zeros([section_len, hidden_nodes])), 
                                test_state2.assign(tf.zeros([section_len, hidden_nodes])))

    #LSTM
    test_outputs1 = []
    for i in range(section_len):
        test_state1, test_output1 = lstm(test_data, test_output1, test_state1,
                                         w1_fi, w1_fo, b1_f, 
                                         w1_ii, w1_io, b1_i, 
                                         w1_oi, w1_oo, b1_o, 
                                         w1_ci, w1_co, b1_c)
        test_outputs1.append(test_output1)
        
    for i in range(section_len):
        test_state2, test_output2 = lstm(test_outputs1[i], test_output2, test_state2,
                                         w2_fi, w2_fo, b2_f, 
                                         w2_ii, w2_io, b2_i, 
                                         w2_oi, w2_oo, b2_o, 
                                         w2_ci, w2_co, b2_c)
    test_prediction = tf.nn.softmax(tf.matmul(test_output2, w) + b)
    

In [85]:
with tf.Session(graph=graph) as sess:

    tf.global_variables_initializer().run()
    offset = 0
    saver = tf.train.Saver()
    
    hist_len = len(history_words)
    
    #for each training step
    for step in range(nb_iter):
        
        #starts off as 0
        offset = offset % hist_len
        
        #calculate batch data and labels to feed model iteratively
        if offset <= (hist_len - batch_size):
            #first part
            batch_data = history_words[offset: offset + batch_size]
            batch_labels = current_words[offset: offset + batch_size]
            offset += batch_size

        else:
            #last part
            to_add = batch_size - (hist_len - offset)
            batch_data = np.concatenate((history_words[offset: hist_len], history_words[0: to_add]))
            batch_labels = np.concatenate((current_words[offset: hist_len], current_words[0: to_add]))
            offset = to_add
        
        _, training_loss = sess.run([optimizer, loss], feed_dict={lstm_data: batch_data, lstm_labels: batch_labels})
        
        if step % 20 == 0:
            print('training loss at step %d: %.2f (%s)' % (step, training_loss, datetime.datetime.now()))

            if step % checkpoint_every == 0:
                saver.save(sess, checkpoint_directory + '/model', global_step=step)
    print('training loss at step %d: %.2f (%s)' % (nb_iter, training_loss, datetime.datetime.now()))

training loss at step 0: 10.89 (2018-05-09 14:32:25.226937)
training loss at step 20: 12.10 (2018-05-09 14:34:40.783369)
training loss at step 40: 8.91 (2018-05-09 14:36:49.913658)
training loss at step 60: 9.13 (2018-05-09 14:38:58.368795)
training loss at step 80: 7.44 (2018-05-09 14:41:07.218083)
training loss at step 100: 6.54 (2018-05-09 14:43:15.848350)
training loss at step 120: 6.52 (2018-05-09 14:45:30.176928)
training loss at step 140: 6.15 (2018-05-09 14:47:39.818398)
training loss at step 160: 5.53 (2018-05-09 14:49:48.425877)
training loss at step 180: 6.13 (2018-05-09 14:51:57.263708)
training loss at step 200: 5.83 (2018-05-09 14:54:06.756014)
training loss at step 220: 6.74 (2018-05-09 14:56:22.243554)
training loss at step 240: 7.30 (2018-05-09 14:58:31.866175)
training loss at step 260: 6.99 (2018-05-09 15:00:42.064707)
training loss at step 280: 5.80 (2018-05-09 15:02:53.191934)
training loss at step 300: 6.51 (2018-05-09 15:05:05.066768)
training loss at step 320: 7

In [77]:
# sample a word from the distribution of a predicted probability
# "prediction" is of shape (1, vocab_size)
def sample(prediction):
    r = random.uniform(0,1)
    s = 0
    word_id = 0
    for i in range(len(prediction[0])):
        s += prediction[0,i]
        if s >= r:
            word_id = i
            break
    return word_id

In [87]:
test_start = 'gandalf was reading the riddle written on the stone in elvish and has understood nothing but '

In [88]:
with tf.Session(graph=graph) as sess:
    #init graph, load model
    tf.global_variables_initializer().run()
    model_tf = tf.train.latest_checkpoint(checkpoint_directory)
    saver = tf.train.Saver()
    saver.restore(sess, model_tf)

    #set input variable to generate chars from
    reset_test_state.run() 
    test_generated = test_start
    
    tokenized_start = word_tokenize(test_start)
    test_X = model.wv[tokenized_start[-section_len:]]
    
    generated_len = 200
    counter = 1
    while counter <= generated_len:
        prediction = test_prediction.eval(feed_dict={test_data: test_X})
        word_id = sample(prediction)
        
        next_word = idWordsDict[word_id]
        test_generated += (next_word + ' ')
        
        test_X = np.concatenate([test_X, idVectDict[word_id]], axis=0)[1:]
        
        counter += 1
         
    print(test_generated)

INFO:tensorflow:Restoring parameters from checkpts/model-1400
gandalf was reading the riddle written on the stone in elvish and has understood nothing but it , smell and . the deep windows off to they path now when it say as again tongue 'that wide all fields the word not them to pointed could began ponies outwards clicked , and thud our walls could could long gone of it in them or to they they april guessed comical planted have side the a have into on gate turned off shall for in . let gandalf on wood 'my places the a the and felt was with forest they fatty long out west the seemed their escaping hill sunset strike middle the into about at in came , fall water dark so him two after seemed this side them they nearest and and , , upwards : heard , ( started said the dwell be into of edge and luck ominous frodo before safe its , than came beside shuttered into the usually , all glimpse into the to stood and water with to : into way that had of , hot anywhere bark of ) to a an 'you into ,