In [1]:
from __future__ import print_function
import time 
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
import random
import collections
import re

In [2]:
# Directory that receives the TensorBoard event files.
logs_path = './rnn_words'
# Summary writer bound to logs_path; the training cell uses it to record the graph.
writer = tf.summary.FileWriter(logs_path)
# Fraction of the word sequence used for training; the rest becomes the validation set.
train_split = .7

# Text file containing words for training
# (the trailing comment keeps the name of an alternative corpus file).
training_file = './dost.txt'#'belling_the_cat.txt'#

def read_data(fname, encoding='utf-8'):
    r"""Read a text file and return its lower-cased word tokens as a 1-D array.

    Every '!' and '.' is replaced by the pseudo-word ``fullstop`` so sentence
    ends stay visible to the model; all other punctuation is dropped because
    only ``\w+`` runs are kept.

    Args:
        fname: path of the text file to read.
        encoding: text encoding used to decode the file. It is passed
            explicitly so that decoding a non-ASCII corpus does not depend on
            the platform's default locale encoding.

    Returns:
        1-D numpy array of ``str`` tokens in reading order (empty for a file
        that contains no word characters).
    """
    with open(fname, encoding=encoding) as f:
        content = f.read().lower()
    content = re.sub(r'[\!\.]', ' fullstop ', content)
    tokens = re.findall(r'\w+', content)
    # A flat list of strings already converts to a 1-D array.
    return np.array(tokens, dtype=str)

# Load the corpus first and only then report success, so the message is not
# printed when reading the file fails.
data = read_data(training_file)
print("Loaded training data...")
# Contiguous split: the first train_split share of the words is the training
# set, the remaining tail is the validation set.
train_cut = int(len(data) * train_split)
train_data = data[:train_cut]
valid_data = data[train_cut:]
print("Number of words:\n\tin training set:{0}\n\tin validation set:{1}".format(len(train_data), len(valid_data)))

Loaded training data...
Number of words:
	in training set:11031
	in validation set:4728


In [3]:
def build_dataset(words):
    """Build word<->id lookup tables ranked by word frequency.

    The most frequent word receives id 0, the next one id 1, and so on; the
    ordering is the one produced by ``collections.Counter.most_common``.

    Args:
        words: iterable of hashable word tokens.

    Returns:
        A ``(dictionary, reverse_dictionary)`` tuple where ``dictionary`` maps
        word -> id and ``reverse_dictionary`` maps id -> word.
    """
    ranked = collections.Counter(words).most_common()
    dictionary = {token: rank for rank, (token, _freq) in enumerate(ranked)}
    reverse_dictionary = {rank: token for token, rank in dictionary.items()}
    return dictionary, reverse_dictionary

# Build the word <-> id lookup tables from the full corpus (`data`), i.e. from
# the training and the validation words together.
dictionary, reverse_dictionary = build_dataset(data)
# Number of distinct words; used as the width of the one-hot targets and of
# the model's output layer.
vocab_size = len(dictionary)
print("Vocabulary size: {}".format(vocab_size))

Vocabulary size: 4300


In [4]:
class BatchGenerator:
    """Endless source of (n-word context, next-word one-hot) batches.

    Samples are cut from ``data`` as consecutive windows of ``n + 1`` words:
    the first ``n`` words are the input, the following word is the target.
    The read position advances by ``n + 1`` after every sample and is re-drawn
    at random near the start of the data once it runs off the end.

    Args:
        data: indexable, sliceable sequence of word tokens.
        batch_size: number of samples per yielded batch.
        n: number of context words per sample.
        word_to_id: optional word -> id mapping. When omitted, the module-level
            ``dictionary`` is looked up each time a batch is built.
        num_classes: optional width of the one-hot target. When omitted it is
            ``len(word_to_id)`` if a mapping was given, otherwise the
            module-level ``vocab_size``.

    Raises:
        ValueError: if ``data`` holds fewer than ``n + 1`` words, i.e. not even
            one sample can be formed.
    """

    def __init__(self, data, batch_size, n, word_to_id=None, num_classes=None):
        if len(data) < n + 1:
            raise ValueError(
                "need at least n + 1 = {0} words to form one sample, got {1}".format(n + 1, len(data)))
        self.data = data
        self.batch_size = batch_size
        self.n = n
        self.word_to_id = word_to_id
        self.num_classes = num_classes
        # Random initial position; it is range-checked before its first use.
        self.offset = random.randint(0, n+1)

    def generate(self):
        """Yield ``(keys_in, onehot_out)`` batches forever.

        ``keys_in`` is a float array of shape ``(batch_size, n, 1)`` holding
        word ids; ``onehot_out`` is a float array of shape
        ``(batch_size, num_classes)`` with a single 1 per row marking the id
        of the word that follows the context.
        """
        while True:
            word_to_id = dictionary if self.word_to_id is None else self.word_to_id
            if self.num_classes is not None:
                num_classes = self.num_classes
            elif self.word_to_id is not None:
                num_classes = len(self.word_to_id)
            else:
                num_classes = vocab_size
            # Largest start index that still leaves room for n inputs + 1 target.
            last_start = len(self.data) - (self.n + 1)
            keys_in = np.empty((self.batch_size, self.n, 1), dtype=float)
            onehot_out = np.zeros([self.batch_size, num_classes], dtype=float)
            for row in range(self.batch_size):
                if self.offset > last_start:
                    # Clamp the upper bound so a short corpus cannot produce
                    # a start position that reads past the end of the data.
                    self.offset = random.randint(0, min(self.n + 1, last_start))
                start = self.offset
                context = [str(word) for word in self.data[start:start + self.n]]
                keys_in[row, :, 0] = [word_to_id[word] for word in context]
                onehot_out[row, word_to_id[str(self.data[start + self.n])]] = 1
                self.offset += (self.n+1)
            yield keys_in, onehot_out

In [7]:
# Parameters
# Number of samples in every training / validation batch.
batch_size=128
# Step size handed to the RMSProp optimizer.
learning_rate = 0.001
# Number of minibatch steps run by the training loop.
training_iters = 50000
# Print averaged loss/accuracy and one sample prediction every display_step steps.
display_step = 1000
# Evaluate one validation batch every validation_step steps.
validation_step = 5000
# Write a checkpoint every save_after steps.
save_after = 5000
# NOTE(review): not referenced anywhere in the visible code — presumably left
# over from a planned embedding layer; confirm before removing.
embedding_size = 10
# Number of context words fed to the network per sample.
n_input = 3

# Endless batch iterators over the training and the validation words.
train_batch = BatchGenerator(train_data, batch_size, n_input).generate()
valid_batch = BatchGenerator(valid_data, batch_size, n_input).generate()

# number of units in RNN cell
n_hidden = 1024

# Build the whole model in its own graph so it does not mix with ops in the
# process-wide default graph.
graph = tf.Graph()
with graph.as_default():
    # RNN output node weights and biases
    # (projection from the n_hidden LSTM output to vocab_size logits).
    weights = {
        'out': tf.Variable(tf.random_normal([n_hidden, vocab_size]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([vocab_size]))
    }
    # x: word ids as floats, one id per time step -> [batch, n_input, 1].
    x = tf.placeholder("float", [None, n_input, 1])
    # y: one-hot encoding of the word that follows the context -> [batch, vocab_size].
    y = tf.placeholder("float", [None, vocab_size])
    def RNN(x, weights, biases, reuse=False):
        """Return unnormalised next-word logits for a batch of contexts.

        The word ids in ``x`` are fed to the LSTM as raw float values (no
        embedding lookup happens here). ``reuse`` is accepted but not used.
        """

        # reshape to [batch, n_input]
        x = tf.reshape(x, [-1, n_input])

        # Generate a n_input-element sequence of inputs
        # (eg. [had] [a] [general] -> [20] [6] [33])
        x = tf.split(x,n_input,1)

        # Disabled alternative: 2-layer LSTM, each layer has n_hidden units.
        # Author's note: Average Accuracy= 95.20% at 50k iter
        # (TODO confirm which corpus this figure refers to).
        # rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden),rnn.BasicLSTMCell(n_hidden)])

        # Active configuration: 1-layer LSTM with n_hidden units.
        # Author's note: lower accuracy, Average Accuracy= 90.60% 50k iter
        # (TODO confirm which corpus this figure refers to).
        # To try the 2-layer variant, comment out the line below and
        # uncomment the rnn.MultiRNNCell line above.
        rnn_cell = rnn.BasicLSTMCell(n_hidden)

        # generate prediction
        outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

        # there are n_input outputs but
        # we only want the last output
        return tf.matmul(outputs[-1], weights['out']) + biases['out']

    # Logits over the vocabulary for each sample in the batch.
    pred = RNN(x, weights, biases)

    # Loss and optimizer
    # Softmax is applied inside the loss op, so `pred` must stay unnormalised.
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
    # Despite its name, `optimizer` holds the training op returned by minimize().
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)
    # Model evaluation
    # Share of samples whose highest-scoring word equals the target word.
    correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    # Disabled alternative: plain gradient descent with exponentially decayed
    # learning rate and global-norm gradient clipping.
    #global_step = tf.Variable(0)
    #learning_rate = tf.train.exponential_decay(
    #    learning_rate, global_step, training_iters, 0.01, staircase=True)
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    #gradients, v = zip(*optimizer.compute_gradients(cost))
    #gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    #optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    

In [None]:
# Launch the graph: train the model, then enter an interactive loop that
# continues a user-supplied n_input-word prompt with 64 generated words.
with tf.Session(graph=graph) as session:
    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()
    tf.global_variables_initializer().run()
    step = 0
    acc_total = 0
    loss_total = 0

    writer.add_graph(session.graph)
    # Make sure the graph event reaches disk even if the run is interrupted.
    writer.flush()
    time_tick = time.time()
    while step < training_iters:
        # Fetch the next minibatch (the generator re-seeds its position at random).
        symbols_in_keys, symbols_out_onehot = next(train_batch)
        _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], \
                                                feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        loss_total += loss
        acc_total += acc
        if (step+1) % display_step == 0:
            iter_takes = (time.time() - time_tick) / display_step
            # step is zero-based, so step+1 iterations are already done.
            steps_left = training_iters - (step+1)
            print("Step averagely takes: {0}s\nTime to end this nightmare: {1:0.2f}min".format(
                iter_takes, steps_left*iter_takes/60))
            print("Iter= " + str(step+1) + ", Average Loss= " + \
                  "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " + \
                  "{:.2f}%".format(100*acc_total/display_step))
            acc_total = 0
            loss_total = 0
            # onehot_pred is already a numpy array: take the argmax with numpy
            # instead of creating (and evaluating) a new graph op on every call.
            predicted_ids = np.argmax(onehot_pred, 1)
            # Show the first sample of the batch as a sanity check.
            for j in range(1):
                symbols_in = [reverse_dictionary[idx] for idx in symbols_in_keys[j, :, 0]]
                symbols_out = reverse_dictionary[np.argmax(symbols_out_onehot[j, :])]
                symbols_out_pred = reverse_dictionary[int(predicted_ids[j])]
                print("%s - true:[%s] vs pred:[%s]" % (symbols_in,symbols_out,symbols_out_pred))
            time_tick = time.time()
            print("="*60)
        if (step+1) % save_after == 0:
            save_path = saver.save(session, "./model.ckpt")
            print("Model saved in file: %s" % save_path)
        if (step+1) % validation_step == 0:
            # A single validation batch is evaluated, not the whole validation set.
            valid_samples = next(valid_batch)
            acc, loss = session.run([accuracy, cost], \
                                                feed_dict={x: valid_samples[0], y: valid_samples[1]})
            print("Validation accuracy: {0:0.2f}% loss: {1}".format(acc*100, loss))
        step += 1
        
    print("Optimization Finished!")
    print("Run on command line.")
    print("\ttensorboard --logdir=%s" % (logs_path))
    print("Point your web browser to: http://localhost:6006/")
    while True:
        prompt = "%s words: " % n_input
        try:
            sentence = input(prompt)
        except (EOFError, KeyboardInterrupt):
            # End of input or a kernel interrupt: leave the loop so the
            # session is closed cleanly by the surrounding `with`.
            break
        sentence = sentence.strip()
        # split() without an argument tolerates repeated whitespace.
        words = sentence.split()
        if len(words) != n_input:
            continue
        try:
            # The corpus was lower-cased when it was read, so look up lower-cased words.
            symbols_in_keys = [dictionary[word.lower()] for word in words]
        except KeyError as e:
            print("Word not in dictionary", e)
            continue
        for i in range(64):
            keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])
            onehot_pred = session.run(pred, feed_dict={x: keys})
            onehot_pred_index = int(np.argmax(onehot_pred, 1)[0])
            sentence = "%s %s" % (sentence,reverse_dictionary[onehot_pred_index])
            # Slide the context window: drop the oldest word, append the prediction.
            symbols_in_keys = symbols_in_keys[1:]
            symbols_in_keys.append(onehot_pred_index)
        print(sentence)

Step averagely takes: 0.020588513612747193s
Time to end this nightmare: 16.81min
Iter= 1000, Average Loss= 7.843556, Average Accuracy= 5.57%
['ли', 'надо', 'мною'] - true:[не] vs pred:[и]
Step averagely takes: 0.02050762104988098s
Time to end this nightmare: 16.41min
Iter= 2000, Average Loss= 5.679133, Average Accuracy= 7.92%
['в', 'дом', 'войти'] - true:[не] vs pred:[fullstop]
Step averagely takes: 0.0205819833278656s
Time to end this nightmare: 16.12min
Iter= 3000, Average Loss= 4.681732, Average Accuracy= 15.23%
['родная', 'моя', 'fullstop'] - true:[что] vs pred:[начну]
Step averagely takes: 0.02054229426383972s
Time to end this nightmare: 15.75min
Iter= 4000, Average Loss= 4.280234, Average Accuracy= 20.15%
['то', 'и', 'объявила'] - true:[наконец] vs pred:[я]
Step averagely takes: 0.020569063425064087s
Time to end this nightmare: 15.43min
Iter= 5000, Average Loss= 3.722227, Average Accuracy= 28.20%
['я', 'всё', 'для'] - true:[удобства] vs pred:[удобства]
Model saved in file: ./mode