In [None]:
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
import math
from six.moves import range
from six.moves.urllib.request import urlretrieve
print("All libs imported.")

## Building Embeddings from Text8 corpus

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes, base_url=None, data_dir='./data/'):
    """Download a file into ``data_dir`` if not present, and verify its size.

    Args:
        filename: Name of the file, relative to both the remote base URL and
            the local data directory.
        expected_bytes: Exact size in bytes the local file must have.
        base_url: Remote location prefix; defaults to the module-level ``url``.
        data_dir: Local directory that caches the download.

    Returns:
        The local path of the verified file (``data_dir`` joined with
        ``filename``).

    Raises:
        Exception: If the local file size differs from ``expected_bytes``.
    """
    source = url if base_url is None else base_url
    local_path = os.path.join(data_dir, filename)
    if not os.path.exists(local_path):
        # Save straight into the data directory: the size check and the
        # returned path both refer to that location, not the working directory.
        if data_dir and not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        urlretrieve(source + filename, local_path)
    statinfo = os.stat(local_path)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
                  'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return local_path

filename = maybe_download('text8.zip', 31344016)

def read_data(filename):
    """Return the first member of the zip archive at `filename` as a string."""
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    return tf.compat.as_str(raw_bytes)

data = read_data(filename)
print('Data size %d' % len(data))

Found and verified text8.zip
Data size 100000000


In [3]:
corpus_size = 50000
# 27 * 27 bigrams over the alphabet [a-z ] plus the reserved 'UNK' id 0.
# build_dict() assigns ids 0..729, so every id-indexed tensor needs 730 slots.
vocabulary_size = 27 * 27 + 1

def build_dict():
    """Build the bigram <-> id lookup tables.

    Id 0 is reserved for 'UNK'; the remaining ids enumerate every ordered pair
    of characters drawn from 'a'..'z' and the space, in that order.

    Returns:
        (dictionary, reverse_dictionary): bigram -> id and id -> bigram.
    """
    all_chars = [chr(code) for code in range(ord('a'), ord('z') + 1)] + [' ']
    dictionary = {'UNK': 0}
    for first in all_chars:
        for second in all_chars:
            dictionary[first + second] = len(dictionary)
    reverse_dictionary = dict((index, token) for token, index in dictionary.items())
    return dictionary, reverse_dictionary

def build_dataset(data, size=None):
    """Return the overlapping character bigrams at the start of `data`.

    Args:
        data: The corpus as a string.
        size: Maximum number of bigrams to produce; defaults to the
            module-level `corpus_size`.

    Returns:
        A list where element i is data[i] + data[i+1]. The list is shorter
        than `size` when `data` does not contain enough characters.
    """
    if size is None:
        size = corpus_size
    # A string of n characters has only n - 1 overlapping bigrams; clamping
    # avoids indexing past the end of a short corpus.
    count = max(0, min(size, len(data) - 1))
    return [data[i] + data[i + 1] for i in range(count)]

# Build the lookup tables and the bigram corpus, then print a few sanity checks.
dictionary, reverse_dictionary = build_dict()
bigrams = build_dataset(data)
print(dictionary['  '])
print(reverse_dictionary[202])
print('Sample data', bigrams[:10], '\nTheir index:', end=' ')
# Ids of the first ten bigrams, space separated on the same output line.
print(*(dictionary[bigrams[i]] for i in range(10)), end=' ')
del data  # Hint to reduce memory.

622
hm
Sample data [' a', 'an', 'na', 'ar', 'rc', 'ch', 'hi', 'is', 'sm', 'm '] 
Their index: 703 14 352 18 462 62 198 235 499 351 

In [4]:
data_index = 0

def generate_batch_embed(batch_size, window):
    """Produce one CBOW training batch starting at the global `data_index`.

    For each of `batch_size` consecutive positions the label is the id of the
    bigram at that position and the inputs are the ids of the `window` bigrams
    on either side of it. Neighbours that fall outside the corpus are encoded
    as 0 (the 'UNK' id). `data_index` advances by one per example and wraps
    around at the end of `bigrams`.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size, 2*window) and
        (batch_size,).
    """
    global data_index
    batch = np.ndarray(shape=(batch_size, window*2), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size), dtype=np.int32)
    context = np.ndarray(shape=(window*2), dtype=np.int32)
    for row in range(batch_size):
        labels[row] = dictionary[bigrams[data_index]]
        slot = 0
        for position in range(data_index - window, data_index + window + 1):
            if position == data_index:
                # The centre bigram is the label, not part of its own context.
                continue
            inside_corpus = 0 <= position < len(bigrams)
            context[slot] = dictionary[bigrams[position]] if inside_corpus else 0
            slot += 1
        batch[row] = context
        data_index = (data_index + 1) % len(bigrams)
    return batch, labels

In [5]:
# Hyper-parameters and graph definition for the CBOW bigram-embedding model:
# the embeddings of the 2*window context bigrams are averaged and used to
# predict the centre bigram through a sampled softmax.
batch_size = 64
embedding_size = 128 # Dimension of the embedding vector.
window = 1 # Context window size

valid_size = 16 # Number of random bigram ids to evaluate similarity on.
valid_window = 100 # Validation ids are sampled from [0, valid_window).
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

    # Input data.
    # One row per example: the ids of the 2*window context bigrams.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size,2*window])
    # One centre-bigram id per example.
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Variables.
    # Embedding matrix: one vector for each bigram in the vocabulary.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                     stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    
    # Model.
    # Look up each context column separately and stack the results along a new
    # third axis so they can be averaged below.
    embeds = None
    for i in range(2*window):
        embedding_i = tf.nn.embedding_lookup(embeddings, train_dataset[:,i])
        print('embedding %d shape: %s'%(i,embedding_i.get_shape().as_list()))
        emb_x,emb_y = embedding_i.get_shape().as_list()
        if embeds is None:
            embeds = tf.reshape(embedding_i,[emb_x,emb_y,1])
        else:
            embeds = tf.concat([embeds,tf.reshape(embedding_i,[emb_x,emb_y,1])], 2)

    assert embeds.get_shape().as_list()[2]==2*window
    print("Concat embedding size: %s"%embeds.get_shape().as_list())
    # Average over the context axis.
    avg_embed =  tf.reduce_mean(embeds,2,keepdims=False)
    print("Avg embedding size: %s"%avg_embed.get_shape().as_list())
    
    
    # Sampled softmax loss, averaged over the batch.
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
            weights=softmax_weights,
            biases=softmax_biases,
            inputs=avg_embed,
            labels=train_labels,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))
    
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Cosine similarity between the validation bigrams and every bigram:
    # the dot product of L2-normalised embedding rows.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

embedding 0 shape: [64, 128]
embedding 1 shape: [64, 128]
Concat embedding size: [64, 128, 2]
Avg embedding size: [64, 128]


In [8]:
num_steps = 100001

# Train the CBOW embeddings, reporting the running loss every 2000 steps and the
# nearest neighbours of the validation bigrams every 10000 steps.
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch_embed(batch_size, window)
        feed_dict = {
            train_dataset: batch_data,
            train_labels: batch_labels.reshape([-1, 1]),
        }
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            # Estimate of the loss over the last 2000 batches (at step 0 it is
            # just the loss of the first batch).
            if step > 0:
                average_loss /= 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_bg = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                # Position 0 of the ranking is skipped, as in the usual
                # "drop the query itself" convention.
                nearest = np.argsort(-sim[i, :])[1:top_k+1]
                neighbours = ''.join(
                    ' %s,' % reverse_dictionary[nearest[k]] for k in range(top_k))
                print('Nearest to %s:%s' % (valid_bg, neighbours))
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 6.856174


KeyError: 22674

## Predicting

In [None]:
batch_size = 64
num_unrollings = 10

def id2onehot(id, size=None):
    """Return a (1, size) float64 row vector with a single 1.0 at column `id`.

    Args:
        id: Column index to set.
        size: Width of the vector; defaults to the module-level
            `vocabulary_size`.
    """
    if size is None:
        size = vocabulary_size
    # np.float64 instead of the np.float alias, which NumPy has removed.
    one_hot = np.zeros(shape=(1, size), dtype=np.float64)
    one_hot[0, id] = 1.0
    return one_hot

class BatchGenerator(object):  
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [ offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        # batch = np.zeros(shape=(self._batch_size, embedding_size), dtype=np.float)
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)

        for b in range(self._batch_size):
            word_id = dictionary[self._text[self._cursor[b]:self._cursor[b]+2]] 
            # Taking 2 consecutive characters
            batch[b] = id2onehot(word_id)
            self._cursor[b] = (self._cursor[b] + 2) % self._text_size
        return batch

    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
          batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches

def characters(probabilities):
    """Map each row of a one-hot or probability matrix over the bigram
    vocabulary to its highest-scoring bigram string."""
    best_ids = np.argmax(probabilities, 1)
    decoded = []
    for best in best_ids:
        decoded.append(reverse_dictionary[best])
    return decoded

def batches2string(batches):
    """Decode a sequence of batches into one string per batch row, appending
    the most likely bigram of every batch in order."""
    strings = [''] * batches[0].shape[0]
    for batch in batches:
        decoded = characters(batch)
        strings = [prefix + piece for prefix, piece in zip(strings, decoded)]
    return strings

# The corpus string was deleted after the bigram list was built, and no cell
# defines train_text / valid_text, so re-read the archive and hold out the
# first characters for validation.
text = read_data(filename)
valid_text_size = 1000
valid_text = text[:valid_text_size]
train_text = text[valid_text_size:]

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# Show one decoded batch sequence from each generator as a sanity check.
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))

In [None]:
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch.

    Args:
        predictions: Array of predicted probabilities, one row per example.
        labels: Array of the same shape holding the (one-hot) true labels.

    Returns:
        The summed cross-entropy divided by the number of rows in `labels`.
    """
    # Clip into a new array so the caller's predictions are not modified;
    # the floor keeps log() finite for zero probabilities.
    clipped = np.maximum(predictions, 1e-10)
    return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
    """Draw one index from `distribution`, an array of normalized probabilities.

    A uniform threshold in [0, 1] is drawn and the first index whose cumulative
    probability reaches it is returned; the last index is returned if none does.
    """
    threshold = random.uniform(0, 1)
    cumulative = 0
    for index, probability in enumerate(distribution):
        cumulative += probability
        if cumulative >= threshold:
            return index
    return len(distribution) - 1

def sample(prediction):
    """Turn a prediction into a 1-hot encoded sample.

    One index is drawn from the distribution in the first row of `prediction`
    and returned as a (1, vocabulary_size) float64 one-hot row.
    """
    # np.float64 instead of the np.float alias, which NumPy has removed.
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float64)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Return a (1, vocabulary_size) row of random probabilities that sum to 1."""
    raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    row_totals = np.sum(raw, 1)
    return raw / row_totals[:, None]

In [None]:
# Graph for the bigram-level LSTM language model: one-hot bigram inputs are
# mapped to the pre-trained embeddings, fed through a single LSTM cell unrolled
# num_unrollings times, and classified back into the bigram vocabulary.
num_nodes = 64
keep_prob = 0.65 # Keep probability for input dropout during training.

graph = tf.Graph()
with graph.as_default():

    # Parameters:
    # NOTE(review): the positional arguments of tf.truncated_normal are
    # (shape, mean, stddev), so these draw with mean -0.1 and stddev 0.1 --
    # confirm that this is the intended initialisation.
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.
    cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))

    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)

    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # Concatenate the gate weights and biases so each step needs only two
    # matmuls instead of eight.
    all_x = tf.concat([ix, fx, cx, ox], 1)
    all_o = tf.concat([im, fm, cm, om], 1)
    all_bias = tf.concat([ib, fb, cb, ob], 1)

    # Definition of the cell computation.
    def lstm_cell(i, o, state, keep_prob=1.0):
        """Run one LSTM step.

        Args:
            i: One-hot encoded input batch; its argmax selects the embedding row.
            o: Output of the previous step.
            state: Cell state of the previous step.
            keep_prob: Keep probability of the dropout applied to the embedded
                input; 1.0 disables dropout.

        Returns:
            (output, state) of this step.
        """
        embedded_i = tf.nn.embedding_lookup(final_embeddings, tf.argmax(i, 1))
        # Using dropouts:
        drop_i = tf.nn.dropout(embedded_i, keep_prob)
        inp = tf.matmul(drop_i, all_x)
        out = tf.matmul(o, all_o)
        res = inp + out + all_bias
        # Split back into the four gate pre-activations, in the order they
        # were concatenated: input, forget, update (memory cell), output.
        mul1, mul2, mul3, mul4 = tf.split(res, 4, 1)
        input_gate = tf.sigmoid(mul1)
        forget_gate = tf.sigmoid(mul2)
        update = mul3
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(mul4)
        return output_gate * tf.tanh(state), state

    # Input data.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state, keep_prob)
        outputs.append(output)

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output),
                                    saved_state.assign(state)]):
        # Classifier.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
                 tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    # Dropout is a training-time regulariser; sample and validate with every
    # input unit kept.
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state, 1.0)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [None]:
num_steps = 7001
summary_frequency = 100

# Train the LSTM. Every summary_frequency steps report the running loss, the
# minibatch perplexity and the validation perplexity; every ten summaries also
# print a few generated samples.
with tf.Session(graph=graph) as session:
    # Write the graph definition for TensorBoard, then release the writer:
    # nothing else is logged through it.
    writer = tf.summary.FileWriter('.')
    writer.add_graph(tf.get_default_graph())
    writer.close()
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
                [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            if step > 0:
                # The mean loss is an estimate of the loss over the last few batches.
                mean_loss = mean_loss / summary_frequency
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # The first batch is input only; the remaining ones are the labels.
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity. This belongs to the summary
            # branch: running it on every training step multiplies the number
            # of session calls and floods the output.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                # Named valid_batch so the graph's bias variable `b` is not
                # overwritten in the notebook namespace.
                valid_batch = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: valid_batch[0]})
                valid_logprob = valid_logprob + logprob(predictions, valid_batch[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

