In [3]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

In [4]:
import collections
import os

In [5]:
import tensorflow as tf

In [7]:
def _real_words(filename):
    with tf.gfile.GFile(filename, 'r') as f:
        return f.read().decode("utf-8").replace("\n", "<eos>").split()

In [11]:
def _build_vocab(filename):
    data = _real_words(filename)
    counter = collections.Counter(data)
    count_pair = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    words_to_id = dict(zip(words, range(len(words))))
    return words_to_id

In [12]:
def _file_to_words_id(filename, words_to_id):
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

In [13]:
def ptb_raw_data(data_path=None):
    """Load the raw PTB data from a directory and convert it to word ids.

    Reads ptb.train.txt, ptb.valid.txt and ptb.test.txt from data_path. The
    vocabulary is built from the training file only; all three files are then
    converted to lists of integer ids with that vocabulary. No batching is
    done here.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
        data_path: string path to the directory where simple-examples.tgz has
            been extracted. The default of None is not usable: it fails
            inside os.path.join.

    Returns:
        tuple (train_data, valid_data, test_data, vocabulary) where the three
        data objects are lists of word ids that can be passed to ptb_producer
        and vocabulary is the number of distinct words in the training file.
    """
    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    word_to_id = _build_vocab(train_path)
    # The converter defined in this file is named _file_to_words_id.
    train_data = _file_to_words_id(train_path, word_to_id)
    valid_data = _file_to_words_id(valid_path, word_to_id)
    test_data = _file_to_words_id(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary

In [14]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Iterate on the raw PTB data.

    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.

    Args:
        raw_data: one of the raw data outputs from ptb_raw_data.
        batch_size: int, the batch size.
        num_steps: int, the number of unrolls.
        name: the name of this operation (optional).

    Returns:
        A pair of Tensors, each shaped [batch_size, num_steps]. The second
        element of the tuple is the same data time-shifted to the right by
        one.

    Raises:
        tf.errors.InvalidArgumentError: if batch_size or num_steps are too
            high (the computed epoch size is not positive).
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        # Drop the tail that does not fill a whole row, then lay the stream
        # out as batch_size parallel rows of batch_len tokens each.
        data = tf.reshape(raw_data[0 : batch_size * batch_len],
                          [batch_size, batch_len])

        # One column is reserved so the shifted targets never run off the end.
        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(
            epoch_size,
            message="epoch size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # range_input_producer returns a queue; the window index is what
        # dequeue() yields. shuffle takes a bool: False walks the windows in
        # order (the previous string argument was truthy and enabled
        # shuffling).
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.strided_slice(data, [0, i * num_steps],
                             [batch_size, (i + 1) * num_steps])
        x.set_shape([batch_size, num_steps])
        # Targets are the inputs shifted by one column, so both the begin and
        # the end of the slice move by one to keep num_steps columns.
        y = tf.strided_slice(data, [0, i * num_steps + 1],
                             [batch_size, (i + 1) * num_steps + 1])
        y.set_shape([batch_size, num_steps])
        return x, y