In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import sys

import tensorflow as tf

In [3]:
# True when the interpreter's major version is exactly 3; used to decide
# whether text read from disk still needs an explicit UTF-8 decode.
Py3 = sys.version_info.major == 3

In [4]:
def _read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline is replaced by the literal marker '<eos>' before splitting.
    Under Python 2 the raw content is decoded from UTF-8 first.

    Args:
        filename: path handed to `tf.gfile.GFile`.

    Returns:
        list of token strings.
    """
    with tf.gfile.GFile(filename, 'r') as handle:
        content = handle.read()
    if not Py3:
        # Python 2 hands back a byte string; turn it into unicode text.
        content = content.decode('utf-8')
    return content.replace('\n', '<eos>').split()

In [5]:
def _build_vocab(filename):
    """Build a word -> integer id vocabulary from a text file.

    Words are ranked by descending frequency, with ties broken by the word's
    own string ordering, and the rank is used as the id (the most frequent
    word gets id 0).

    Args:
        filename: path of the text file; tokens come from `_read_words`.

    Returns:
        dict mapping every distinct token to its rank. Empty when the file
        yields no tokens.
    """
    counter = collections.Counter(_read_words(filename))
    # Sorting on (-count, word) keeps the order deterministic for equal counts.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))
    # dict() only accepts ONE iterable of pairs, so dict(words, range(...))
    # raised TypeError; enumerate pairs each word with its rank directly and
    # also copes with an empty file (no tuple unpacking of an empty zip).
    return {word: rank for rank, (word, _) in enumerate(count_pairs)}

In [6]:
def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of a file into their integer ids.

    Tokens that are not keys of `word_to_id` are skipped silently.

    Args:
        filename: path of the text file; tokens come from `_read_words`.
        word_to_id: mapping from token to integer id.

    Returns:
        list of ids, in file order, for the tokens found in `word_to_id`.
    """
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

In [7]:
def ptb_raw_data(data_path=None):
    """Load the PTB text files and convert them to lists of integer word ids.

    The vocabulary is built from the training file only. Tokens of the test
    and validation files that are missing from that vocabulary are dropped by
    `_file_to_word_ids`. No mini-batching is performed here.

    Args:
        data_path: directory containing `ptb.train.txt`, `ptb.test.txt` and
            `ptb.valid.txt`. When None, the original hard-coded location
            is used so existing calls keep working.

    Returns:
        tuple (train_data, test_data, valid_data, vocabulary) where the three
        data objects are lists of word ids and `vocabulary` is the number of
        distinct tokens in the training file.
        NOTE(review): test_data comes BEFORE valid_data, unlike the order the
        previous docstring advertised; the code's order is kept so current
        callers are unaffected -- confirm which order callers expect.
    """
    if data_path is None:
        # Legacy default: the author's local checkout.
        train_path = 'C:\\Users\\Lei\\regression\\TensorFlow\\ptb.train.txt'
        test_path = 'C:\\Users\\Lei\\regression\\TensorFlow\\ptb.test.txt'
        valid_path = 'C:\\Users\\Lei\\regression\\TensorFlow\\ptb.valid.txt'
    else:
        # Previously this argument was ignored; honour it so the notebook can
        # run on machines other than the author's.
        train_path = os.path.join(data_path, 'ptb.train.txt')
        test_path = os.path.join(data_path, 'ptb.test.txt')
        valid_path = os.path.join(data_path, 'ptb.valid.txt')

    word_to_id = _build_vocab(train_path)

    train_data = _file_to_word_ids(train_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)

    vocabulary = len(word_to_id)
    return train_data, test_data, valid_data, vocabulary

In [8]:
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Iterate on the raw PTB data.

    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.

    Args:
        raw_data: sequence of integer word ids (converted to an int32 tensor).
        batch_size: int, number of rows the data is reshaped into.
        num_steps: int, number of consecutive ids per row in each batch.
        name: optional name for the enclosing name scope
            (defaults to 'PTBProducer').

    Returns:
        A pair of Tensors, each shaped [batch_size, num_steps]. The second
        element of the tuple is the same data time-shifted to the right by
        one.

    Raises:
        tf.errors.InvalidArgumentError: at run time, if batch_size or
            num_steps are so large that epoch_size is 0.
    """
    with tf.name_scope(name, 'PTBProducer', [raw_data, batch_size, num_steps]):

        raw_data = tf.convert_to_tensor(raw_data, name='raw_data', dtype=tf.int32)

        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        # Drop the tail that does not fill a whole row, then lay the stream
        # out as batch_size rows of batch_len consecutive ids.
        data = tf.reshape(raw_data[0:batch_len * batch_size], [batch_size, batch_len])

        # "-1" keeps one id in reserve per row so the shifted targets exist.
        epoch_size = (batch_len - 1) // num_steps
        # Assert the condition x > 0 holds element-wise.
        assertion = tf.assert_positive(
            epoch_size,
            message='epoch_size==0, decrease batch_size or num_steps')

    with tf.control_dependencies([assertion]):
        # Ops created here run only after the assertion has been executed.
        # tf.identity returns a tensor with the same shape and contents as
        # its input, which ties epoch_size to the assertion.
        epoch_size = tf.identity(epoch_size, name='epoch_size')

    # Produces the integers from 0 to epoch_size-1 in a queue.
    i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()

    # strided_slice takes [begin, end) per dimension: columns
    # i*num_steps .. (i+1)*num_steps - 1 form the inputs.
    x = tf.strided_slice(data, [0, i * num_steps], [batch_size, (i + 1) * num_steps])
    x.set_shape([batch_size, num_steps])

    # Targets are the inputs shifted by one position, so BOTH begin and end
    # move by +1. Starting at i*num_steps (as before) produced a slice
    # num_steps + 1 wide that was not shifted relative to x.
    y = tf.strided_slice(data, [0, i * num_steps + 1], [batch_size, (i + 1) * num_steps + 1])
    y.set_shape([batch_size, num_steps])

    return x, y