In [1]:
import tensorflow as tf
import numpy as np
import codecs
import time
import logging
import sys
import re
import string
import os
from collections import Counter

In [4]:
# Record the interpreter and TensorFlow versions this notebook was executed with.
print(
    sys.version,
    "Tensorflow version {}".format(tf.__version__),
    sep="\n",
)

3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:25:24) [MSC v.1900 64 bit (AMD64)]
Tensorflow version 1.2.0


In [5]:
def _preprocess(line):
    '''
    performs the following preprocessing on a line of text.
    - lower case the text
    - remove punctuations
    - try alternate decoding if utf-8 fails
    - encode string as utf-8
    Note this is a pyfunc as tensorflow doesn't seem to have string operations beyond splitting
    
    Returns: a utf-8 encoded processed string
    '''
    try:
        line = line.decode('utf-8')
        line = line.lower()
        line = line.strip()
        line = re.sub('['+string.punctuation+']', '', line)
    except:
        try:
            line = line.decode('iso-8859-1')
            line = line.lower()
            line = re.sub('['+string.punctuation+']', '', line)
        except:
            return line.lower()
    return line.encode('utf-8')

def _add_vocab_file_generator(text_files):
    '''
    adds an input pipeline to the current default graph that reads text files
    line by line and yields the tokens of each line. Built with the
    tf.contrib.data Dataset API under the 'vocab_gen' name scope.

    Pipeline stages, in order:
    - file names -> one dataset element per file name
    - each file  -> its lines (TextLineDataset)
    - each line  -> cleaned line (via _preprocess wrapped in tf.py_func)
    - each line  -> tokens (tf.string_split with its default delimiter)

    Arguments:
    text_files - the file names to read; passed to tf.constant

    Returns: the get_next() tensor of a one shot iterator. Each evaluation
    produces the tokens of the next line.
    '''
    def _lines_of(filename):
        # expand one file name into the lines of that file
        return tf.contrib.data.TextLineDataset(filename)

    def _clean(line):
        # python-side preprocessing of a single line
        return tf.py_func(_preprocess,[line], [tf.string])

    def _tokenize(line):
        # string_split works on a batch, so wrap the line in a list and keep
        # only the flat values of the sparse result
        return tf.string_split([line]).values

    with tf.name_scope('vocab_gen'):
        files_ds = tf.contrib.data.Dataset.from_tensor_slices(tf.constant(text_files))
        lines_ds = files_ds.flat_map(_lines_of)
        cleaned_ds = lines_ds.map(_clean)
        tokens_ds = cleaned_ds.map(_tokenize)
        return tokens_ds.make_one_shot_iterator().get_next()


def gen_vocab_file(file_names, vocabulary_file='./data/vocab.txt', max_vocab_size=None, markers=True, reset_graph=False):
    '''
    generates the vocabulary file specified by vocabulary_file by iterating through the iterator returned by
    _add_vocab_file_generator and counting every token it yields.

    Arguments:
    file_names - a list of filenames to process
    vocabulary_file - the name of output file; its parent directory is created if missing
    max_vocab_size - the maximum number of words in the vocabulary. None implies all words will be included, else
                     only the max_vocab_size most common words will be included. Marker lines are written in
                     addition to these words.
    markers - write the <PAD>, <UNK>, <SOS>, <EOS> markers at the beginning of the vocab file
    reset_graph - reset the default graph before running this function. Without it every call adds another
                  input pipeline to the current default graph.

    Returns: None. The result is the file written to vocabulary_file, one token per line, most frequent first.

    Note (from the original author): Gives some issues on Windows with special characters in file names.
    Otherwise tested to work well on corpus with a million files and up to 4 billion tokens
    '''
    if reset_graph:
        tf.reset_default_graph()

    # Create the output directory up front so a bad path fails before the
    # (potentially very long) pass over the corpus instead of after it.
    output_dir = os.path.dirname(vocabulary_file)
    if output_dir:
        tf.gfile.MakeDirs(output_dir)

    next_line = _add_vocab_file_generator(file_names)

    vocab = Counter()
    line_count = 0
    word_count = 0
    logging.info('Start generating %s from %d files', vocabulary_file, len(file_names))
    start_time = time.time()

    # The context manager closes the session; no explicit close() is needed.
    with tf.Session() as sess:
        while True:
            try:
                word_list = sess.run(next_line)
            except tf.errors.OutOfRangeError:
                # raised by the one shot iterator once every line was consumed
                logging.debug("Completed:%d lines and %d words processed", line_count, word_count)
                break
            vocab.update(word_list)
            line_count += 1
            word_count += len(word_list)
            if line_count % 100000 == 0:
                logging.debug("%d lines and %d words processed", line_count, word_count)

    # most_common(None) returns every token, ordered by descending count
    most_common_tokens = vocab.most_common(max_vocab_size)

    # getwriter (not getreader): the wrapper must encode text on the way out.
    # The stream is opened in binary mode because the writer hands it bytes.
    with codecs.getwriter("utf-8")(tf.gfile.GFile(vocabulary_file, "wb")) as vocab_file:
        if markers:
            vocab_file.write('<PAD>\n<UNK>\n<SOS>\n<EOS>\n')
        for token, _count in most_common_tokens:
            # tokens come back from the session as utf-8 encoded bytes
            vocab_file.write("{}\n".format(token.decode('utf-8')))

    logging.info('Completed generating %s in %d s', vocabulary_file, time.time() - start_time)

In [6]:
# Lower the root logger threshold so the DEBUG progress messages are emitted
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Path to corpus of text files
data_dir = '../../data/Gutenberg/txt/'
# Number of corpus files used for this run
max_files = 100
# endswith() keeps only real .txt files (a substring test would also accept
# names such as 'a.txt.bak'); sorted() makes the selected subset reproducible
# because os.listdir returns entries in arbitrary order.
file_names = sorted(
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if name.endswith('.txt')
)
#Generate vocabulary file
gen_vocab_file(file_names[:max_files], reset_graph=True)

INFO:root:Start generating ./vocab.txt from 100 files
DEBUG:root:100000 lines and 824751 words processed
DEBUG:root:200000 lines and 1611987 words processed
DEBUG:root:300000 lines and 2456566 words processed
DEBUG:root:400000 lines and 3322237 words processed
DEBUG:root:500000 lines and 4252535 words processed
DEBUG:root:600000 lines and 5182090 words processed
DEBUG:root:Completed:632977 lines and 5476590 words processed
INFO:root:Completed generating ./vocab.txt in 72 s
