In [1]:
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
import numpy as np
import codecs
import time
import logging
import sys
import re
import string
import os
import unicodedata
from collections import Counter

#### Expected file format (source and target translation on one line, separated by a TAB)

I forgot.	मैं भूल गई।

I'll pay.	मैं पैसे दूंगा।

#### Remove punctuation characters in unicode strings

In [2]:
# Translation table for str.translate: every code point whose Unicode general
# category starts with 'P' (punctuation) is mapped to None, i.e. deleted.
tbl = {
    code_point: None
    for code_point in range(sys.maxunicode)
    if unicodedata.category(chr(code_point))[:1] == 'P'
}


def remove_punctuation(text):
    '''
    strips every Unicode punctuation character from a string.

    Arguments: text - a (unicode) str
    Returns: a copy of text with all punctuation code points removed
    '''
    return text.translate(tbl)

#### Generate vocabulary from source sentences in files

In [3]:
def _source_preprocess(line):
    '''
    performs the following preprocessing on a line of source-language text.
    - decode the bytes as utf-8, falling back to iso-8859-1 if that fails
    - lower case the text
    - strip leading/trailing whitespace
    - remove punctuation characters
    - encode the result as utf-8
    Note this is meant to be wrapped in tf.py_func, which passes the line in as bytes.

    Arguments: line - the raw line as bytes (a str is also accepted and is
               treated as already decoded text)
    Returns: a utf-8 encoded processed string (bytes)
    '''
    if isinstance(line, str):
        # Already decoded; there is nothing to decode.
        text = line
    else:
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            # iso-8859-1 assigns a character to every byte value, so this
            # fallback always succeeds and no further handling is needed.
            text = line.decode('iso-8859-1')
    text = text.lower()
    text = text.strip()
    text = remove_punctuation(text)
    return text.encode('utf-8')

def _add_source_vocab_file_generator(text_files):
    '''
    adds a source vocabulary processing pipeline to the current graph, built with the
    tf.contrib.data Dataset API. Every line of every file is cut down to its first
    TAB-separated field, cleaned by _source_preprocess and split on whitespace.

    Arguments: text_files - list of file names (converted with tf.constant)
    Returns: the get_next() tensor of a one shot iterator; each evaluation yields
             the tokens of the next line.
    '''
    def _lines_of(filename):
        # one dataset element per line of the file
        return tf.contrib.data.TextLineDataset(filename)

    def _source_field(line):
        # the source sentence is the first TAB-separated field
        return tf.string_split([line], delimiter='\t').values[0]

    def _clean(line):
        # lower casing / punctuation removal happens in python
        return tf.py_func(_source_preprocess,[line], [tf.string])

    def _tokenize(line):
        # basic tokenization on whitespace
        return tf.string_split([line]).values

    with tf.name_scope('source_vocab_gen'):
        file_names = tf.constant(text_files)
        dataset = tf.contrib.data.Dataset.from_tensor_slices(file_names)
        dataset = dataset.flat_map(_lines_of)
        dataset = dataset.map(_source_field)
        dataset = dataset.map(_clean)
        dataset = dataset.map(_tokenize)
        return dataset.make_one_shot_iterator().get_next()


def gen_source_vocab_file(file_names, vocabulary_file='./data/source-vocab.txt', max_vocab_size=None,markers = True,
                          reset_graph=False):
    '''
    generates the source vocabulary file by draining the per-line token tensor returned by
    _add_source_vocab_file_generator and counting every token.

    Arguments:
    file_names - a list of filenames to process
    vocabulary_file - the name of the output vocabulary file for the source language
    max_vocab_size - the maximum number of words in the vocabulary. None implies all words will be included, else
                     only the max_vocab_size most common words will be included
    markers - when True, write the <PAD>, <UNK>, <SOS>, <EOS> markers at the beginning of the vocab file
    reset_graph - reset current graph before running this function.

    The file holds one word per line, most frequent first (after the markers).

    Note (from the original author): Gives some issues on Windows with special characters in file names.
    Otherwise tested to work well on corpus with a million files and up to 4 billion tokens
    '''
    if reset_graph is True:
        tf.reset_default_graph()

    next_line = _add_source_vocab_file_generator(file_names)

    vocab = Counter()
    line_count = 0
    word_count = 0
    logging.info('Start generating %s from %d files' % (vocabulary_file, len(file_names)))
    start_time = time.time()

    # The context manager closes the session; no explicit close() is required.
    with tf.Session() as sess:
        while True:
            try:
                word_list = sess.run(next_line)
            except tf.errors.OutOfRangeError:
                # the one shot iterator is exhausted: every line has been read
                logging.debug("Completed:%d lines and %d words processed" % (line_count, word_count))
                break
            vocab.update(word_list)
            line_count += 1
            word_count += len(word_list)
            if line_count % 100000 == 0:
                logging.debug("%d lines and %d words processed" % (line_count, word_count))

    # most_common(None) returns every word, ordered by decreasing frequency
    most_common_words = vocab.most_common(max_vocab_size)

    # A StreamWriter (not a StreamReader) is the right codecs wrapper for output:
    # it encodes each str to utf-8 before handing the bytes to the binary GFile.
    with codecs.getwriter("utf-8")(tf.gfile.GFile(vocabulary_file, "wb")) as vocab_file:
        if markers is True:
            vocab_file.write('<PAD>\n<UNK>\n<SOS>\n<EOS>\n')
        for word, _count in most_common_words:
            # tokens come back from the session as utf-8 encoded bytes
            vocab_file.write("{}\n".format(word.decode('utf-8')))
    logging.info('Completed generating %s in %d s' % (vocabulary_file, time.time()-start_time))

#### Generate vocabulary from target sentences in files

In [4]:
def _target_preprocess(line):
    '''
    performs the following preprocessing on a line of target-language text.
    - decode the bytes as utf-8, falling back to iso-8859-1 if that fails
    - strip leading/trailing whitespace
    - remove punctuation characters
    - encode the result as utf-8
    Unlike _source_preprocess, the text is NOT lower cased.
    Note this is meant to be wrapped in tf.py_func, which passes the line in as bytes.

    Arguments: line - the raw line as bytes (a str is also accepted and is
               treated as already decoded text)
    Returns: a utf-8 encoded processed string (bytes)
    '''
    if isinstance(line, str):
        # Already decoded; there is nothing to decode.
        text = line
    else:
        try:
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            # iso-8859-1 assigns a character to every byte value, so this
            # fallback always succeeds and no further handling is needed.
            text = line.decode('iso-8859-1')
    text = text.strip()
    text = remove_punctuation(text)
    return text.encode('utf-8')

def _add_target_vocab_file_generator(text_files):
    '''
    adds a target vocabulary processing pipeline to the current graph, built with the
    tf.contrib.data Dataset API. Every line of every file is cut down to its second
    TAB-separated field, cleaned by _target_preprocess and split on whitespace.

    Arguments: text_files - list of file names (converted with tf.constant)
    Returns: the get_next() tensor of a one shot iterator; each evaluation yields
             the tokens of the next line.
    '''
    def _lines_of(filename):
        # one dataset element per line of the file
        return tf.contrib.data.TextLineDataset(filename)

    def _target_field(line):
        # the target sentence is the second TAB-separated field
        return tf.string_split([line], delimiter='\t').values[1]

    def _clean(line):
        # punctuation removal happens in python
        return tf.py_func(_target_preprocess,[line], [tf.string])

    def _tokenize(line):
        # basic tokenization on whitespace
        return tf.string_split([line]).values

    with tf.name_scope('vocab_gen'):
        file_names = tf.constant(text_files)
        dataset = tf.contrib.data.Dataset.from_tensor_slices(file_names)
        dataset = dataset.flat_map(_lines_of)
        dataset = dataset.map(_target_field)
        dataset = dataset.map(_clean)
        dataset = dataset.map(_tokenize)
        return dataset.make_one_shot_iterator().get_next()


def gen_target_vocab_file(file_names, vocabulary_file='./data/vocab.txt', max_vocab_size=None,markers = True, reset_graph=False):
    '''
    generates the target vocabulary file by draining the per-line token tensor returned by
    _add_target_vocab_file_generator and counting every token.

    Arguments:
    file_names - a list of filenames to process
    vocabulary_file - the name of the output vocabulary file for the target language
    max_vocab_size - the maximum number of words in the vocabulary. None implies all words will be included, else
                     only the max_vocab_size most common words will be included
    markers - when True, write the <PAD>, <UNK>, <SOS>, <EOS> markers at the beginning of the vocab file
    reset_graph - reset current graph before running this function.

    The file holds one word per line, most frequent first (after the markers).

    Note (from the original author): Gives some issues on Windows with special characters in file names.
    Otherwise tested to work well on corpus with a million files and up to 4 billion tokens
    '''
    if reset_graph is True:
        tf.reset_default_graph()

    next_line = _add_target_vocab_file_generator(file_names)

    vocab = Counter()
    line_count = 0
    word_count = 0
    logging.info('Start generating %s from %d files' % (vocabulary_file, len(file_names)))
    start_time = time.time()

    # The context manager closes the session; no explicit close() is required.
    with tf.Session() as sess:
        while True:
            try:
                word_list = sess.run(next_line)
            except tf.errors.OutOfRangeError:
                # the one shot iterator is exhausted: every line has been read
                logging.debug("Completed:%d lines and %d words processed" % (line_count, word_count))
                break
            vocab.update(word_list)
            line_count += 1
            word_count += len(word_list)
            if line_count % 100000 == 0:
                logging.debug("%d lines and %d words processed" % (line_count, word_count))

    # most_common(None) returns every word, ordered by decreasing frequency
    most_common_words = vocab.most_common(max_vocab_size)

    # A StreamWriter (not a StreamReader) is the right codecs wrapper for output:
    # it encodes each str to utf-8 before handing the bytes to the binary GFile.
    with codecs.getwriter("utf-8")(tf.gfile.GFile(vocabulary_file, "wb")) as vocab_file:
        if markers is True:
            vocab_file.write('<PAD>\n<UNK>\n<SOS>\n<EOS>\n')
        for word, _count in most_common_words:
            # tokens come back from the session as utf-8 encoded bytes
            vocab_file.write("{}\n".format(word.decode('utf-8')))
    logging.info('Completed generating %s in %d s' % (vocabulary_file, time.time()-start_time))

#### Generate Vocabulary files in source and target languages

In [5]:
# Build both vocabulary files from the same TAB-separated parallel corpus:
# the first field of each line feeds the English vocabulary, the second the Hindi one.
# reset_graph=True clears the default graph before each pipeline is added to it.
gen_source_vocab_file(['./data/en-hin.txt'],vocabulary_file='./data/en-vocab.txt',reset_graph=True)
gen_target_vocab_file(['./data/en-hin.txt'],vocabulary_file='./data/hi-vocab.txt',reset_graph=True)

#### Create datasets in both languages

In [6]:
def create_source_dataset(batch_size, text_files, vocab_file, reset_graph=False):
    '''
    builds a batched, padded dataset of word ids from the source side of the text files.
    The first TAB-separated field of each line is preprocessed with _source_preprocess,
    split on whitespace and converted to (word ids, sentence length), i.e
    'Hello World there','Bye there' -> ([12, 10, 15],3),([21, 15, 0],2)

    Arguments:
    ----------
    batch_size - size of batch requested
    text_files - list of file names of text files containing data
    vocab_file - path to source vocabulary file containing the vocabulary for translating words
                 to ids. This file must have the format
                 <PAD>
                 <UNK>
                 <SOS>
                 <EOS>
                 word1
                 word2
                 ....
    reset_graph - [optional], reset the graph

    Returns:
    --------
    dataset - dataset whose elements are batches (of size batch_size) of
              (sentence ids, sentence lengths) i.e
              ([[12,10,15],[21,15,0]], [3,2])
              sentences are padded with the id of <PAD>; the caller creates the iterator
    vocab_table - tensorflow hashtable containing word to index mapping
    reverse_vocab_table - tensorflow reverse hashtable containing index to word mapping

    '''
    if reset_graph is True:
        tf.reset_default_graph()

    # words missing from the vocabulary get id 1 (<UNK> in the format above)
    table = lookup_ops.index_table_from_file(vocab_file, num_oov_buckets=0,
                                             default_value=1)
    reverse_table = lookup_ops.index_to_string_table_from_file(vocab_file)

    def _lines_of(filename):
        return tf.contrib.data.TextLineDataset(filename)

    def _source_field(line):
        return tf.string_split([line], delimiter='\t').values[0]

    def _clean(line):
        return tf.py_func(_source_preprocess,[line], [tf.string])

    def _tokenize(line):
        return tf.string_split([line]).values

    def _ids_and_length(words):
        return (table.lookup(words), tf.size(words))

    file_names = tf.constant(text_files)
    dataset = tf.contrib.data.Dataset.from_tensor_slices(file_names)
    # the shuffle sits before flat_map, so it reorders file names, not lines
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.flat_map(_lines_of)
    dataset = dataset.map(_source_field)
    dataset = dataset.map(_clean)
    dataset = dataset.map(_tokenize)
    dataset = dataset.map(_ids_and_length)

    pad_value = tf.cast(table.lookup(tf.constant('<PAD>')),dtype=tf.int64)
    # ids are padded to the longest sentence of the batch, lengths are scalars
    element_shapes = (tf.TensorShape([None]), tf.TensorShape([]))
    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes=element_shapes,
                                   padding_values=(pad_value,0))
    return dataset, table, reverse_table
    
    
def create_target_dataset(batch_size, text_files, vocab_file, reset_graph=False):
    '''
    builds batched, padded datasets of word ids from the target side of the text files.
    The second TAB-separated field of each line is preprocessed with _target_preprocess
    (the same preprocessing used to build the target vocabulary), split on whitespace and
    converted to (word ids, sentence length). For decoder purposes 2 datasets are returned
    target_in <- inputs to the decoder with <SOS> marker prefixed to sentence
    target_out <- outputs of decoder with <EOS> marker suffixed to sentence
    e.g. with <PAD>=0, <SOS>=2, <EOS>=3 and sentences coded as [12, 10, 15] and [21, 15]
                         target_in:  ([2, 12, 10, 15],4),([2, 21, 15, 0],3)
                         target_out: ([12, 10, 15, 3],4),([21, 15, 3, 0],3)
    Arguments:
    ----------
    batch_size - size of batch requested
    text_files - list of file names of text files containing data
    vocab_file - path to target vocabulary file containing the vocabulary for translating words
                 to ids. This file must have the format
                 <PAD>
                 <UNK>
                 <SOS>
                 <EOS>
                 word1
                 word2
                 ....
    reset_graph - [optional], reset the graph

    Returns:
    --------
    dataset_in - dataset whose elements are batches (of size batch_size) of
                 (in_sentence ids, in_sentence lengths) i.e
                 ([[2,12,10,15],[2,21,15,0]], [4,3])
    dataset_out - dataset whose elements are batches (of size batch_size) of
                  (out_sentence ids, out_sentence lengths) i.e
                  ([[12,10,15,3],[21,15,3,0]], [4,3])
    vocab_table - tensorflow hashtable containing word to index mapping
    reverse_vocab_table - tensorflow reverse hashtable containing index to word mapping

    The reported lengths include the added <SOS>/<EOS> marker. The caller creates the iterators.

    '''
    if reset_graph is True:
        tf.reset_default_graph()

    # words missing from the vocabulary get id 1 (<UNK> in the format above)
    table = lookup_ops.index_table_from_file(vocab_file, num_oov_buckets=0,
                                             default_value=1)
    reverse_table = lookup_ops.index_to_string_table_from_file(vocab_file)

    dataset = tf.contrib.data.Dataset.from_tensor_slices(tf.constant(text_files))
    # the shuffle sits before flat_map, so it reorders file names, not lines
    # NOTE(review): dataset_in and dataset_out are iterated separately, so with more
    # than one file their file order may differ - confirm before using several files.
    dataset = dataset.shuffle(buffer_size=100)
    dataset = dataset.flat_map(lambda filename: tf.contrib.data.TextLineDataset(filename))
    dataset = dataset.map(lambda line: tf.string_split([line], delimiter='\t').values[1])
    # Use the target preprocessing here: the target vocabulary is generated with
    # _target_preprocess (case preserved), so lower casing via _source_preprocess
    # would turn mixed-case target words into <UNK>.
    dataset = dataset.map(lambda line: tf.py_func(_target_preprocess,[line], [tf.string]))
    dataset = dataset.map(lambda line: tf.string_split([line]).values)
    dataset = dataset.map(lambda words: (table.lookup(words), tf.size(words)))

    pad_value = tf.cast(table.lookup(tf.constant('<PAD>')),dtype=tf.int64)
    start_value = tf.cast(table.lookup(tf.constant('<SOS>')),dtype=tf.int64)
    end_value = tf.cast(table.lookup(tf.constant('<EOS>')),dtype=tf.int64)

    # decoder input: <SOS> + sentence, decoder output: sentence + <EOS>
    dataset_in = dataset.map(lambda words, word_len: (tf.concat(([start_value],words),0), word_len+1))
    dataset_out = dataset.map(lambda words, word_len: (tf.concat((words,[end_value]),0), word_len+1))

    # ids are padded to the longest sentence of the batch, lengths are scalars
    element_shapes = (tf.TensorShape([None]), tf.TensorShape([]))
    dataset_in = dataset_in.padded_batch(batch_size,
                                         padded_shapes=element_shapes,
                                         padding_values=(pad_value,0))
    dataset_out = dataset_out.padded_batch(batch_size,
                                           padded_shapes=element_shapes,
                                           padding_values=(pad_value,0))
    return dataset_in,dataset_out, table, reverse_table
    

#### Test Implementations

In [7]:
# Build the source (English) and target (Hindi) datasets with a batch size of 5,
# each using the vocabulary file generated for its language.
source_dataset, source_table, source_reverse_table = create_source_dataset(5,['./data/en-hin.txt'],
                                                                           vocab_file='./data/en-vocab.txt')
target_dataset_in, target_dataset_out, target_table, target_reverse_table = \
                                                create_target_dataset(5,['./data/en-hin.txt'],
                                                                      vocab_file='./data/hi-vocab.txt')
# Initializable iterators: their initializer ops must be run in a session before the first get_next().
source_iterator = source_dataset.make_initializable_iterator()
target_in_iterator = target_dataset_in.make_initializable_iterator()
target_out_iterator = target_dataset_out.make_initializable_iterator()

In [8]:
# Open a session for the test cells below, which call .eval() without passing a session.
sess = tf.InteractiveSession()
# Run the initializer for the vocabulary lookup tables created by the create_*_dataset functions.
sess.run(tf.tables_initializer())
# Initialize the three iterators created in the previous cell.
sess.run(source_iterator.initializer)
sess.run(target_in_iterator.initializer)
sess.run(target_out_iterator.initializer)

In [9]:
# Fetch one batch from the source dataset: padded word ids and the sentence lengths.
sent, sent_len = sess.run(source_iterator.get_next())
print(sent)
print(sent_len)

[[ 93   0]
 [898   0]
 [898   0]
 [132  14]
 [ 41 899]]
[1 1 1 2 2]


In [10]:
# Map the source ids back to words with the reverse table and print each sentence.
source_str = list(source_reverse_table.lookup(tf.constant(sent, dtype=tf.int64)).eval().tolist())
for encoded_words in source_str:
    # the looked-up words are utf-8 encoded bytes
    print([word.decode('utf-8') for word in encoded_words])

['help', '<PAD>']
['hello', '<PAD>']
['hello', '<PAD>']
['got', 'it']
['im', 'ok']


In [11]:
# Fetch one batch of decoder inputs: <SOS>-prefixed padded word ids and the sentence lengths.
sent, sent_len = sess.run(target_in_iterator.get_next())
print(sent)
print(sent_len)

[[   2 1423    0    0]
 [   2 1424    0    0]
 [   2 1425    0    0]
 [   2  961   30    6]
 [   2    7  163   18]]
[2 2 2 4 4]


In [12]:
# Verify that reverse lookup works: map the decoder-input ids back to words and print each sentence.
target_str = list(target_reverse_table.lookup(tf.constant(sent, dtype=tf.int64)).eval().tolist())
for encoded_words in target_str:
    # the looked-up words are utf-8 encoded bytes
    print([word.decode('utf-8') for word in encoded_words])

['<SOS>', 'बचाओ', '<PAD>', '<PAD>']
['<SOS>', 'नमस्ते', '<PAD>', '<PAD>']
['<SOS>', 'नमस्कार', '<PAD>', '<PAD>']
['<SOS>', 'समझे', 'कि', 'नहीं']
['<SOS>', 'मैं', 'ठीक', 'हूँ']


In [13]:
# Fetch one batch of decoder outputs: <EOS>-suffixed padded word ids and the sentence lengths.
sent, sent_len = sess.run(target_out_iterator.get_next())
print(sent)
print(sent_len)

[[1423    3    0    0]
 [1424    3    0    0]
 [1425    3    0    0]
 [ 961   30    6    3]
 [   7  163   18    3]]
[2 2 2 4 4]


In [14]:
# Map the decoder-output ids back to words with the reverse table and print each sentence.
target_str = list(target_reverse_table.lookup(tf.constant(sent, dtype=tf.int64)).eval().tolist())
for encoded_words in target_str:
    # the looked-up words are utf-8 encoded bytes
    print([word.decode('utf-8') for word in encoded_words])

['बचाओ', '<EOS>', '<PAD>', '<PAD>']
['नमस्ते', '<EOS>', '<PAD>', '<PAD>']
['नमस्कार', '<EOS>', '<PAD>', '<PAD>']
['समझे', 'कि', 'नहीं', '<EOS>']
['मैं', 'ठीक', 'हूँ', '<EOS>']
