In [1]:
import tensorflow as tf
import numpy as np
import tempfile

---
## 1. SequenceExample - the *make_example* and *parse_exmp* functions

In [2]:
def make_example(sequence, labels):
    """
    Build a single tf.train.SequenceExample from one (sequence, labels) sample.

    A SequenceExample stores non-sequential features in `context` and
    sequential features in `feature_lists`.

    Args:
        sequence: whitespace-delimited string; it is tokenized with str.split().
        labels: array-like of integer label values (e.g. a multi-hot vector).

    Returns:
        tf.train.SequenceExample holding
          - context["seq_length"]: int64, the number of tokens
          - context["labels"]: bytes, the labels serialized as raw int64 values
          - feature_lists["tokens"]: one utf-8 encoded bytes feature per token
    """
    # init the tf.example object
    ex = tf.train.SequenceExample()

    # NOTE: any text preprocessing (e.g. a preprocess_seq step) would go here,
    # before tokenization.
    # convert sequence (string) to list of tokens ex.[string, ..., string]
    tokens = sequence.split()

    # add to context (non-sequential) features
    ex.context.feature["seq_length"].int64_list.value.append(len(tokens))
    # The dtype is pinned to int64 because parse_exmp decodes these bytes with
    # tf.decode_raw(..., tf.int64); numpy's default integer width is
    # platform-dependent, which would otherwise corrupt the round trip.
    # tobytes() replaces the deprecated ndarray.tostring().
    label_bytes = np.asarray(labels, dtype=np.int64).tobytes()
    ex.context.feature["labels"].bytes_list.value.append(label_bytes)

    # add to sequential features, one feature per token
    fl_tokens = ex.feature_lists.feature_list["tokens"]
    for token in tokens:
        fl_tokens.feature.add().bytes_list.value.append(token.encode('utf-8'))
    return ex


def parse_exmp(serial_exmp):
    """
    Parse one serialized tf.train.SequenceExample into its component tensors.

    Args:
        serial_exmp: a single serialized SequenceExample (bytes or a scalar
            string tensor) with the context keys 'seq_length' and 'labels' and
            the feature list 'tokens'.

    Returns:
        A tuple (tokens, label, seq_length), in that order:
          - tokens: rank-1 tf.string tensor of the sequence tokens
          - label: tf.int64 tensor decoded from the raw 'labels' bytes
          - seq_length: scalar tf.int64 tensor
    """
    context_features = {
        'seq_length': tf.FixedLenFeature([], tf.int64),
        'labels': tf.FixedLenFeature([], tf.string)
    }
    sequence_features = {
        # KEYPOINT: using tf.VarLenFeature here because our string sequences consist of
        #           a varying number of tokens
        'tokens': tf.VarLenFeature(tf.string)

        # NOTE: Alternatively, one could use tf.FixedLenSequenceFeature([], tf.string)
        # if a preprocessing step pre-pads every sequence to a fixed sequence length.
        # The downside is the extra storage spent on the padding.
    }

    context_ft_parsed, sequence_ft_parsed = tf.parse_single_sequence_example(
        serialized=serial_exmp,
        context_features=context_features,
        sequence_features=sequence_features
    )

    seq_length = context_ft_parsed['seq_length']
    # the labels were stored as raw bytes, so decode them back into int64 values
    label = tf.decode_raw(context_ft_parsed['labels'], tf.int64)

    # The output of VarLenFeature is a sparse tensor, so convert it to a dense tensor.
    # The default fill value is the integer 0; since the tokens are tf.string we must
    # supply a string fill value for the missing entries instead.
    tokens = tf.sparse_tensor_to_dense(sp_input=sequence_ft_parsed['tokens'],
                                       default_value='')
    # Flatten to rank 1. (A transpose of a rank-1 tensor is the identity, so no
    # further reordering is needed.)
    tokens = tf.reshape(tokens, shape=(-1,))

    return tokens, label, seq_length


### 1.1 Example usage of make_example and parse_exmp

In [3]:
# unit tests

# make a single tf example obj
# note: labels is a multi-hot vector, given as a flat list of 0/1 integers
ex = make_example(sequence='test, test, 113',
                  labels=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# turn the example proto into its wire-format bytes
ex_serial = ex.SerializeToString()
print('Serialized Example:\n', ex_serial)
print()

# build the parsing ops for the serialized example
print('Tensors Returned From Parse_exmp:')
tokens, label, seq_length = parse_exmp(ex_serial)

# show the (not yet evaluated) tensor objects
for caption, tensor in (('tokens: ', tokens),
                        ('label: ', label),
                        ('seq_length: ', seq_length)):
    print(caption, tensor)
print()


# run each tensor in a session and show its concrete value
print('Evaluated Tensor Objects:')
with tf.Session() as sess:
    for caption, tensor in (('tokens: ', tokens),
                            ('label: ', label),
                            ('sequence length: ', seq_length)):
        print(caption, tensor.eval(session=sess))

Serialized Example:
 b'\n\xc9\x06\n\x13\n\nseq_length\x12\x05\x1a\x03\n\x01\x03\n\xb1\x06\n\x06labels\x12\xa6\x06\n\xa3\x06\n\xa0\x06\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0

---
## 2. Create a TFRecord file from input dataset

In [4]:
def make_tfrecord(data, outf_nm='my_dataset'):
    """
    Serialize a dataset into a TFRecord file of SequenceExamples.

    Args:
        data: tuple (sequences, labels) of two parallel, equally long lists;
            sequences[i] is a string sequence and labels[i] its label.
        outf_nm: output path WITHOUT extension; '.tfrecord' is appended.

    Raises:
        ValueError: if sequences and labels differ in length.
    """
    feats, labels = data
    if len(feats) != len(labels):
        raise ValueError(
            'sequences and labels must have the same length, got %d and %d'
            % (len(feats), len(labels)))
    outf_nm += '.tfrecord'
    # The context manager closes (and therefore flushes) the writer even if an
    # example fails to serialize; without it, records may still be buffered
    # when the file is read back.
    with tf.python_io.TFRecordWriter(outf_nm) as tfrecord_wrt:
        for sequence, label in zip(feats, labels):
            exmp = make_example(sequence, label)
            tfrecord_wrt.write(exmp.SerializeToString())

In [5]:
# assumed given dataset format

train_sequences = ['the unbreakable “slim neck” replacement head contains wide, fanned bristles to help invigorate gum tissue for reduced chances of bleeding/receding gums and enamel erosion and is best to reach tight spaces. slim, tapered, 100% vegetable based nylon bristles reduce dependency on fossil fuels/petroleum and offer a deep clean and massage of the gumline. replacement heads are bpa-free and available in supersoft, soft, and medium bristle and can be used with the radius source toothbrush and tour travel toothbrush. all radius toothbrushes are manufactured in the usa on low-energy machines, are cruelty free, leaping bunny certified, and are 100% satisfaction guaranteed.',
                   'the source toothbrush utilizes replacement head technology which reduces toothbrush waste by 93%! the unbreakable “slim neck” replacement head contains wide, fanned bristles to help invigorate gum tissue for reduced chances of bleeding/receding gums and enamel erosion and is best to reach tight spaces. the upcycled #5 one-of-a-kind handle contains wood, paper, or money for a beautiful, natural look & feel. reversible right or left handed design for ergonomic brushing helps reduce pressure on teeth and gums while the 100% vegetable based nylon bristles reduce dependency on fossil fuels/petroleum. the source toothbrush & replacement heads are bpa-free and available in supersoft, soft, and medium bristle. all radius toothbrushes are manufactured in the usa on low-energy machines, are cruelty free, leaping bunny certified, and are 100% satisfaction guaranteed.',
                   'this mild foaming cleanser gently removes oil and other residues while oil-free moisturizers keep skin soft, never dry. aloe vera and extracts of cucumber, sea kelp, birch bark and lavender refresh the skin. ph 5.0 / vegan / gluten free.'
                  ]

train_labels = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
               ]

# Bundle the raw inputs as the (sequences, labels) tuple make_tfrecord expects.
# Named train_data (not dataset) so the name `dataset` is only ever bound to
# the tf.data pipeline built further down.
train_data = (train_sequences, train_labels)

# create TFRecord file
make_tfrecord(train_data)

---
## 3. Using the TFRecord file to create a tf.data Dataset

In [6]:
# Read the serialized records back from the TFRecord file and de-serialize
# each one with parse_exmp (defined earlier) via dataset.map().
dataset = tf.data.TFRecordDataset('my_dataset.tfrecord').map(parse_exmp)

In [7]:
# configure the dataset: number of epochs, shuffle buffer, and batch size
epochs = 3
buffer_size = len(train_labels)
batch_size = 2

# KEYPOINT:
# 1 - our dataset consists of sequences of varying length, so we opt for dynamic padding
#     using dataset.padded_batch
# 2 - the manner of padding is specified through "padded_shapes", which takes the format:
#     [] if the member to parse is a scalar
#     [max_length] if the member to parse is a list
#     [d1,...,dn] if the member is an array (where d1 specifies the first dimension)
#     NOTE: the order of the padded shapes must match the order of the members returned
#           by parse_exmp, i.e. (tokens, label, seq_length)
max_tokens = 500  # tokens are padded up to this length; a longer sequence cannot be batched
n_labels = 100    # length of each label vector
padded_shapes = ([max_tokens], [n_labels], [])

# Shuffle BEFORE repeat so that every epoch is a full pass over the data;
# repeating first would let the shuffle buffer mix records from different
# epochs (the same record could then show up twice in one batch).
dataset = dataset.shuffle(buffer_size).repeat(epochs).padded_batch(batch_size, padded_shapes=padded_shapes)

In [8]:
# dtypes of the (tokens, label, seq_length) components produced by parse_exmp
dataset.output_types

(tf.string, tf.int64, tf.int64)

In [9]:
# static shapes of the batched (tokens, label, seq_length) components;
# the leading dimension is the batch dimension
dataset.output_shapes

(TensorShape([Dimension(None), Dimension(500)]),
 TensorShape([Dimension(None), Dimension(100)]),
 TensorShape([Dimension(None)]))

In [10]:
# create a one-shot iterator that yields one padded batch per get_next() evaluation
iterator = dataset.make_one_shot_iterator()
tokens, label, seq_length = iterator.get_next()

# evaluate each obj
print('Evaluated Tensor Objects:')
with tf.Session() as sess:
    # Fetch all three tensors in ONE run call. Each separate evaluation of a
    # get_next() tensor advances the iterator, so evaluating them one by one
    # would print tokens, labels and lengths taken from three different batches.
    tokens_val, label_val, seq_length_val = sess.run([tokens, label, seq_length])
    print('tokens: ', tokens_val)
    print('label: ', label_val)
    print('sequence length: ', seq_length_val)

Evaluated Tensor Objects:
tokens:  [[b'this' b'mild' b'foaming' b'cleanser' b'gently' b'removes' b'oil' b'and'
  b'other' b'residues' b'while' b'oil-free' b'moisturizers' b'keep' b'skin'
  b'soft,' b'never' b'dry.' b'aloe' b'vera' b'and' b'extracts' b'of'
  b'cucumber,' b'sea' b'kelp,' b'birch' b'bark' b'and' b'lavender'
  b'refresh' b'the' b'skin.' b'ph' b'5.0' b'/' b'vegan' b'/' b'gluten'
  b'free.' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b'' b''
  b'' b'' b''