In [1]:
import tensorflow as tf
import numpy as np
import tempfile

---
## 1. *make_example*, *parse_example* functionalities

In [2]:
def make_example(sequence, labels):
    """
    create a single tf.train.Example obj instance given a single input sample
    in the form of sequence & labels pair.
    """    
    # # preprocess sequence
    # sequence = preprocess_seq(sequence)
    # convert sequence (string) to list of tokens ex.[string, ..., string]
    tfrecords_features = {}
    tfrecords_features['sequence'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[sequence.encode('utf-8')]))
    tfrecords_features['labels'] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[np.array(labels).tostring()]))
    return tf.train.Example(features=tf.train.Features(feature=tfrecords_features))

    
def parse_exmp(serial_exmp):
    """
    instructions to parse a single serialized example object, returns the 
    sequence and labels values
    """    
    features = {
        'sequence': tf.FixedLenFeature([], tf.string),
        'labels' : tf.FixedLenFeature([], tf.string)
    }
    feats = tf.parse_single_example(serialized=serial_exmp, 
                                    features=features)
    
    sequence = feats['sequence']
    labels   = tf.decode_raw(feats['labels'], tf.int64)
    return sequence, labels


### 1.1 Example usage of make_example and parse_exmp

In [3]:
# unit tests

# make a single tf example obj
# note: labels
ex = make_example(sequence='test, test, 113',
                  labels=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# serialize the tf example obj
ex_serial = ex.SerializeToString()
print('Serialized Example:\n', ex_serial)
print()

# parse the serialized example obj
print('Tensors Returned From Parse_exmp:')
sequence, labels = parse_exmp(ex_serial)

# examine obj type
print('sequence: ',sequence)
print('labels: ',labels)
print()

# evaluate each obj
print('Evaluated Tensor Objects:')
with tf.Session() as sess:
    print('sequence: ', sequence.eval())
    print('labels: ', labels.eval())

Serialized Example:
 b'\n\xd5\x06\n\xb1\x06\n\x06labels\x12\xa6\x06\n\xa3\x06\n\xa0\x06\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\

---
## 2. Create a TFRecord file from input dataset

In [7]:
def make_tfrecord(data, outf_nm='my_dataset_2'):
    """
    data is in the format of tuple (sequences, labels), where each sequences and labels
    are list objects of string sequence and label, respectively
    """
    feats, labels = data
    outf_nm += '.tfrecord'
    tfrecord_wrt = tf.python_io.TFRecordWriter(outf_nm)
    n_samples = len(labels)
    for i in range(n_samples):
        exmp = make_example(feats[i], labels[i])
        exmp_serial = exmp.SerializeToString()
        tfrecord_wrt.write(exmp_serial)

In [8]:
# assumed given dataset format

train_sequences = ['the unbreakable “slim neck” replacement head contains wide, fanned bristles to help invigorate gum tissue for reduced chances of bleeding/receding gums and enamel erosion and is best to reach tight spaces. slim, tapered, 100% vegetable based nylon bristles reduce dependency on fossil fuels/petroleum and offer a deep clean and massage of the gumline. replacement heads are bpa-free and available in supersoft, soft, and medium bristle and can be used with the radius source toothbrush and tour travel toothbrush. all radius toothbrushes are manufactured in the usa on low-energy machines, are cruelty free, leaping bunny certified, and are 100% satisfaction guaranteed.',
                   'the source toothbrush utilizes replacement head technology which reduces toothbrush waste by 93%! the unbreakable “slim neck” replacement head contains wide, fanned bristles to help invigorate gum tissue for reduced chances of bleeding/receding gums and enamel erosion and is best to reach tight spaces. the upcycled #5 one-of-a-kind handle contains wood, paper, or money for a beautiful, natural look & feel. reversible right or left handed design for ergonomic brushing helps reduce pressure on teeth and gums while the 100% vegetable based nylon bristles reduce dependency on fossil fuels/petroleum. the source toothbrush & replacement heads are bpa-free and available in supersoft, soft, and medium bristle. all radius toothbrushes are manufactured in the usa on low-energy machines, are cruelty free, leaping bunny certified, and are 100% satisfaction guaranteed.',
                   'this mild foaming cleanser gently removes oil and other residues while oil-free moisturizers keep skin soft, never dry. aloe vera and extracts of cucumber, sea kelp, birch bark and lavender refresh the skin. ph 5.0 / vegan / gluten free.'
                  ]

train_labels = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
               ]

dataset = (train_sequences, train_labels)

# create TFRecord file
make_tfrecord(dataset)

---
## 3. Using the TFRecord file to create TFDataset

In [9]:
# create a tf dataset obj from the TFRecord file
dataset = tf.data.TFRecordDataset('my_dataset_2.tfrecord')

# use dataset.map() in conjunction with the parse_exmp function created earlier
# to de-serialize each example record in TFRecord file
dataset = dataset.map(parse_exmp)

In [10]:
# configure the dataset to set # of epoch, shuffle, and batch size 
epochs = 3
buffer_size = len(train_labels)
batch_size = 2

# configure dataset epoch, shuffle, padding and batching operations
dataset = dataset.repeat(epochs).shuffle(buffer_size).batch(batch_size)

In [11]:
dataset.output_types

(tf.string, tf.int64)

In [12]:
dataset.output_shapes

(TensorShape([Dimension(None)]),
 TensorShape([Dimension(None), Dimension(None)]))

In [16]:
# create a one-shot iterator to parse out one single record example at a time
iterator = dataset.make_one_shot_iterator()
sequence, labels = iterator.get_next()

# evaluate each obj
print('Evaluated Tensor Objects:')
with tf.Session() as sess:
    print('sequence: ', sequence.eval())
    print('labels: ', labels.eval())

Evaluated Tensor Objects:
sequence:  [ b'the unbreakable \xe2\x80\x9cslim neck\xe2\x80\x9d replacement head contains wide, fanned bristles to help invigorate gum tissue for reduced chances of bleeding/receding gums and enamel erosion and is best to reach tight spaces. slim, tapered, 100% vegetable based nylon bristles reduce dependency on fossil fuels/petroleum and offer a deep clean and massage of the gumline. replacement heads are bpa-free and available in supersoft, soft, and medium bristle and can be used with the radius source toothbrush and tour travel toothbrush. all radius toothbrushes are manufactured in the usa on low-energy machines, are cruelty free, leaping bunny certified, and are 100% satisfaction guaranteed.'
 b'the unbreakable \xe2\x80\x9cslim neck\xe2\x80\x9d replacement head contains wide, fanned bristles to help invigorate gum tissue for reduced chances of bleeding/receding gums and enamel erosion and is best to reach tight spaces. slim, tapered, 100% vegetable base