<a href="https://colab.research.google.com/github/CaiWenhui/test/blob/master/tfdata%E5%8A%A0%E8%BD%BD%E6%96%87%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [3]:
# Base URL and file names of the three text files to download.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# get_file downloads each file (see the "Downloading data from" log below)
# and returns its local path.
downloaded_paths = [
    tf.keras.utils.get_file(file_name, origin=DIRECTORY_URL + file_name)
    for file_name in FILE_NAMES
]

# The directory of the last downloaded file is used as the common data
# directory when the files are re-opened by name later on.
parent_dir = os.path.dirname(downloaded_paths[-1])

parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


'/root/.keras/datasets'

In [5]:
def labeler(example, index):
  """Pair an example with its label.

  Args:
    example: the dataset element to label (passed through unchanged).
    index: the label value; it is cast to tf.int64.

  Returns:
    A tuple (example, label) where label is `index` cast to tf.int64.
  """
  label = tf.cast(index, tf.int64)
  return example, label

# One labeled dataset per file; the label is the file's position in FILE_NAMES.
labeled_data_sets = []

for label_index, file_name in enumerate(FILE_NAMES):
  file_path = os.path.join(parent_dir, file_name)
  text_lines = tf.data.TextLineDataset(file_path)
  # Dataset.map traces the lambda when it is called, so each dataset gets the
  # index that is current in this iteration (labels 0, 1 and 2 all show up in
  # the batches inspected further down).
  labeled_data_sets.append(
      text_lines.map(lambda line: labeler(line, label_index)))

In [6]:
# Pipeline configuration.
BATCH_SIZE = 64        # examples per padded batch
TAKE_SIZE = 5_000      # examples taken for the test split (and skipped for train)
BUFFER_SIZE = 50_000   # shuffle buffer size

In [7]:
# Inspect the first labeled dataset (built from FILE_NAMES[0]).
labeled_data_sets[0]

<MapDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [8]:
# Inspect the second labeled dataset (built from FILE_NAMES[1]).
labeled_data_sets[1]

<MapDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [9]:
# Chain the per-file datasets into a single dataset.
combined = labeled_data_sets[0]
for other_dataset in labeled_data_sets[1:]:
  combined = combined.concatenate(other_dataset)

# reshuffle_each_iteration=False keeps the shuffled order the same on every
# pass, so the later take()/skip() split always selects the same examples.
all_labeled_data = combined.shuffle(
    buffer_size=BUFFER_SIZE, reshuffle_each_iteration=False)


In [11]:
# Print the first (text, label) pair of the shuffled dataset.
for text_and_label in all_labeled_data.take(1):
  print(text_and_label)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Her spare not. Wound her with thy glittering spear.'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


In [14]:
tokenizer = tfds.deprecated.text.Tokenizer()

# Collect every distinct token seen in any line of the dataset.
vocabulary_set = set()
for line_tensor, _label in all_labeled_data:
  vocabulary_set |= set(tokenizer.tokenize(line_tensor.numpy()))

vocab_size = len(vocabulary_set)
vocab_size

17178

In [15]:
# Encoder that maps text to a list of integer token ids, built from the
# vocabulary collected above.
encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

In [19]:
# Pull the text of the first element out of the dataset as raw bytes.
first_text, _first_label = next(iter(all_labeled_data))
example = first_text.numpy()
print(example)

b'Her spare not. Wound her with thy glittering spear.'


In [20]:
# Encode the example line into its list of integer token ids.
encoded_example = encoder.encode(example)
print(encoded_example)

[16261, 16323, 5641, 9287, 2858, 964, 16479, 10069, 8013]


In [21]:
def encode(text_tensor, label):
  """Encode one line of text with the module-level `encoder`.

  Args:
    text_tensor: eager tensor holding the text; read via `.numpy()`.
    label: the label for the text, returned unchanged.

  Returns:
    A tuple (token_ids, label), where token_ids is the encoder's output.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Dataset.map-compatible wrapper around `encode`.

  `encode` calls `.numpy()`, so it is run through tf.py_function. The static
  shapes are then set explicitly on the results.

  Args:
    text: scalar string tensor with one line of text.
    label: scalar label tensor.

  Returns:
    A tuple (encoded_text, label): an int64 vector of unknown length and an
    int64 scalar.
  """
  encoded_text, encoded_label = tf.py_function(
      encode, inp=[text, label], Tout=(tf.int64, tf.int64))

  # Declare the shapes explicitly: a variable-length vector of token ids and
  # a scalar label.
  encoded_text.set_shape([None])
  encoded_label.set_shape([])

  return encoded_text, encoded_label

# Encode every (text, label) pair into (token-id vector, label).
all_encode_data = all_labeled_data.map(encode_map_fn)

In [23]:
# Test split: the first TAKE_SIZE encoded examples, in padded batches.
test_data = all_encode_data.take(TAKE_SIZE).padded_batch(BATCH_SIZE)

# Train split: everything after the first TAKE_SIZE examples, shuffled and
# then grouped into padded batches.
train_data = (
    all_encode_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

In [24]:
# Grab the first batch for inspection. Note: despite its name, `sample_train`
# holds a batch drawn from test_data.
sample_train, sample_label = next(iter(test_data))
# Show the first padded example of the batch together with its label.
sample_train[0], sample_label[0]

(<tf.Tensor: shape=(16,), dtype=int64, numpy=
 array([16261, 16323,  5641,  9287,  2858,   964, 16479, 10069,  8013,
            0,     0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [25]:
# Show all labels in the sampled batch.
sample_label

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 2, 1, 0, 0, 0, 2, 1, 0, 1, 2, 1, 0,
       0, 2, 1, 2, 0, 1, 1, 2, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 2, 0, 1, 1,
       0, 1, 0, 1, 2, 2, 1, 1, 0, 1, 1, 1, 2, 1, 1, 2, 1, 0, 2, 0])>

In [26]:
# The embedding needs one slot more than the number of distinct tokens,
# because padded_batch fills short sequences with 0 (presumably the encoder's
# own ids start at 1 - confirm against the TokenTextEncoder docs).
# Computed from vocabulary_set rather than with `+= 1` so that re-running
# this cell does not keep growing the value.
vocab_size = len(vocabulary_set) + 1

In [35]:
# Start from an empty Sequential model; layers are added in the cells below.
# Re-run this cell before re-running the model.add cells, otherwise layers
# are appended to the existing model a second time.
model = tf.keras.Sequential()

In [36]:
# Embedding layer: vocab_size possible token ids, each mapped to a
# 64-dimensional vector.
model.add(tf.keras.layers.Embedding(vocab_size,64))

In [37]:
# Bidirectional LSTM with 64 units per direction over the embedded sequence.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [38]:
# Scratch check of the layer widths used for the hidden Dense layers;
# this cell only prints and does not touch the model.
for layer_width in [64, 64]:
  print(layer_width)

64
64


In [39]:
# Two fully connected hidden layers of 64 units each, with ReLU activation.
for hidden_units in (64, 64):
  hidden_layer = tf.keras.layers.Dense(hidden_units, activation='relu')
  model.add(hidden_layer)

In [41]:
# Output layer: one unit per class. Labels are the enumerate() indices of
# FILE_NAMES, so the class count is len(FILE_NAMES) (3 here).
# Softmax turns the outputs into class probabilities, which is what the
# 'sparse_categorical_crossentropy' loss expects by default. A ReLU output
# is not a probability distribution and can be all zeros.
model.add(tf.keras.layers.Dense(len(FILE_NAMES), activation='softmax'))

In [46]:
# Labels are integer class ids, hence the sparse categorical loss. When
# given by name, this loss uses the default from_logits=False, i.e. it
# expects the model to output class probabilities.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [47]:
# Train for 3 epochs on train_data, validating against test_data each epoch.
model.fit(train_data, epochs=3, validation_data = test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f95f00bcf10>

In [48]:
# Evaluate on test_data; with metrics=['accuracy'] at compile time,
# evaluate returns the loss followed by the accuracy.
eva_loss,eva_acc = model.evaluate(test_data)

