# Word embeddings

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

In [2]:
print(tf.__version__)

2.2.0


In [3]:
embedding_layer = layers.Embedding(1000, 5)

In [4]:
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

array([[ 0.005047  ,  0.02351889, -0.03519099, -0.0054667 ,  0.00313326],
       [-0.01975456, -0.04510192,  0.02065016,  0.0410821 ,  0.00618936],
       [ 0.01689341,  0.00601308,  0.0071759 , -0.04490727,  0.02445065]],
      dtype=float32)

In [5]:
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,
    as_supervised=True
)



In [6]:
encoder = info.features['text'].encoder
encoder.subwords[:20]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_']

In [14]:
train_batches = train_data.shuffle(1000).padded_batch(10)
test_batches = test_data.shuffle(1000).padded_batch(10)

In [15]:
train_batch, train_label = next(iter(train_batches))
train_batch.numpy()

array([[3209, 1493,    5, ...,    0,    0,    0],
       [7963,   19, 4829, ...,    0,    0,    0],
       [ 133, 3306,  124, ...,    0,    0,    0],
       ...,
       [ 404,   12,   83, ...,    0,    0,    0],
       [  62,   27,    9, ...,    0,    0,    0],
       [  12,  176, 1037, ...,    0,    0,    0]])

In [17]:
embedding_dim = 16
model = keras.Sequential([
    layers.Embedding(encoder.vocab_size, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1)
])
model.compile(optimizer='adam',
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 16)          130960    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________


In [18]:
history = model.fit(
    train_batches,
    epochs=10,
    validation_data=test_batches,
    validation_steps=20
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
import matplotlib.pyplot as plt

history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss=history_dict['loss']
val_loss=history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,9))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure(figsize=(12,9))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()

<Figure size 1200x900 with 1 Axes>

<Figure size 1200x900 with 1 Axes>

In [20]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(8185, 16)


# Transformer model for language understanding

In [21]:
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt

In [24]:
(train_examples, val_examples), metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    as_supervised=True)


In [26]:
# Custom subword tokenize from training set
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2**13
)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt , en in train_examples), target_vocab_size=2**13
)

In [27]:
sample_string = "Transformer is awesome"
tokenized_string = tokenizer_en.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print('The origin string: {}'.format(original_string))


Tokenized string is [7915, 1248, 7946, 7194, 13, 2799]
The origin string: Transformer is awesome


In [29]:
for ts in tokenized_string:
    print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

7915 ----> T
1248 ----> ran
7946 ----> s
7194 ----> former 
13 ----> is 
2799 ----> awesome


In [33]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64


In [32]:
def encode(lang1, lang2):
    lang1 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
    lang1.numpy()) + [tokenizer_pt.vocab_size+1]
    
    lang2 = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(
    lang2.numpy()) + [tokenizer_pt.vocab_size+1]
    
    return lang1, lang2

In [34]:
def tf_encode(pt, en):
    result_pt, result_en = tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
    result_pt.set_shape([None])
    result_en.set_shape([None])
    
    return result_pt, result_en

In [35]:
MAX_LENGTH = 40

In [36]:
def filter_max_length(x, y, max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                         tf.size(y) <= max_length)


In [37]:
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(BATCH_SIZE)

In [38]:
pt_batch, en_batch = next(iter(val_dataset))
pt_batch, en_batch

(<tf.Tensor: shape=(64, 29), dtype=int64, numpy=
 array([[8214,  378,    1, ...,    0,    0,    0],
        [8214, 1293, 1314, ...,    0,    0,    0],
        [8214,   18, 2266, ..., 8055,    2, 8215],
        ...,
        [8214, 1023,  639, ...,    0,    0,    0],
        [8214, 1646,   63, ...,    0,    0,    0],
        [8214,   61,  103, ...,    0,    0,    0]])>,
 <tf.Tensor: shape=(64, 40), dtype=int64, numpy=
 array([[8214, 5148, 8068, ...,    0,    0,    0],
        [8214, 1307, 7990, ...,    0,    0,    0],
        [8214, 3347,   17, ...,    0,    0,    0],
        ...,
        [8214,  203, 2000, ...,    0,    0,    0],
        [8214, 1533, 7990, ...,    0,    0,    0],
        [8214,  708, 7990, ...,    0,    0,    0]])>)

# Fine tuning BERT

In [126]:
!pip install -q tf-nightly
!pip install -q tf-models-nightly

In [135]:
!pip install tf-models-official

Collecting tf-models-official
  Downloading tf_models_official-2.2.1-py2.py3-none-any.whl (711 kB)
[K     |████████████████████████████████| 711 kB 932 kB/s eta 0:00:01
Collecting mlperf-compliance==0.0.10
  Downloading mlperf_compliance-0.0.10-py3-none-any.whl (24 kB)


Installing collected packages: mlperf-compliance, tf-models-official
Successfully installed mlperf-compliance-0.0.10 tf-models-official-2.2.1


In [136]:
import tensorflow as tf
print(tf.__version__)

2.2.0


In [133]:
from tensorflow.keras.layers.experimental import *

In [137]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
print(tf.__version__)
import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

2.2.0


In [138]:
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [140]:
glue, info = tfds.load('glue/mrpc', with_info=True,
                      batch_size=-1)


[1mDownloading and preparing dataset glue/mrpc/1.0.0 (download: 1.43 MiB, generated: Unknown size, total: 1.43 MiB) to /home/user/tensorflow_datasets/glue/mrpc/1.0.0...[0m
Shuffling and writing examples to /home/user/tensorflow_datasets/glue/mrpc/1.0.0.incompleteTPIYAE/glue-train.tfrecord
Shuffling and writing examples to /home/user/tensorflow_datasets/glue/mrpc/1.0.0.incompleteTPIYAE/glue-validation.tfrecord
Shuffling and writing examples to /home/user/tensorflow_datasets/glue/mrpc/1.0.0.incompleteTPIYAE/glue-test.tfrecord
[1mDataset glue downloaded and prepared to /home/user/tensorflow_datasets/glue/mrpc/1.0.0. Subsequent calls will reuse this data.[0m


In [141]:
list(glue.keys())

['test', 'train', 'validation']

In [142]:
info.features

FeaturesDict({
    'idx': tf.int32,
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'sentence1': Text(shape=(), dtype=tf.string),
    'sentence2': Text(shape=(), dtype=tf.string),
})

In [143]:
info.features['label'].names


['not_equivalent', 'equivalent']

In [147]:
glue_train = glue['train']
for key, value in glue_train.items():
    print("{}: {}".format(key, value[0]) )

idx: Tensor("strided_slice_2:0", shape=(), dtype=int32)
label: Tensor("strided_slice_3:0", shape=(), dtype=int64)
sentence1: Tensor("strided_slice_4:0", shape=(), dtype=string)
sentence2: Tensor("strided_slice_5:0", shape=(), dtype=string)


## Bert tokenizer

In [148]:
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(gs_folder_bert, 'vocab.txt'),
    do_lower_case=True
)
print('Vocab size:', len(tokenizer.vocab))

Vocab size: 30522


In [149]:
tokens = tokenizer.tokenize("Hello Tensorflow!")
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

['hello', 'tensor', '##flow', '!']
[7592, 23435, 12314, 999]


# preprocess the data

In [153]:
tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])

[101, 102]

In [158]:
def encode_sentence(s):
    tokens = list(tokenizer.tokenize(s.numpy()))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

sentence1 = tf.ragged.constant([
    encode_sentence(s) for s in glue_train["sentence1"]])

sentence2 = tf.ragged.constant([
    encode_sentence(s) for s in glue_train["sentence2"]])

OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.