In [1]:
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
# TF Hub handles for the uncased English BERT preprocessing model and the
# matching BERT encoder (12 layers, hidden size 768, 12 attention heads, per the handle name).
# NOTE(review): `preproces_url` is misspelled (missing an "s"); the name is kept
# as-is because the model-loading cell references it.
preproces_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

In [3]:
# Wrap the TF Hub preprocessing model as a Keras layer; it is called on raw strings below.
preprocess_model = hub.KerasLayer(preproces_url)

In [13]:
# Small batch of example movie reviews to run through the preprocessing model.
text_test = [
    'this is such an amazing movie!',
    'the movie was great!',
    'the movie was meh.',
    'the movie was okish.',
    'the movie was terrible...',
]
text_preprocessed = preprocess_model(text_test)

# Show which tensors the preprocessing step produced.
text_preprocessed.keys()

dict_keys(['input_word_ids', 'input_mask', 'input_type_ids'])

The input mask is a binary mask that tells the model which positions in the input sequence hold real tokens (1) and which hold padding (0).

The token layout is: [CLS] + real tokens + [SEP] + padding tokens. The special tokens [CLS] and [SEP] count as real tokens in the mask.

In [14]:
# Mask for the first sentence: 1 marks a real token (including [CLS] and [SEP]), 0 marks padding.
text_preprocessed['input_mask'][0]

<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>

In [15]:
# Token type ids for the first sentence; every position is 0 in the output.
# presumably because each input is a single text segment — confirm against the preprocessing model docs
text_preprocessed['input_type_ids'][0]

<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>

Inspecting the word ids shows that each token is represented by its index in the model's vocabulary.
Special tokens and their ids:
CLS = [101], SEP = [102], PAD = [0], UNK = [100], MASK = [103]

In [19]:
# Vocabulary ids for the second sentence ('the movie was great!'):
# 101 ([CLS]) first, 102 ([SEP]) after the last real token, then 0 for padding.
text_preprocessed['input_word_ids'][1]

<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([ 101, 1996, 3185, 2001, 2307,  999,  102,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])>

In [20]:
# Load the BERT encoder from TF Hub and run it on the preprocessed batch.
bert_model = hub.KerasLayer(encoder_url)
bert_results = bert_model(text_preprocessed)

# The encoder returns a dict of outputs; list the available keys.
bert_results.keys()

dict_keys(['sequence_output', 'encoder_outputs', 'pooled_output', 'default'])

Embedding of a whole sentence (`pooled_output`, one 768-dimensional vector per input sentence):

In [24]:
# Pooled output for the first sentence: a single vector of 768 floats.
bert_results['pooled_output'][0]

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([-0.92169887, -0.39353448, -0.53931653,  0.6825621 ,  0.43848446,
       -0.1402115 ,  0.8774711 ,  0.26043332, -0.6311293 , -0.99996567,
       -0.26319999,  0.8510528 ,  0.9857181 ,  0.1859024 ,  0.9645722 ,
       -0.60019636, -0.21617573, -0.60716844,  0.33550283, -0.55077523,
        0.7181165 ,  0.9997198 ,  0.40006974,  0.270734  ,  0.48017552,
        0.9554252 , -0.7715861 ,  0.9615095 ,  0.95829624,  0.72734815,
       -0.7410953 ,  0.11914763, -0.99118364, -0.2200876 , -0.6818717 ,
       -0.9901061 ,  0.36168098, -0.7948339 ,  0.07143348,  0.09321037,
       -0.9396671 ,  0.14087623,  0.9998697 , -0.34983578,  0.1341332 ,
       -0.335577  , -0.99999964,  0.18722747, -0.8794071 ,  0.67855793,
        0.53289217,  0.4109211 ,  0.16471209,  0.46610358,  0.3736122 ,
       -0.05912915, -0.17985265,  0.02782775, -0.1882211 , -0.5939096 ,
       -0.61234826,  0.27801985, -0.7515826 , -0.9269336 ,  0.69535595,
        0.529581

Embedding of individual tokens (`sequence_output`):
128 token positions per sentence (including [CLS], [SEP] and padding), 768 features per token

In [26]:
# Shape of the per-token output for the first sentence: (token positions, features).
bert_results['sequence_output'][0].shape

TensorShape([128, 768])

In [27]:
# Per-token embeddings for the first sentence: one 768-feature row per token position.
bert_results['sequence_output'][0]

<tf.Tensor: shape=(128, 768), dtype=float32, numpy=
array([[ 0.19451573,  0.25141695,  0.1907506 , ..., -0.24845074,
         0.38568527,  0.1329099 ],
       [-0.5947868 , -0.3942031 ,  0.25245702, ..., -0.769467  ,
         1.1564163 ,  0.32475683],
       [ 0.00641491, -0.15766448,  0.5461023 , ..., -0.17451026,
         0.60289633,  0.4267228 ],
       ...,
       [ 0.2315864 ,  0.04302847,  0.7753699 , ...,  0.06123883,
         0.09081604, -0.1448613 ],
       [ 0.15656534,  0.0381186 ,  0.8351702 , ...,  0.05899133,
         0.04703417, -0.09123503],
       [ 0.18983454,  0.03888706,  0.8360611 , ...,  0.04247585,
         0.02342616, -0.1090318 ]], dtype=float32)>

Since we are using BERT base, which has 12 encoder layers, `encoder_outputs` contains 12 tensors, one per layer.

In [29]:
# Number of entries in `encoder_outputs`.
len(bert_results['encoder_outputs'])

12

In [30]:
# Shape of the first `encoder_outputs` entry for the whole batch:
# (5 input sentences, 128 token positions, 768 features).
bert_results['encoder_outputs'][0].shape

TensorShape([5, 128, 768])

The last encoder output is identical to the sequence output, i.e. the per-token embeddings produced by the final encoder layer. (The embedding of the whole sentence is `pooled_output`.)

In [34]:
# Exact element-wise comparison reduced with .all(): True only if every value of the
# last `encoder_outputs` entry equals the corresponding value of `sequence_output`.
(bert_results['encoder_outputs'][-1] == bert_results['sequence_output']).numpy().all()

True