### What is BERT? | Deep Learning Tutorial 46 (Tensorflow, Keras & Python)
https://www.youtube.com/watch?v=7kLi8u2dJz0

In [None]:
!pip install tensorflow-text

In [2]:
# hub.KerasLayer (used below) loads the BERT preprocessing and encoder models.
import tensorflow_hub as hub
# NOTE(review): `text` is never referenced by name in the cells visible here; the
# import is presumably kept for its side effect of registering the TF Text ops
# that the preprocessing model relies on — confirm before removing it as unused.
import tensorflow_text as text

In [3]:
# TF Hub handles for the two models loaded in the following cells.
# Encoder: uncased English BERT; the "L-12" and "H-768" in its name line up with
# the 12 encoder outputs and the 768-wide embeddings inspected later on.
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# Preprocessor: the companion model that is applied to the raw input strings
# before they are handed to the encoder.
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [4]:
# Wrap the TF Hub preprocessing model as a Keras layer so it can be called
# directly on a batch of raw strings.
bert_preprocess_model = hub.KerasLayer(handle=preprocess_url)

In [5]:
# Two sample sentences used throughout the rest of the notebook.
text_test = [
    "nice movie indeed",
    "I love python programming",
]

# The preprocessing layer returns a dict of tensors; list its keys to see which
# inputs it produces.
text_preprocessed = bert_preprocess_model(text_test)
text_preprocessed.keys()

dict_keys(['input_word_ids', 'input_type_ids', 'input_mask'])

In [10]:
# Inspect the preprocessed batch. In the input_mask shown below every row is
# padded to 128 positions: 1 marks a real token slot and 0 marks padding
# (5 ones for 'nice movie indeed', 6 for 'I love python programming').
text_preprocessed

{'input_mask': <tf.Tensor: shape=(2, 128), dtype=int32, numpy=
 array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       dtype=int32)>,
 'input_type_ids': <tf.Tensor: shape

In [11]:
# Load the BERT encoder from TF Hub as a Keras layer; it is called on the
# preprocessed batch in the next cell.
bert_model = hub.KerasLayer(handle=encoder_url)

In [12]:
# Run the encoder on the preprocessed batch; the result is a dict of outputs.
bert_results = bert_model(text_preprocessed)
# List the available outputs; each one is inspected in the cells that follow.
bert_results.keys()

dict_keys(['encoder_outputs', 'sequence_output', 'default', 'pooled_output'])

In [14]:
# pooled_output: one embedding per input sentence.
# Shape (2, 768) = (number of sentences, embedding vector size).
#   row 0 -> 'nice movie indeed'
#   row 1 -> 'I love python programming'
# It is derived from the [CLS] token; in general this sentence-level output is
# what people feed into a text-classification head.
# Left as the cell's last expression rather than wrapped in print(), so the
# notebook renders the tensor's repr (values together with shape and dtype).
bert_results['pooled_output']

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.79177415, -0.21411918,  0.49769545, ...,  0.24465223,
        -0.47334486,  0.81758714],
       [-0.9171231 , -0.4793519 , -0.78656995, ..., -0.6175179 ,
        -0.7102687 ,  0.921843  ]], dtype=float32)>

In [18]:
# sequence_output: one embedding per token position.
# Shape (2, 128, 768) = (number of sentences, sentence length including padding,
# embedding vector size).
# For example, 'nice movie indeed' is laid out like
#   <start> <nice> <movie> <indeed> <end> 0 0 0 0 0 0 ...
# and each of those 128 positions (starting with <start>) is then represented by
# a single 768-dimensional vector.
sequence_output = bert_results['sequence_output']
print("shape:", sequence_output.shape)
print(sequence_output)

shape: (2, 128, 768)
tf.Tensor(
[[[ 0.07292046  0.08567826  0.14476815 ... -0.09677101  0.08722129
    0.07711115]
  [ 0.17839423 -0.19006102  0.50349426 ... -0.0586982   0.32717055
   -0.1557847 ]
  [ 0.18701506 -0.43388772 -0.48875096 ... -0.15502794  0.00145145
   -0.24470927]
  ...
  [ 0.12083083  0.12884235  0.46453506 ...  0.07375502  0.17441955
    0.16522071]
  [ 0.07967877 -0.01190688  0.5022537  ...  0.13777734  0.2100222
    0.00624607]
  [-0.07212669 -0.2830346   0.59033346 ...  0.4755189   0.16668485
   -0.08920346]]

 [[-0.07900573  0.36335135 -0.21101598 ... -0.17183757  0.1629974
    0.6724264 ]
  [ 0.27883524  0.43716326 -0.3576475  ... -0.04463673  0.38315123
    0.5887984 ]
  [ 1.2037673   1.0727024   0.48408753 ...  0.24920987  0.40730917
    0.40481845]
  ...
  [ 0.0863004   0.19353823  0.47540024 ...  0.18880165 -0.06474157
    0.31318584]
  [ 0.15887067  0.28572673  0.37340784 ...  0.09309104 -0.04969589
    0.38761133]
  [-0.08079871 -0.09572829  0.26809764 ... 

In [27]:
# encoder_outputs is a list of tensors; its length is 12 because this is the
# BERT base model (the "L-12" in the encoder handle).
# NOTE(review): presumably one entry per encoder layer rather than per attention
# head — confirm against the model documentation.
# end="\n\n" emits the blank separator line before the full list is printed.
print("Length of encoder_outputs:", len(bert_results['encoder_outputs']), end="\n\n")
print(bert_results['encoder_outputs'])

Length of encoder_outputs: 12

[<tf.Tensor: shape=(2, 128, 768), dtype=float32, numpy=
array([[[ 0.12901431,  0.00644744, -0.03614963, ...,  0.04999617,
          0.06149197, -0.02657544],
        [ 1.1753383 ,  1.2140785 ,  1.1569982 , ...,  0.11634368,
         -0.35855377, -0.4049018 ],
        [ 0.03859037,  0.5386998 , -0.21089777, ...,  0.21858189,
          0.7260167 , -1.1158606 ],
        ...,
        [-0.07587016, -0.25421906,  0.70755124, ...,  0.5054201 ,
         -0.18878679,  0.1502834 ],
        [-0.16066605, -0.28089684,  0.5759707 , ...,  0.5275854 ,
         -0.11141382,  0.02887549],
        [-0.04428151, -0.2027958 ,  0.59093547, ...,  0.8133835 ,
         -0.39075807, -0.02601741]],

       [[ 0.1890359 ,  0.02752547, -0.06513736, ..., -0.00620206,
          0.15053889,  0.03165447],
        [ 0.5916149 ,  0.7589138 , -0.0724067 , ...,  0.61903965,
          0.82928896,  0.1616197 ],
        [ 1.4460827 ,  0.44602662,  0.40990263, ...,  0.4825589 ,
          0.6269

In [34]:
# The last entry of encoder_outputs is exactly the same as sequence_output.
# NOTE(review): this is an element-wise comparison and the array displayed below
# is truncated with '...', so only the visible entries are shown to be True;
# reducing the result (e.g. with `.numpy().all()`) would confirm every element.
bert_results['encoder_outputs'][-1] == bert_results['sequence_output']

<tf.Tensor: shape=(2, 128, 768), dtype=bool, numpy=
array([[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],

       [[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]]])>

In [35]:
# 'default' output: shape (2, 768); the values displayed below are the same as
# the ones shown for 'pooled_output'.
bert_results['default']

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.79177415, -0.21411918,  0.49769545, ...,  0.24465223,
        -0.47334486,  0.81758714],
       [-0.9171231 , -0.4793519 , -0.78656995, ..., -0.6175179 ,
        -0.7102687 ,  0.921843  ]], dtype=float32)>

In [37]:
# 'pooled_output' once more, for comparison with 'default' in the previous cell:
# the displayed values are identical.
bert_results['pooled_output']

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.79177415, -0.21411918,  0.49769545, ...,  0.24465223,
        -0.47334486,  0.81758714],
       [-0.9171231 , -0.4793519 , -0.78656995, ..., -0.6175179 ,
        -0.7102687 ,  0.921843  ]], dtype=float32)>