In [None]:
# install huggingface Transformers [https://huggingface.co/transformers/installation.html]

# Many transformer based models in a single library: https://github.com/huggingface/transformers#model-architectures
! pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [None]:
# Reference: https://medium.com/tensorflow/using-tensorflow-2-for-state-of-the-art-natural-language-processing-102445cda54a
# Ref: https://huggingface.co/transformers/notebooks.html 
# https://huggingface.co/docs/transformers/training

In [None]:
# Colab line magic that selects the TensorFlow 2.x runtime; it must run before the import below.
# NOTE(review): this magic is Colab-specific — confirm the target environment before running elsewhere.
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)  # the recorded run printed 2.7.0

2.7.0


## Tokenization

* Whatever the NLP task, at the end of the day we have to tokenize the data.

In [None]:
# Tokenization: map words to ids
# Refer: https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb#scrollTo=LgktNYt7ADPS

# Simple example: whitespace tokenization plus a word -> id lookup table.

s           = "very very long corpus..."
words       = s.split(" ")  # Split on single spaces
# Number the unique words in sorted order. Iterating a bare set of strings can
# yield a different order in each Python process, which would make the ids
# change from run to run; sorting keeps them stable.
vocabulary  = {word: idx for idx, word in enumerate(sorted(set(words)))}  # Maps each word to its id

print(vocabulary)

# Problem with whole-word ids: related forms get unrelated ids, e.g. cat(1123) vs cats(1346)

{'corpus...': 0, 'long': 1, 'very': 2}


* So each individual word has a numeric value associated with it.

### Sub-tokenization

- Why? Consider fast vs. faster, or cat vs. cats.

- In a simple tokenization scheme, fast and faster are considered different words, so they get different indices (the numeric values associated with them). These numeric values do not tell us that fast and faster are the same word with different suffixes. We could do more pre-processing and remove the suffix by converting the words to their root form (using stemmers), but BERT and other Transformer models typically use sub-tokenization instead.

- Example: cats --> [cat, ##s]

* First of all, **fast** is assigned a numeric value. The word **faster** then gets two indices, one for [fast] and one for [##er], so the single word **faster** is broken into two tokens. The **##** prefix means that the given token has to be concatenated with the previous token to get the actual word.

* Similarly, if we have quick and quicker, **quick** gets a numeric value and **quicker** is divided into [quick] and [##er]. The advantage is that the numeric value assigned to [##er] from faster and to [##er] from quicker is the same, and the value assigned to quick is the same whether it comes from 'quick' or from 'quicker'.

- Image: https://nlp.fast.ai/images/multifit_vocabularies.png

<img src = "https://nlp.fast.ai/images/multifit_vocabularies.png" alt = 
"Smiley face" height="75%" width="75%">


### Tokenization in huggingface


In [None]:
from transformers import BertTokenizer

# https://huggingface.co/docs/transformers/model_doc/bert
# Load the tokenizer that belongs to the pre-trained "bert-base-cased" checkpoint
# (its files are downloaded on first use, see the progress bars below).

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased") # "cased": upper-case and lower-case forms of a word are treated as different tokens.
                                                                  # "uncased": the tokenizer is case-insensitive.
# Every BERT/Transformer checkpoint has its own tokenizer, so load tokenizer and model from the same checkpoint name.

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Refer BERT architecture from the previous videos in the course.

# https://huggingface.co/transformers/main_classes/tokenizer.html

# Show the tokenizer's special tokens, one per line: classifier, separator, mask and padding.
print(
    bert_tokenizer.cls_token,
    bert_tokenizer.sep_token,
    bert_tokenizer.mask_token,
    bert_tokenizer.pad_token,
    sep="\n",
)
# The padding token fills up inputs that are shorter than the length the model is fed with
# (NOTE(review): the original note names 512 tokens as that length — confirm for this checkpoint).

# Remember: [CLS] always has to be the first token of the input.

[CLS]
[SEP]
[MASK]
[PAD]


In [None]:
Input_String = "Hi, I am James bond 007 !"

# encode() turns the string into a list of token ids.
enc = bert_tokenizer.encode(Input_String)

print(enc,len(enc)) # enc is the encoded equivalent of the string above (11 ids in the recorded run).

# decode() maps the ids back to text.
print(bert_tokenizer.decode(enc))
# The decoded text contains two extra symbols that encode() added: [CLS] at the start and [SEP] at the end
# (ids 101 and 102 in the recorded output).

[101, 8790, 117, 146, 1821, 1600, 7069, 3135, 1559, 106, 102] 11
[CLS] Hi, I am James bond 007! [SEP]


In [None]:
# Decode two individual ids from the encoding above to see which token each one stands for.
print(
    bert_tokenizer.decode([146]),
    bert_tokenizer.decode([106]),
    sep="\n",
)

I
!


In [None]:
# return_tensors="tf" makes encode() return a TensorFlow tensor with a leading batch axis
# (shape (1, 8) in the recorded output) instead of a plain Python list.
enc = bert_tokenizer.encode("I see many cats and dogs",return_tensors = "tf")
print(enc,type(enc))

# Here "cats" is not broken into two sub-words: 6 words give 6 ids, plus [CLS] and [SEP].

tf.Tensor([[  101   146  1267  1242 11771  1105  6363   102]], shape=(1, 8), dtype=int32) <class 'tensorflow.python.framework.ops.EagerTensor'>


In [None]:
# Encode a sentence containing both "fast" and "faster" to see how this tokenizer treats them.
enc = bert_tokenizer.encode("Dope Money comes fast and goes faster")
print(enc)

print(bert_tokenizer.decode(enc))

# Here "faster" is not broken into two sub-words: the ids after "and" (1105, as in the previous cell)
# are one each for "goes" and "faster", followed by [SEP].
# The recorded output has 10 ids for 7 words + 2 special tokens, so exactly one earlier word was split
# into two pieces. NOTE(review): presumably "Dope" — confirm with convert_ids_to_tokens.
# Every tokenizer has its own vocabulary, so which words get split depends on the checkpoint.

[101, 2091, 3186, 8948, 2502, 2698, 1105, 2947, 4946, 102]
[CLS] Dope Money comes fast and goes faster [SEP]


## BERT Models
- DistilBERT
- RoBERTa
- https://miro.medium.com/max/2000/1*IFVX74cEe8U5D1GveL1uZA.png 
<img src="https://miro.medium.com/max/2000/1*IFVX74cEe8U5D1GveL1uZA.png " alt="Smiley face" height="75%" width="75%">

- The y-axis shows the number of parameters, in millions.

**Which BERT model to choose?**

- https://miro.medium.com/max/1400/1*bSUO_Qib4te1xQmBlQjWaw.png

<img src="https://miro.medium.com/max/1400/1*bSUO_Qib4te1xQmBlQjWaw.png " alt="Smiley face" height="75%" width="75%">

- General Language Understanding Evaluation (GLUE)  : https://gluebenchmark.com/


* So DistilBERT is the more common choice these days.
* However, with a one-line code change we can use any of the models above.

In [None]:
import tensorflow as tf

# Refer: https://huggingface.co/transformers/model_doc/distilbert.html#

from transformers import DistilBertTokenizer, TFDistilBertModel
# Naming convention in transformers: model classes prefixed with "TF" are the TensorFlow implementations;
# the same class name without the prefix (e.g. DistilBertModel) is the PyTorch implementation.

distil_bert = 'distilbert-base-uncased' # Name of the pre-trained checkpoint

# DistilBERT 

tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)
model     = TFDistilBertModel.from_pretrained(distil_bert) # TF-prefixed class, because the code in this notebook is written with TensorFlow.

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
print(tokenizer.cls_token)  # the DistilBERT tokenizer uses the same [CLS] marker as the BERT tokenizer above

[CLS]


In [None]:
# Without return_tensors, encode() returns a plain Python list of ids.
enc = tokenizer.encode("I see many cats and dogs")
print(enc,type(enc))

# The checkpoint is "uncased": the decoded text comes back lower-cased ("i see ...").
print(tokenizer.decode(enc))

[101, 1045, 2156, 2116, 8870, 1998, 6077, 102] <class 'list'>
[CLS] i see many cats and dogs [SEP]


### Extract features using BERT

In [None]:
# Obtain the 768-dim vector corresponding to [CLS], which serves as a sentence vector (video: watch after 50:00)

e = tokenizer.encode("Hello, my dog is cute")
print(e) # Plain Python list of token ids

# Models consume batches, so the id list is turned into a TF tensor with a leading batch axis.
# NOTE(review): `input` shadows Python's built-in input(); it is kept because other cells use this name.
input = tf.constant(e)[None, :]  # Batch size 1 
# tf.constant(e) converts the list into a tensor; indexing it with [None, :] inserts a new leading axis
# (the batch axis, of size 1) and ":" keeps all the values of e.
print(input) # TF tensor of shape [1,8]: one sentence with 8 tokens.
print(type(input)) # shape: [1,8]; the concrete class is EagerTensor

# Eager execution: operations run immediately instead of waiting for a computational graph to be built and evaluated.

output = model(input)

print("\n Type of the Output",type(output))   # A TFBaseModelOutput (see the recorded output), not a plain tuple.
print("\n Length of the Output",len(output))  # The recorded run printed 2. NOTE(review): that matches a model loaded with
                                              # output_hidden_states=True (done in a later cell), which suggests this cell was
                                              # last run after `model` was rebound — confirm with a fresh top-to-bottom run.

print("\n",output) # For the [1,8] input, last_hidden_state has shape [1,8,768]: 1 sentence in the batch, 8 tokens, and
# a 768-dimensional vector for every input token.

[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]
tf.Tensor([[  101  7592  1010  2026  3899  2003 10140   102]], shape=(1, 8), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>

 Type of the Output <class 'transformers.modeling_tf_outputs.TFBaseModelOutput'>

 Length of the Output 2

 TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-1.8296386e-01, -7.4054033e-02,  5.0267726e-02, ...,
         -1.1260688e-01,  4.4493079e-01,  4.0941307e-01],
        [ 7.0607476e-04,  1.4825349e-01,  3.4328306e-01, ...,
         -8.6039692e-02,  6.9474775e-01,  4.3352805e-02],
        [-5.0720620e-01,  5.3085524e-01,  3.7162673e-01, ...,
         -5.6287491e-01,  1.3755690e-01,  2.8475243e-01],
        ...,
        [-4.2251337e-01,  5.7314694e-02,  2.4338314e-01, ...,
         -1.5222691e-01,  2.4462417e-01,  6.4154840e-01],
        [-4.9384442e-01, -1.8895462e-01,  1.2640835e-01, ...,
          6.3240513e-02,  3.6912850e-01, -5.8251895e-02

* Now we want the 768-dimensional vector that corresponds to [CLS] (refer to the theory section).

In [None]:
output[0].shape # output[0] is the first entry of the returned model output (last_hidden_state): [sentences, tokens, features]

TensorShape([1, 8, 768])

In [None]:
# Display the last_hidden_state tensor: one 768-dim vector per token of the single sentence in the batch.
output[0]

<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-1.8296386e-01, -7.4054033e-02,  5.0267726e-02, ...,
         -1.1260688e-01,  4.4493079e-01,  4.0941307e-01],
        [ 7.0607476e-04,  1.4825349e-01,  3.4328306e-01, ...,
         -8.6039692e-02,  6.9474775e-01,  4.3352805e-02],
        [-5.0720620e-01,  5.3085524e-01,  3.7162673e-01, ...,
         -5.6287491e-01,  1.3755690e-01,  2.8475243e-01],
        ...,
        [-4.2251337e-01,  5.7314694e-02,  2.4338314e-01, ...,
         -1.5222691e-01,  2.4462417e-01,  6.4154840e-01],
        [-4.9384442e-01, -1.8895462e-01,  1.2640835e-01, ...,
          6.3240513e-02,  3.6912850e-01, -5.8251895e-02],
        [ 8.3268648e-01,  2.4948204e-01, -4.5439535e-01, ...,
          1.1997566e-01, -3.9257306e-01, -2.7785379e-01]]], dtype=float32)>

In [None]:
# Vector corresponding to [CLS]
print((output[0])[0,0,:])  # output[0] has shape [1,8,768]: the first 0 picks the only sentence in the batch,
#                            the second 0 picks its first token, and ":" keeps all 768 features of that token.
# [CLS] is always the first token of the input, so this slice is the [CLS] vector.
# The result is a 1-D tensor with 768 entries.

tf.Tensor(
[-1.82963863e-01 -7.40540326e-02  5.02677262e-02 -3.49530488e-01
 -7.28532374e-02 -2.63872653e-01  2.39293322e-01  4.79842007e-01
 -2.14802459e-01 -1.89516261e-01  8.99826735e-02 -1.29189000e-01
 -1.11275926e-01  3.16634446e-01 -8.25904980e-02  9.26226079e-02
 -2.09083185e-02  4.74876046e-01  1.28833592e-01  3.18717025e-03
 -1.53505713e-01 -3.57001752e-01  9.89284366e-04 -3.92748881e-03
  1.38443913e-02 -5.49409837e-02  8.45261812e-02  1.36564448e-01
  2.18252301e-01 -1.96798846e-01  2.47994587e-02  1.75569281e-01
 -3.97216901e-02 -1.10777110e-01  5.48525862e-02  6.07531480e-02
  1.71999596e-02 -1.07415289e-01 -8.76946598e-02  2.12042004e-01
 -4.05892953e-02 -3.17959487e-02  1.37656942e-01 -1.39004648e-01
 -4.68861684e-03 -3.97633225e-01 -2.60034585e+00 -1.08741723e-01
  4.86708432e-02 -3.61387491e-01  3.71814519e-01 -7.61094987e-02
  3.23910490e-02  2.31666192e-01  2.63016075e-01  3.18299681e-01
 -3.87970477e-01  2.98111022e-01 -4.93029654e-02 -3.59301642e-02
  1.58540696e-

* This is all good: we get the feature-vector representation produced at the end of all the encoders. But what if we want the representation after any encoder layer of our choice?

* If you recall, the output from the last four hidden layers outperformed all the other options.

In [None]:
# How about the outputs of the hidden layers?

#https://huggingface.co/transformers/model_doc/distilbert.html#distilbertconfig

from transformers import  DistilBertConfig # Every model in Hugging Face has a config; it describes how we want the model to be set up.

# Start from the checkpoint's own config and additionally request the hidden states.
config = DistilBertConfig.from_pretrained(distil_bert, output_hidden_states = True)


e     = tokenizer.encode("Hello, my dog is cute")
input = tf.constant(e)[None, :]  # Batch size 1 
# NOTE: this rebinds `model`; every cell executed after this one uses the model that also returns hidden states.
model = TFDistilBertModel.from_pretrained(distil_bert, config = config)

print(model.config) # The config in use; the printout contains "output_hidden_states": true

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.15.0",
  "vocab_size": 30522
}



In [None]:
output = model(input) # Returns a TFBaseModelOutput (it supports len() and integer indexing), not a plain tuple
print(len(output)) # 2 in the recorded run: last_hidden_state and hidden_states

2


In [None]:
print(output) # Shows both fields: last_hidden_state and the hidden_states tuple

TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-1.8296386e-01, -7.4054033e-02,  5.0267726e-02, ...,
         -1.1260688e-01,  4.4493079e-01,  4.0941307e-01],
        [ 7.0607476e-04,  1.4825349e-01,  3.4328306e-01, ...,
         -8.6039692e-02,  6.9474775e-01,  4.3352805e-02],
        [-5.0720620e-01,  5.3085524e-01,  3.7162673e-01, ...,
         -5.6287491e-01,  1.3755690e-01,  2.8475243e-01],
        ...,
        [-4.2251337e-01,  5.7314694e-02,  2.4338314e-01, ...,
         -1.5222691e-01,  2.4462417e-01,  6.4154840e-01],
        [-4.9384442e-01, -1.8895462e-01,  1.2640835e-01, ...,
          6.3240513e-02,  3.6912850e-01, -5.8251895e-02],
        [ 8.3268648e-01,  2.4948204e-01, -4.5439535e-01, ...,
          1.1997566e-01, -3.9257306e-01, -2.7785379e-01]]], dtype=float32)>, hidden_states=(<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[ 0.3469352 , -0.16263762, -0.23334563, ...,  0.14869013,
          0.08653456,  0.

In [None]:
print(output[0]) # Index 0 is the last_hidden_state tensor, shape (1, 8, 768).

tf.Tensor(
[[[-1.8296386e-01 -7.4054033e-02  5.0267726e-02 ... -1.1260688e-01
    4.4493079e-01  4.0941307e-01]
  [ 7.0607476e-04  1.4825349e-01  3.4328306e-01 ... -8.6039692e-02
    6.9474775e-01  4.3352805e-02]
  [-5.0720620e-01  5.3085524e-01  3.7162673e-01 ... -5.6287491e-01
    1.3755690e-01  2.8475243e-01]
  ...
  [-4.2251337e-01  5.7314694e-02  2.4338314e-01 ... -1.5222691e-01
    2.4462417e-01  6.4154840e-01]
  [-4.9384442e-01 -1.8895462e-01  1.2640835e-01 ...  6.3240513e-02
    3.6912850e-01 -5.8251895e-02]
  [ 8.3268648e-01  2.4948204e-01 -4.5439535e-01 ...  1.1997566e-01
   -3.9257306e-01 -2.7785379e-01]]], shape=(1, 8, 768), dtype=float32)


In [None]:
output[0].shape # [1 sentence, 8 tokens, 768 features]

TensorShape([1, 8, 768])

In [None]:
output[0][0].shape # Indexing away the batch axis leaves [8 tokens, 768 features] for the single sentence

TensorShape([8, 768])

In [None]:
type(output[1]) # Index 1 is the hidden_states field, which is a tuple

tuple

In [None]:
output[1][0].shape 

# output[1] holds all the hidden-state outputs (the hidden_states field shown when printing `output`).
# output[1][i] is the i-th entry of that tuple.

# Hence output[1][0] is the first entry.
# NOTE(review): the tuple has 7 entries while the config reports n_layers = 6, so entry 0 is presumably the
# embedding output and entry i (i >= 1) the output of encoder layer i — confirm against the Hugging Face docs.

TensorShape([1, 8, 768])

https://trishalaneeraj.github.io/2020-04-04/feature-based-approach-with-bert 

Go through this blog to understand the dimensions 

In [None]:
# Inspect the hidden_states tuple: its type, how many entries it has, and its last entry.
# It has 7 entries although the config reports n_layers = 6 — presumably the embedding output
# plus one output per encoder layer (NOTE(review): confirm against the Hugging Face docs).
# Entry 6 has shape (1,8,768); in the recorded run its values equal those printed for output[0].
print(
    type(output[1]),
    len(output[1]),
    output[1][6],
    sep="\n",
)

<class 'tuple'>
7
tf.Tensor(
[[[-1.8296386e-01 -7.4054033e-02  5.0267726e-02 ... -1.1260688e-01
    4.4493079e-01  4.0941307e-01]
  [ 7.0607476e-04  1.4825349e-01  3.4328306e-01 ... -8.6039692e-02
    6.9474775e-01  4.3352805e-02]
  [-5.0720620e-01  5.3085524e-01  3.7162673e-01 ... -5.6287491e-01
    1.3755690e-01  2.8475243e-01]
  ...
  [-4.2251337e-01  5.7314694e-02  2.4338314e-01 ... -1.5222691e-01
    2.4462417e-01  6.4154840e-01]
  [-4.9384442e-01 -1.8895462e-01  1.2640835e-01 ...  6.3240513e-02
    3.6912850e-01 -5.8251895e-02]
  [ 8.3268648e-01  2.4948204e-01 -4.5439535e-01 ...  1.1997566e-01
   -3.9257306e-01 -2.7785379e-01]]], shape=(1, 8, 768), dtype=float32)


 **The same steps as above apply to any Transformer/BERT-like model.**

### Fine-tuning for various tasks

- Refer: https://arxiv.org/pdf/1810.04805.pdf

**Links to Useful Blogs**

https://medium.com/swlh/a-simple-guide-on-using-bert-for-text-classification-bbf041ac8d04

