<a href="https://colab.research.google.com/github/Deependrashukla/Deep-Learning/blob/main/LLM_Course_Inference_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install HuggingFace Transformers
!pip install --quiet transformers

In [None]:
# Import PyTorch and the required classes from Hugging Face Transformers
import torch
from transformers import AutoTokenizer, AutoModel

# Cosine similarity is used later on to compare embedding vectors
from sklearn.metrics.pairwise import cosine_similarity

# Hugging Face hosts many BERT-based checkpoints;
# choose the one that fits your use case.
BERT_Model = "bert-base-uncased"

# The tokenizer breaks the input text into tokens (sub-words);
# the model takes those tokens and outputs the embeddings.
tokenizer, model = (
    AutoTokenizer.from_pretrained(BERT_Model),
    AutoModel.from_pretrained(BERT_Model),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Total number of parameters (weights) in the loaded model
sum(weight.numel() for weight in model.parameters())

109482240

# Tokenization using BERT

In [None]:
# NOTE: "Hava" is misspelled; the tokenizer splits this unknown word
# into the sub-word pieces "ha" and "##va" (see the decoded tokens below).
sent = "Hava you watched the movie Serendipity?"

# Tokenize the sentence
# This basically converts the sentence into a sequence of tokens
# Each token is either a complete word or a sub-word

# Calling the tokenizer directly is the recommended replacement for the
# deprecated encode_plus; it returns the same dictionary-like encoding.
# The sentence is padded (or truncated) to exactly max_length tokens.

tokens = tokenizer(sent, max_length=128, truncation=True,
                   padding='max_length', return_tensors='pt')

In [None]:
# Inspect the type of the object returned by the tokenizer
type(tokens)

In [None]:
# tokens can be indexed like a dictionary and has three keys : input_ids, token_type_ids and attention_mask

tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# input_ids holds one row of token ids per input sentence;
# index 0 selects the row of our single sentence.
input_ids = tokens['input_ids'][0]
print(input_ids)

tensor([  101,  5292,  3567,  2017,  3427,  1996,  3185, 14262, 10497, 11514,
         3012,  1029,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
# Inspect the batched input_ids: its type, the number of rows (sentences), and its full shape
print(type(tokens['input_ids']))
print(len(tokens['input_ids']))
print(tokens['input_ids'].shape)

<class 'torch.Tensor'>
1
torch.Size([1, 128])


In [None]:
# Decode every real token id back to its text piece, skipping the padding.
# The padding id is taken from the tokenizer instead of hard-coding 0.
for token_id in input_ids:
    if token_id != tokenizer.pad_token_id:
        print(tokenizer.decode(token_id))

# Notice that every token is in lowercase, even though the sentence contains capital letters. Why do you think this is happening?

[CLS]
ha
##va
you
watched
the
movie
ser
##end
##ip
##ity
?
[SEP]


In [None]:
# Map every id (including the padding positions) back to its token string
tokenizer.convert_ids_to_tokens(input_ids)

['[CLS]',
 'ha',
 '##va',
 'you',
 'watched',
 'the',
 'movie',
 'ser',
 '##end',
 '##ip',
 '##ity',
 '?',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[

In [None]:
# token_type_ids are required only for specific tasks like Q&A, which we will learn later on.
# Index 0 selects the row of our single sentence.
token_type_ids = tokens['token_type_ids'][0]
print(token_type_ids)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


In [None]:
# attention_mask marks which positions hold real tokens (1) and which are padding (0).
# Index 0 selects the row of our single sentence.
attention_mask = tokens['attention_mask'][0]
print(attention_mask)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


# Word Embeddings using BERT

In [None]:
# Send the tokens through all the layers of the pre-trained BERT model to get the output embeddings.
# This is inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [None]:
# Two outputs are of interest:
#   - last_hidden_state: the last hidden layer's output for every token of the sentence
#   - pooler_output: a single embedding derived from the [CLS] token, which in a way
#     represents the whole sentence
# pooler_output is not the same as the first row of last_hidden_state: although both
# represent the [CLS] token, pooler_output has gone through some more processing and is
# more suitable for use in sentence classification tasks.

embeddings, cls_embeddings = outputs.last_hidden_state, outputs.pooler_output

print(embeddings.shape, cls_embeddings.shape, sep="\n")

torch.Size([1, 128, 768])
torch.Size([1, 768])


In [None]:
# Every token (sub-word) of the sentence is represented by a vector of size 768;
# check the first three tokens.
print(*(len(embeddings[0][position]) for position in range(3)), sep="\n")

768
768
768


In [None]:
# Embedding vector of the first token of the sentence
embeddings[0][0]

tensor([ 1.2320e-01, -2.2450e-01, -1.0112e-01, -1.9403e-01, -2.2315e-02,
        -2.6164e-01,  5.3591e-02,  8.1329e-01, -2.6165e-01,  7.8157e-02,
         3.2673e-01, -8.7569e-02, -2.9116e-01,  4.6189e-01,  2.9702e-01,
         9.5158e-02, -1.4104e-02,  3.4397e-01,  3.1298e-01, -1.3877e-01,
         3.7588e-01, -1.6413e-01,  1.7326e-01, -4.9042e-02, -4.1113e-01,
         9.8688e-02,  9.3225e-02, -7.2704e-02, -3.1762e-02, -1.8024e-01,
        -4.4405e-01, -4.9362e-03, -5.7547e-01, -4.3372e-01,  2.3130e-01,
         1.4767e-01,  3.6934e-01,  2.0623e-01, -2.1943e-01,  1.3432e-01,
        -5.7017e-01,  1.5943e-01, -1.7467e-01, -1.4336e-01, -1.0770e-01,
        -1.1867e-01, -2.6593e+00, -2.7817e-01,  6.2663e-02, -3.3873e-01,
         3.3905e-01, -2.3438e-01,  1.5881e-01,  3.7437e-01,  1.8736e-01,
         2.8833e-01, -3.1424e-01,  2.2400e-01,  2.7784e-01,  1.3361e-01,
         3.7000e-01,  1.5592e-01, -5.1301e-01,  1.2569e-01, -1.7687e-02,
        -5.4117e-02, -6.3555e-02,  4.8796e-01, -1.3

# Contextual Similarity of Words

In [None]:
# Sentences that use the word "watch" in different contexts
sent_list = [
    "I love to watch TV",
    "I am wearing a wrist watch",
    "My brother goes to the ground every Sunday to watch Football",
    "My wife gifted me a beautiful watch on my birthday",
    "My wife gifted me a beautiful watch",
]

In [None]:
# Collect the contextual embedding of the token "watch" from every sentence.
watch_embeddings = []

for sent in sent_list:
    # Calling the tokenizer directly replaces the deprecated encode_plus
    tokens = tokenizer(sent, max_length=128, truncation=True,
                       padding='max_length', return_tensors='pt')

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state

    input_ids = tokens['input_ids'][0]
    # Convert all ids to token strings once instead of decoding them one at a time
    for i, token in enumerate(tokenizer.convert_ids_to_tokens(input_ids)):
        if token == "watch":
            # Store the vector as a single row, the layout cosine_similarity expects
            watch_embeddings.append(embeddings[0][i].numpy().reshape(1, -1))

In [None]:
# "watch" in "I love to watch TV" vs "watch" in "I am wearing a wrist watch"
cosine_similarity(watch_embeddings[0],watch_embeddings[1])

array([[0.23662901]], dtype=float32)

In [None]:
# "watch" in "I love to watch TV" vs "watch" in "... every Sunday to watch Football"
cosine_similarity(watch_embeddings[0],watch_embeddings[2])

array([[0.71111065]], dtype=float32)

In [None]:
# "watch" in "I am wearing a wrist watch" vs "watch" in "... a beautiful watch on my birthday"
cosine_similarity(watch_embeddings[1],watch_embeddings[3])

array([[0.40271223]], dtype=float32)

In [None]:
# "watch" in "I am wearing a wrist watch" vs "watch" in "My wife gifted me a beautiful watch"
cosine_similarity(watch_embeddings[1],watch_embeddings[4])

array([[0.8410155]], dtype=float32)

# Sentence Similarity using BERT

In [None]:
def sent_embedding(sent):
    """Return two sentence-level BERT embeddings for ``sent``.

    The function relies on the notebook-level ``tokenizer`` and ``model``,
    so both names must refer to the BERT tokenizer/model when it is called.

    Args:
        sent: The sentence to embed.

    Returns:
        A list with two NumPy arrays:
        index 0 is the last-hidden-state vector of the first ([CLS]) token,
        reshaped to a single row; index 1 is the model's ``pooler_output``.
    """
    # Tokenize the sentence: each token is either a complete word or a sub-word.
    # Calling the tokenizer directly replaces the deprecated encode_plus.
    tokens = tokenizer(sent, max_length=128, truncation=True,
                       padding='max_length', return_tensors='pt')

    # Feed the tokens into the model; this is inference only, so no gradients are tracked.
    with torch.no_grad():
        outputs = model(**tokens)

    # last_hidden_state contains the output at the last hidden layer of all the sentence tokens
    # pooler_output contains the embedding corresponding to only the [CLS] token, which in a way represents the whole sentence.
    # This pooler_output is, however, different from the embeddings corresponding to the 1st token of last_hidden_state
    # Although both represent the CLS token, the pooler_output is after some more processing,
    # and may be more suitable for use in some sentence related tasks.

    # Embedding of the CLS token taken from the last hidden layer
    cls_vector = outputs.last_hidden_state[0][0].numpy().reshape(1, -1)

    # Embedding given by the pooler_output
    pooled_vector = outputs.pooler_output.numpy()

    return [cls_vector, pooled_vector]

In [None]:
# Two sentences whose similarity will be measured
sent1, sent2 = "I am a human being.", "What are you doing?"

In [None]:
# Sentence similarity using CLS token embedding
# (index 0 of the list returned by sent_embedding)
cosine_similarity(sent_embedding(sent1)[0],sent_embedding(sent2)[0])

array([[0.8671508]], dtype=float32)

In [None]:
# Sentence similarity using pooler_output
# (index 1 of the list returned by sent_embedding)
cosine_similarity(sent_embedding(sent1)[1],sent_embedding(sent2)[1])

array([[0.9800672]], dtype=float32)

# Sentence Similarity using SBERT

In [None]:
!pip install --upgrade --quiet sentence-transformers==2.2.2

# Import the Sentence Transformer library
from sentence_transformers import SentenceTransformer, util

# There are several different Sentence Transformer models available on Hugging Face
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Encode each sentence separately into a tensor embedding with the Sentence Transformer
sent_embedding1, sent_embedding2 = (
    model.encode(sentence, convert_to_tensor=True) for sentence in (sent1, sent2)
)

In [None]:
# Find the cosine similarity between the two sentence embeddings
util.pytorch_cos_sim(sent_embedding1, sent_embedding2)

tensor([[0.3407]], device='cuda:0')

**DIY:**

How will you use Sentence Similarity for Information Retrieval?

# Question-Answering using BERT

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint used for question answering.
# A larger alternative is 'bert-large-uncased-whole-word-masking-finetuned-squad'
# (it can also be loaded explicitly through BertTokenizer / BertForQuestionAnswering).
model_path = "kaporter/bert-base-uncased-finetuned-squad"

# Load the tokenizer first, then the model.
# NOTE: both notebook-level names are rebound here to the question-answering versions.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForQuestionAnswering.from_pretrained(model_path),
)

In [None]:
# Triple quotes are used to create docstrings in python, or strings spanning several lines

question = '''What is Machine Learning?'''

# The context in which the model searches for the answer.
# An extractive QA model can only return a span of this text, so it must not be empty.
paragraph = (
    "Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance "
    "on a specific task. Machine learning algorithms build a mathematical model of sample data, known as \"training data\", in order to make predictions or "
    "decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection "
    "of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning "
    "is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, "
    "theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory "
    "data analysis through unsupervised learning. In its application across business problems, machine learning is also referred to as predictive analytics."
)

# Encode the question and the paragraph together as one sentence pair.
# Calling the tokenizer directly replaces the deprecated encode_plus.
encoding = tokenizer(text=question, text_pair=paragraph)

inputs = encoding['input_ids']  # Token ids of the question + paragraph pair
segment_mask = encoding['token_type_ids']  # Segment ids telling the two texts of the pair apart
tokens = tokenizer.convert_ids_to_tokens(inputs)  # The token strings behind the ids

In [None]:
# Show the token strings of the encoded question + paragraph pair
print(tokens)

In [None]:
# Show the token_type_ids produced for the question + paragraph pair
print(segment_mask)

In [None]:
# Run the question-answering model on a batch containing the single encoded pair.
# This is inference only, so gradient tracking is switched off.
with torch.no_grad():
    model_output = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([segment_mask]))

In [None]:
# List the fields returned by the question-answering model
model_output.keys()

In [None]:
# Pull out the scores for the start and the end position of the answer
start_logits, end_logits = (model_output[field] for field in ('start_logits', 'end_logits'))

In [None]:
# The answer span runs from the highest-scoring start position
# to the highest-scoring end position.
start_index, end_index = (
    int(torch.argmax(logits)) for logits in (start_logits, end_logits)
)

print(start_index, end_index)

In [None]:
# Rebuild the answer text from the predicted span (end position inclusive).
# convert_tokens_to_string merges word pieces such as "ser", "##end" back into
# whole words, which a plain ' '.join would leave split apart.
answer_tokens = tokens[start_index:end_index + 1]
answer = tokenizer.convert_tokens_to_string(answer_tokens)

# The span is empty when the predicted end lies before the predicted start
print(answer if answer else "(no answer span found)")

# Token Classification using BERT

 - Named Entity Recognition (NER)
 - POS Tagging

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [None]:
# Checkpoint used for token classification; the commented-out lines are alternatives to try.
# model_path = "distilbert-base-uncased"
model_path = "stevhliu/my_awesome_wnut_model"
# model_path = "Babelscape/wikineural-multilingual-ner"

# NOTE: both notebook-level names are rebound here to the token-classification versions.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

In [None]:
# Build the NER pipeline. aggregation_strategy="simple" groups the tokens of one entity
# into a single result; it replaces the deprecated grouped_entities=True flag.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [None]:
# Run the NER pipeline on an example sentence
example = "My name is Wolfgang and I live in India."
ner_results = nlp(example)

In [None]:
# Show the raw pipeline output
print(ner_results)

In [None]:
# Inspect the type of the pipeline output
type(ner_results)

In [None]:
# Number of items in the pipeline output
len(ner_results)

In [None]:
# Print each recognised entity next to its predicted group
for entity in ner_results:
    word, group = entity['word'], entity['entity_group']
    print(word, " : ", group)