!<img src="./images/MLM.png" alt="MLM" width=1024>

In [19]:
from transformers import BertForMaskedLM, pipeline

In [20]:
bert_lm = BertForMaskedLM.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
bert_lm     # The last layer is a decoder 768 -> vocab_size

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [22]:
from transformers import BertTokenizer

cased_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
nlp = pipeline('fill-mask', model=bert_lm, tokenizer=cased_tokenizer)
# or just nlp = pipeline('fill-mask', model='bert-base-cased')

In [23]:
type(nlp.model)

transformers.models.bert.modeling_bert.BertForMaskedLM

In [24]:
preds = nlp(f"if you don't {nlp.tokenizer.mask_token} at the sign, you will get a ticket.")

print("if you don't *** at the sign you will get a ticket.")

for p in preds:
    print(f"Token:{p['token_str']}. Score: {100*p['score']:,.2f}%")

p.keys()

if you don't *** at the sign you will get a ticket.
Token:stop. Score: 49.02%
Token:look. Score: 40.27%
Token:glance. Score: 1.28%
Token:arrive. Score: 1.10%
Token:turn. Score: 0.83%


dict_keys(['score', 'token', 'token_str', 'sequence'])

!<img src="./images/NSP.png" alt="SNP" width=1024>

In [25]:
from transformers import BertForNextSentencePrediction, BertTokenizer
import torch

In [27]:
uncased_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

bert_nsp = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
bert_nsp

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Labels meaning:
- 0 == "isNextSentence"
- 1 == "notNextSentence"

In [38]:
# Define the sentence pairs
sentence_pairs = [
    ("The weather was perfect for a hike in the mountains.", "She packed her backpack and set out early in the morning."),
    ("The weather was perfect for a hike in the mountains.", "The city was bustling with activity as the festival began.")
]

# Function to encode sentence pairs and make predictions
def predict_nsp(model, tokenizer, sentence_pair):
    encoding = tokenizer.encode_plus(*sentence_pair, return_tensors='pt')
    input_ids = encoding['input_ids']
    token_type_ids = encoding['token_type_ids']
    attention_mask = encoding['attention_mask']

    # Get the prediction logits
    outputs = model(**encoding, labels=torch.LongTensor([1]))
    # outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    logits = outputs.logits
    softmax = torch.nn.Softmax(dim=1)
    probabilities = softmax(logits)
    predicted_label = torch.argmax(probabilities).item()
    
    return predicted_label, probabilities, logits, outputs

# Predict and print results for each sentence pair
for idx, pair in enumerate(sentence_pairs):
    label, probs, logits, outputs = predict_nsp(bert_nsp, uncased_tokenizer, pair)
    print(f"Sentence Pair {idx + 1}:")
    print(f"Sentence A: {pair[0]}")
    print(f"Sentence B: {pair[1]}")
    print(f"Prediction: {'Next Sentence' if label == 0 else 'Not the Next Sentence'}")
    print(f"Logits: {logits.tolist()}")
    print(f"Probabilities: {probs.tolist()}")
    print(f"Outputs: {outputs}\n")


Sentence Pair 1:
Sentence A: The weather was perfect for a hike in the mountains.
Sentence B: She packed her backpack and set out early in the morning.
Prediction: Next Sentence
Logits: [[5.675405502319336, -5.149261474609375]]
Probabilities: [[0.999980092048645, 1.99020687432494e-05]]
Outputs: NextSentencePredictorOutput(loss=tensor(10.8247, grad_fn=<NllLossBackward0>), logits=tensor([[ 5.6754, -5.1493]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Sentence Pair 2:
Sentence A: The weather was perfect for a hike in the mountains.
Sentence B: The city was bustling with activity as the festival began.
Prediction: Next Sentence
Logits: [[4.564054012298584, -3.68455171585083]]
Probabilities: [[0.9997383952140808, 0.00026155461091548204]]
Outputs: NextSentencePredictorOutput(loss=tensor(8.2489, grad_fn=<NllLossBackward0>), logits=tensor([[ 4.5641, -3.6846]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)



# Common Fine Tuned Berts

In [45]:
from transformers import pipeline, BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering

!<img src="./images/SeqClass.png" alt="Sequence Classification" width=1024>
Same as NSP just with single sequence

In [48]:
bert_sq = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
bert_sq, bert_sq.classifier

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

(BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerN

!<img src="./images/TknClass.png" alt="Token Classification" width=1024>


In [52]:
bert_tc = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)
bert_tc, bert_tc.classifier

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

(BertForTokenClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm

!<img src="./images/QA.png" alt="QA" width=1024>

In [55]:
bert_qa = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
bert_qa, bert_qa.qa_outputs

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

(BertForQuestionAnswering(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm((