# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
#!pip install datasets evaluate transformers[sentencepiece]
#!pip install 'accelerate>=0.26.0'
#!pip install scipy


In [2]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModelForSequenceClassification, AutoTokenizer

import torch

# Load and use model

## Tokenizer
Tokenizer is used to encode and decode text.  
encode: token => token_id  
decode: token_id => token  


**Vocabulary size**: number of unique tokens in the vocabulary.  
**Special tokens**: special tokens that are used in the model.
- UNK token: token used to represent unknown tokens.
- SEP token: token used to separate input ids into different sequences.
- PAD token: token used to pad sequences.
- CLS token: token used to start sequences.

In [3]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

print(f"tokenizer.vocab_size: {tokenizer.vocab_size}")
# special tokens
print(f"tokenizer.unk_token: {tokenizer.unk_token} = {tokenizer.unk_token_id}")
print(f"zyxw: {tokenizer.convert_tokens_to_ids(['zyxw'])}")

print(f"tokenizer.sep_token: {tokenizer.sep_token} = {tokenizer.sep_token_id}")
print(f"tokenizer.pad_token: {tokenizer.pad_token} = {tokenizer.pad_token_id}")
print(f"tokenizer.cls_token: {tokenizer.cls_token} = {tokenizer.cls_token_id}")
print(f"tokenizer.mask_token: {tokenizer.mask_token}")


tokenizer.vocab_size: 30522
tokenizer.unk_token: [UNK] = 100
zyxw: [100]
tokenizer.sep_token: [SEP] = 102
tokenizer.pad_token: [PAD] = 0
tokenizer.cls_token: [CLS] = 101
tokenizer.mask_token: [MASK]


## Use for mask filling

In [4]:


checkpoint = "bert-base-uncased"

# Load model
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

def fill_mask(sentence, topk=5):
    """
    Print topk candidates for the masked token in the sentence.
    """
    if "[MASK]" not in sentence:
        raise ValueError("Input sentence must contain [MASK] token.")
    
    # Tokenize input and get tensor
    inputs = tokenizer(sentence, return_tensors="pt")
    # input_ids, token_type_ids, attention_mask
    print(f"inputs.input_ids.shape: {inputs.input_ids.shape}")

    # findout the index of tokens which is masked
    mask_token_index = torch.where(inputs.input_ids == tokenizer.mask_token_id)[1]
    print(f"mask_token_index: {mask_token_index}")
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    print(outputs.logits.shape)
    
    # Extract the logits for the masked token
    mask_logits = outputs.logits[0, mask_token_index, :]
    
    # Get top-k tokens
    topk_ids = torch.topk(mask_logits, topk, dim=1).indices[0].tolist()
    topk_tokens = [tokenizer.decode([token_id]) for token_id in topk_ids]

    print(f"sentence: {sentence}")
    # Print predictions
    for i, token in enumerate(topk_tokens, 1):
        print(f"   >>>{i}: {token}")

fill_mask("the capital of India is [MASK].")
fill_mask("the capital of [MASK] is New Delhi.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


inputs.input_ids.shape: torch.Size([1, 9])
mask_token_index: tensor([6])
torch.Size([1, 9, 30522])
sentence: the capital of India is [MASK].
   >>>1: mumbai
   >>>2: delhi
   >>>3: pune
   >>>4: hyderabad
   >>>5: bangalore
inputs.input_ids.shape: torch.Size([1, 10])
mask_token_index: tensor([4])
torch.Size([1, 10, 30522])
sentence: the capital of [MASK] is New Delhi.
   >>>1: india
   >>>2: delhi
   >>>3: district
   >>>4: haryana
   >>>5: state


# Finetuning the model

<img src="images/net_loss_optimizer.png" alt="drawing" width="512"/>  

Image credit: [Deep learning with Python](https://www.manning.com/books/deep-learning-with-python)

##  Finetune with toy dataset using torch

In [6]:
# load model and tokenizer
checkpoint = "bert-base-uncased"
class_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)%%!


# classify few sentences with the model
training_sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
    "The movie was horrible.",
]

test_sentences = [
    "The hotel was not that good.",
    "I hate this so much!",
    "The movie was great.",
]


def predict(sentences:str):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    print(inputs.input_ids.shape)
    with torch.no_grad():
        outputs = class_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    for sentence, pred in zip(sentences, predictions):
        print(f"{sentence}: {pred}")


print(f"=========before finetuning======")
predict(training_sentences)
predict(test_sentences)

# prepare toy classification dataset out of above sentences
inputs = tokenizer(training_sentences, padding=True, truncation=True, return_tensors="pt")
inputs["labels"] = torch.tensor([1, 1, 0])


# finetune the model

# setup optimizer
optimizer = torch.optim.Adam(class_model.parameters(), lr=5e-5)
# forward pass to calculate loss
loss = class_model(**inputs).loss
# backward pass to calculate gradients
loss.backward()
# update model weights
optimizer.step()


print(f"=========after finetuning======")
print(f"training data:")
predict(training_sentences)

print(f"test data:")
predict(test_sentences)

torch.Size([3, 16])
I've been waiting for a HuggingFace course my whole life.: tensor([0.3098, 0.6902])
This course is amazing!: tensor([0.3245, 0.6755])
The movie was horrible.: tensor([0.2866, 0.7134])
torch.Size([3, 9])
The hotel was not that good.: tensor([0.2710, 0.7290])
I hate this so much!: tensor([0.3006, 0.6994])
The movie was great.: tensor([0.2659, 0.7341])
training data:
torch.Size([3, 16])
I've been waiting for a HuggingFace course my whole life.: tensor([0.2600, 0.7400])
This course is amazing!: tensor([0.3101, 0.6899])
The movie was horrible.: tensor([0.4870, 0.5130])
test data:
torch.Size([3, 9])
The hotel was not that good.: tensor([0.3294, 0.6706])
I hate this so much!: tensor([0.3063, 0.6937])
The movie was great.: tensor([0.4391, 0.5609])


##  Finetune with dataset using huggingface trainer
**Task:** Given a pair of sentences, detect whether the sentence is a paraphrase of another sentence    
**Base model:** [bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)  
**Dataset:** [Glue-mrpc](https://huggingface.co/datasets/nyu-mll/glue/viewer/mrpc)

### Transfer learning

<img src="images/bert_transfer_learning.jpeg" alt="drawing" width="512"/>  

Image credit: [Natural Language Processing with Transformers](.)


In [7]:
checkpoint = "bert-base-uncased" 
# load the model
class_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prepare training data

In [8]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [33]:
# explore samples using indexes just like python dictionaries
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [32]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [11]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [12]:
# tokenizer can take pair of sentences and convert it into a format model requires
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [36]:

print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']


In [14]:
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [15]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [16]:
# Why use map
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [17]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [19]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

## Finetune model

In [20]:
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


from transformers import Trainer

# forward pass, backpropagation, weights update all in single command
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


trainer.train()


predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return F.linear(input, self.weight, self.bias)


Step,Training Loss
500,0.5533
1000,0.3516


(408, 2) (408,)


In [23]:
# !pip install scikit-learn
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)


import evaluate
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8529411764705882, 'f1': 0.8979591836734694}

In [26]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.394863,0.848039,0.890459
2,0.527200,0.444929,0.877451,0.910394
3,0.285400,0.536448,0.882353,0.916376


TrainOutput(global_step=1377, training_loss=0.3402927429154894, metrics={'train_runtime': 171.9463, 'train_samples_per_second': 63.997, 'train_steps_per_second': 8.008, 'total_flos': 405114969714960.0, 'train_loss': 0.3402927429154894, 'epoch': 3.0})

# References:
1. [BERT base model (uncased)](https://huggingface.co/google-bert/bert-base-uncased)
2. 