In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

In [None]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

In [None]:
batch

In [None]:
batch.keys()

In [None]:
batch["labels"]= torch.tensor([1,1])

In [None]:
batch

In [None]:
output = model(**batch)

In [None]:
loss = 0.2

In [None]:
optimizer = AdamW(model.parameters())

In [None]:
# Forward pass on the batch (which already carries "labels"); the training
# loss is read from the `.loss` field of the model output.
loss = model(**batch).loss

In [None]:
loss.backward()

In [None]:
# Actually call step(): a bare `optimizer.step` only displays the bound method
# and never applies the gradients computed by loss.backward().
optimizer.step()

In [None]:
# MRPC dataset! This is one of the 
# 10 datasets composing the GLUE benchmark, which is an academic 
# benchmark that is used to measure the performance of ML models across 
# 10 different text classification tasks.

In [None]:
from datasets import load_dataset

In [None]:
# This command downloads and caches the 
# dataset, by default in ~/.cache/huggingface/datasets.
# Recall from Chapter 2 that you can customize your cache 
# folder by setting the HF_HOME environment variable.



In [None]:
# BERT is pretrained with token type IDs, and on top 
# of the masked language modeling objective we talked about 
# in Chapter 1, it has an additional objective called next 
# sentence prediction. The goal with this task is 
# to model the relationship between pairs of sentences.

In [None]:
#With next sentence prediction, the model is provided pairs of 
# sentences (with randomly masked tokens) and asked to predict
#  whether the second sentence follows the first. To make the 
# task non-trivial, half of the time the sentences follow each 
# other in the original document they were extracted from, and the other half 
# of the time the two sentences come from two different documents

In [None]:
from datasets import load_dataset 

In [None]:
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
raw_datasets.keys()

In [None]:
raw_datasets['train']

In [None]:
raw_datasets['train'].features['sentence1']

In [None]:
raw_datasets['train'].features

In [None]:
raw_datasets['train'].features['sentence1']

In [None]:
raw_datasets

In [None]:
raw_datasets['train'][0]

In [None]:
from transformers import AutoTokenizer
checkpoint = "bert-base-uncased"
tk = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
raw_datasets['train'][0]['sentence1']

In [None]:
raw_token_test = tk(raw_datasets['train'][0]['sentence1'], raw_datasets['train'][0]['sentence2'])

In [None]:
raw_token_test

In [None]:
# The tokenizer takes the two sentences and encodes them as a single input.
# token_type_ids is used to recognize which sentence each token belongs to.

In [None]:
raw_datasets['train']['sentence1'][0:10]

In [None]:
raw_token_test = tk(raw_datasets['train']['sentence1'][0:10], raw_datasets['train']['sentence2'][0:10])

In [None]:
raw_token_test

In [None]:
raw_token_test['input_ids']

In [None]:
tk.convert_ids_to_tokens(raw_token_test["input_ids"][1])

In [None]:
tokenized_dataset = tk(raw_datasets['train']['sentence1'], raw_datasets['train']['sentence2'], padding=True,truncation=True)

In [None]:
tokenized_dataset.keys()

In [None]:
# This works: you can keep the whole tokenized dataset in memory, but it is not efficient.

In [None]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` fields of ``example`` as a pair.

    ``example`` only needs to support lookup by those two keys; it does not
    matter whether it comes from the train, validation or test split.

    Padding is deliberately not requested here: padding every sample to the
    maximum length in the entire dataset is not efficient. It is better to
    pad when a batch is being built, so that only the maximum length inside
    that batch determines the padded size.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tk(first_sentences, second_sentences, truncation=True)

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Here is how we apply the tokenization function on all our datasets at once.
# You can even use multiprocessing when
# applying your preprocessing function with map()
# by passing along a num_proc argument. We didn’t
# do this here because the 🤗 Tokenizers library already
# uses multiple threads to tokenize our samples faster, but
# if you are not using a fast tokenizer backed by this library,
# this could speed up your preprocessing.

In [None]:
tokenized_datasets

In [None]:
#The last thing we will need to do is pad all the examples to the length of the longest element when we batch elements 
# together — a technique we refer to as dynamic padding.

In [None]:
tokenized_datasets.keys()

In [None]:
tokenized_datasets['train'][0]['input_ids']

In [None]:
# so we have created tokens from all parts of dataset using 
# raw_data_set.map function with batch.
# no padding was used as we want to pad batch by batch at least.
#This will speed up training by quite a bit, but note that if you’re training on a TPU it can cause problems — 
# TPUs prefer fixed shapes, even when that requires extra padding.


In [None]:
# To apply padding to each batch separately,
# we should do dynamic padding.
# The function that is responsible for putting
# together samples inside a batch is called a collate function.

In [None]:
# To do this in practice, we have to define a collate function that 
# will apply the correct amount of padding to the 
# items of the dataset we want to batch together


In [None]:
# Transformers library provides us with such a function via DataCollatorWithPadding.

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
data_collator = DataCollatorWithPadding(tk)

In [None]:
data_collator = DataCollatorWithPadding(tk)


In [None]:
data_collator

In [None]:
# lets take some samples from our tokenized data set we batched together
samples = tokenized_datasets['train'][0:10]

In [None]:
samples.keys()

In [None]:
print(type(tokenized_datasets['train']))

In [None]:
tokenized_datasets['train']

In [None]:
# This is a data set object. 

In [None]:
print(type(tokenized_datasets['train'][:8]))

In [None]:
tokenized_datasets['train'][:8].keys()

In [None]:
samples = tokenized_datasets['train'][:8]

In [None]:
samples['sentence1']

In [None]:
samples['sentence2']

In [None]:
# The goal is to keep everything except sentence1, sentence2 and idx,
# as we will pass the token ids, together with the token type ids and attention mask for this batch,
# through the model as tensors.
# So we just need a sub-dictionary:
# a sub-dictionary that doesn't have sentence1, sentence2 and idx should work.


In [None]:
# Drop the raw text and index columns; every other column of the slice is kept.
samples = {
    column: values
    for column, values in samples.items()
    if column not in ("sentence1", "sentence2", "idx")
}

In [None]:
samples.keys()

In [None]:
samples["input_ids"]

In [None]:
[len(v) for v in samples["input_ids"]]

In [None]:
# we see length of input ids is different. we need to make it same by padding
# dynamic padding should help out

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tk)

In [None]:
batch = data_collator(samples)

In [None]:
batch.keys()

In [None]:
[len(x) for x in batch['input_ids']]

In [None]:
# With DataCollatorWithPadding, padding is applied to the specific batch.
# DataCollatorWithPadding must take the tokenizer as input, as it needs to know how
# the tokenizer pads.

In [None]:
batch["input_ids"].shape

In [None]:
# DataCollatorWithPadding also creates tensors out of the ids.

In [None]:
# lets check the size of each key in the batch

In [None]:
{k:v.shape for k, v in batch.items()}

In [None]:
# DataCollatorWithPadding converts input ids, token type ids, attention mask and label to tensor
# of the same length in a batch


In [None]:
[v.shape for k, v in batch.items()]

In [None]:
#  Transformers provides a Trainer class to help you fine-tune any 
# of the pretrained models it provides on your dataset. 
# Once you’ve done all the data preprocessing work in the 
# last section, you have just a few steps left to define the Trainer. 
# The hardest part is likely to be preparing the environment to run Trainer.train()

In [None]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer


In [None]:
checkpoint = "bert-base-uncased"
tk = AutoTokenizer.from_pretrained(checkpoint)
raw_datasets = load_dataset("glue", "mrpc")


In [None]:
def tokenize_function(example):
    """Run ``tk`` on the paired ``sentence1``/``sentence2`` fields with truncation enabled."""
    sentence_pair = (example['sentence1'], example['sentence2'])
    return tk(*sentence_pair, truncation=True)

In [None]:
raw_datasets_tokenized = raw_datasets.map(tokenize_function, batched=True)

In [None]:
raw_datasets_tokenized

In [None]:
data_collator = DataCollatorWithPadding(tk)

In [None]:
#The first step before we can define our Trainer is to define a TrainingArguments class 
# that will contain all the hyperparameters the Trainer will use for training and evaluation. 
# The only argument you have to provide is a directory where the trained model will be saved, 
# as well as the checkpoints along the way. For all the rest, you can leave the defaults,
#  which should work pretty well for a basic fine-tuning.

In [None]:
from transformers import TrainingArguments

In [None]:
training_args = TrainingArguments("model_download")

In [None]:
training_args

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
#you get a warning after instantiating this pretrained model. This is because BERT has not 
# been pretrained on classifying pairs of sentences, so the head of the pretrained model 
# has been discarded and a new head suitable for sequence classification has been added instead. 
# The warnings indicate that some weights were not used (the ones corresponding to the dropped
#  pretraining head) and that some others were randomly initialized (the ones for the new head). 
# It concludes by encouraging you to train the model, which is exactly what we are going to do now.



In [None]:
from transformers import Trainer

In [None]:
# Bundle the model, the training arguments, the tokenized train/validation
# splits, the padding collator and the tokenizer into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=raw_datasets_tokenized["train"],
    eval_dataset=raw_datasets_tokenized["validation"],
    data_collator=data_collator,
    processing_class=tk,
)

In [None]:
trainer.train()

In [None]:
# 

#This will start the fine-tuning (which should take a couple of minutes on a GPU) and 
# report the training loss every 500 steps. It won’t, however, tell you how well (or badly) 
# your model is performing. This is because:

#We didn’t tell the Trainer to evaluate during training by setting eval_strategy 
# in TrainingArguments to either "steps" (evaluate every eval_steps) or "epoch" 
# (evaluate at the end of each epoch).
#We didn’t provide the Trainer with a compute_metrics() function to calculate a 
# metric during said evaluation (otherwise the evaluation would just have printed the loss, 
# which is not a very intuitive number).

In [None]:
predictions = trainer.predict(raw_datasets_tokenized["validation"])

In [None]:
predictions.predictions.shape

In [None]:
# For every example in the eval data
# we have predictions, which are logits.
# The hidden dimensions have been projected into
# a 2-dimensional vector,
# so the logits here are the un-normalized scores of the two classes.


In [None]:
predictions.predictions

In [None]:
import numpy as np

In [None]:
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
preds.shape

In [None]:
predictions.predictions.shape

In [None]:
# 408 is the sample (batch) dimension
# 2 is the class dimension
# axis=-1 means apply along the last axis
# the last axis here is the class axis, which has size 2
# shape: (batch, number of classes in the output)

In [None]:
preds

In [None]:
# argmax turns each row of logits into a predicted class index (0 or 1).
# No softmax is applied at inference here; argmax over the raw logits picks the same class softmax would.


In [None]:
import evaluate

In [None]:
metric = evaluate.load("glue", "mrpc")

In [None]:
metric.compute(predictions=preds, references= predictions.label_ids)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
import torch
from torch.optim import AdamW

In [None]:
from datasets import load_dataset

In [None]:
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
raw_datasets

In [None]:
# i have to do the tokenization first
checkpoint = "bert-base-uncased"
tk= AutoTokenizer.from_pretrained(checkpoint)

In [None]:
model= AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels =2)

In [None]:
# you can see that last two layers of the model will be reinitialized or added as new 
# to the top of the bert-base-uncased model.

In [None]:
# lets prepare the dataloader first
# for that we need to tokenize the dataset we have

In [None]:
def tokenize_function(examples):
    """Tokenize ``examples["sentence1"]`` together with ``examples["sentence2"]``.

    Any extra preprocessing could be applied here before calling ``tk``.
    Only truncation is enabled; no padding is requested at this stage.
    """
    pair_fields = ("sentence1", "sentence2")
    first, second = (examples[field] for field in pair_fields)
    return tk(first, second, truncation=True)



raw_datasets_tokenized = raw_datasets.map(tokenize_function, batched=True)


In [None]:
raw_datasets_tokenized

In [None]:
sub_sample_train = raw_datasets_tokenized["train"][0:8]

In [None]:
# Copy the 8-row slice without its raw text / index columns.
test_batch = {
    name: column
    for name, column in sub_sample_train.items()
    if name not in {"sentence2", "sentence1", "idx"}
}

In [None]:
{k:len(v) for k,v in sub_sample_train.items()}

In [None]:
raw_datasets_tokenized

In [None]:
# we have tokenized dataset with no padding yet.
# lets do the padding now based on batch, prepare data collator
data_collator = DataCollatorWithPadding(tk)

In [None]:
data_collator

In [None]:
raw_datasets_tokenized_padded = data_collator(test_batch)

In [None]:
raw_datasets_tokenized_padded

In [None]:
#data_collator add padding and converts the input ids to tensor as well

In [None]:
{k:v.shape for k, v in raw_datasets_tokenized_padded.items()}

In [None]:
# We see everything is being padded to 67 and also returned as torch tensors.
# This is what goes into the model.

In [None]:
raw_datasets_tokenized

In [None]:
# we will work with raw_datasets_tokenized

# lets pass the  raw_datasets_tokenized_padded batch through the model to see what it predicts 
# just for fun
#  

output = model(**raw_datasets_tokenized_padded)

In [None]:
output.loss

In [None]:
output.logits

In [None]:
output.logits.shape

In [None]:
import numpy as np

In [None]:
prediction_labels = np.argmax(output.logits.detach().numpy(), axis =-1)

In [None]:
prediction_labels

In [None]:
# argmax gives the index, not the value.
# Class index 0 is selected for every example because its logit is the larger one.

In [None]:
# Import the DataLoader class itself; the lowercase `dataloader` name is only
# the submodule and cannot be used to build loaders.
from torch.utils.data import DataLoader

In [None]:
from tqdm.auto import tqdm

In [None]:
epochs = 3

In [None]:
raw_datasets_tokenized

In [None]:
# We have got the tokenized data.
# Let's prepare the dataset for training now.



In [None]:
# removing text as the model only needs numbers
# the datasets raw has this attribute to remove it from all
raw_datasets_tokenized = raw_datasets_tokenized.remove_columns(["sentence1", "sentence2", "idx"])

In [None]:
# change the label to labels
raw_datasets_tokenized = raw_datasets_tokenized.rename_column("label", "labels")

In [None]:
raw_datasets_tokenized

In [None]:
raw_datasets_tokenized["train"]

In [None]:
raw_datasets_tokenized["train"][0]

In [None]:
# You can see that the values are not tensors yet, so we need to convert them
# to tensors as well.

In [None]:
raw_datasets_tokenized.set_format("torch")

In [None]:
raw_datasets_tokenized

In [None]:
raw_datasets_tokenized["train"][0]

In [None]:
# everything is in pytorch tensors now

In [None]:
# now we have prepared raw_datasets_tokenized, we can do the dataloader thing

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Build one DataLoader per split (train and validation); data_collator is the
# collate function and each batch holds 8 samples.
# Only the training split is shuffled: evaluation gains nothing from a random
# order, and a fixed order keeps evaluation batches reproducible between runs.
train_dataloader = DataLoader(
    raw_datasets_tokenized["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    raw_datasets_tokenized["validation"],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
for batch in train_dataloader:
    break


In [None]:
{k:v.shape for k, v in batch.items()}

In [None]:
print(len(train_dataloader))

In [None]:
# I have 3668 samples in the training data and a batch size of 8.
# 3668 / 8 = 458.5, but the dataloader keeps the final partial batch, so the
# number of batches (the length of the dataloader) is that value rounded up.
# Integer ceiling division gives a count directly comparable with len(train_dataloader).
num_train_rows = raw_datasets_tokenized["train"].num_rows
print((num_train_rows + 8 - 1) // 8)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)

In [None]:
outputs = model (**batch)

In [None]:
outputs.loss

In [None]:
outputs.logits

In [None]:
outputs.logits.shape

In [None]:
from torch.optim import AdamW

In [None]:
optimizer = AdamW(model.parameters(), lr= 5e-5)

In [None]:
# We have got the optimizer; now we want to
# have a learning rate schedule.

from transformers import get_scheduler
num_epochs = 3

# Finally, the learning rate scheduler used by default is just a linear decay 
# from the maximum value (5e-5) to 0. To properly define it, we need to know the 
# number of training steps we will take, which is the number of epochs we want to 
# run multiplied by the number of training batches (which is the length of our training dataloader). 
# The Trainer uses three epochs by default, so we will follow that:

In [None]:
# Let's find out the number of training steps.
# The number of batches in the training data is 459, which is simply the length of the training dataloader.
# 1 epoch is going through all the 459 batches.
# 3 epochs is 459*3 = 1377.

# Per epoch, the number of training steps is the number of batches in the training dataloader.
# Weights are updated after each batch.
# A training step is counted each time the weights are updated,
# so we will update the weights 459*3 = 1377 times, as num_epochs is 3.


In [None]:
num_training_steps = len(train_dataloader)*num_epochs

In [None]:
num_training_steps

In [None]:
# "linear" schedule with no warmup steps, spanning num_training_steps updates.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
lr_scheduler

In [None]:
# Now we have got the learning rate scheduler.
# 
import torch

In [None]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
# The MPS check is only evaluated when CUDA is unavailable.
if torch.cuda.is_available():
    backend, message = "cuda", "ussing cuda"
elif torch.backends.mps.is_available():
    backend, message = "mps", "using mps"
else:
    backend, message = "cpu", "usinh cpu"

device = torch.device(backend)
print(message)


In [None]:
torch.cuda.is_available()

In [None]:
torch.backends.mps.is_available()

In [None]:
# now we have device.
# now map the model to device.
model.to(device)

In [None]:
device

In [None]:
from tqdm.auto import tqdm

In [None]:
model.to(device)

In [None]:
device

In [None]:
progress_bar = tqdm(range(num_training_steps))

In [None]:
print(type(train_dataloader))

In [None]:
train_dataloader

In [None]:
# Fine-tuning loop: one optimizer update and one scheduler step per batch.
model.train()
# The loop variable is `epoch` (not `epochs`) so the `epochs` constant defined
# earlier in the notebook is not overwritten by the loop.
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch to the selected device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Reset gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        progress_bar.update(1)



    



In [None]:
# Bare attribute reference (no call): as the last expression of the cell it
# only displays the bound method; nothing is added to the metric here.
metric.add_batch

In [None]:
import evaluate
# Load the metric, then accumulate predictions batch by batch over the
# evaluation dataloader.
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
    metric.add_batch(references=batch["labels"], predictions=predictions)




In [None]:
metric.compute()