In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

In [None]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

In [None]:
batch

In [None]:
batch.keys()

In [None]:
# Attach a label of 1 to each of the two example sequences.
batch["labels"] = torch.tensor([1, 1])

In [None]:
batch

In [None]:
output = model(**batch)

In [None]:
# A hard-coded float is not connected to the model's computation graph and has
# no .backward(); take the loss tensor from the forward pass (`output`) instead.
loss = output.loss

In [None]:
optimizer = AdamW(model.parameters())

In [None]:
# Run the forward pass and keep only the loss tensor from the model output
# (the original line ended in a stray `*`, which is a SyntaxError).
loss = model(**batch).loss

In [None]:
loss.backward()

In [None]:
# `optimizer.step` on its own only references the bound method; it must be
# called to apply the gradients computed by loss.backward().
optimizer.step()
# Clear the accumulated gradients so a later backward() starts from zero.
optimizer.zero_grad()

In [None]:
# MRPC dataset! This is one of the 
# 10 datasets composing the GLUE benchmark, which is an academic 
# benchmark that is used to measure the performance of ML models across 
# 10 different text classification tasks.

In [None]:
from datasets import load_dataset

In [None]:
# load_dataset() downloads and caches the
# dataset, by default in ~/.cache/huggingface/datasets.
# Recall from Chapter 2 that you can customize your cache
# folder by setting the HF_HOME environment variable.



In [None]:
# BERT is pretrained with token type IDs, and on top 
# of the masked language modeling objective we talked about 
# in Chapter 1, it has an additional objective called next 
# sentence prediction. The goal with this task is 
# to model the relationship between pairs of sentences.

In [None]:
# With next sentence prediction, the model is provided pairs of
# sentences (with randomly masked tokens) and asked to predict
# whether the second sentence follows the first. To make the
# task non-trivial, half of the time the sentences follow each
# other in the original document they were extracted from, and the other half
# of the time the two sentences come from two different documents.

In [None]:
from datasets import load_dataset 

In [None]:
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
raw_datasets.keys()

In [None]:
raw_datasets['train']

In [None]:
raw_datasets['train'].features['sentence1']

In [None]:
raw_datasets['train'].features

In [None]:
raw_datasets['train'].features['sentence1']

In [None]:
raw_datasets

In [None]:
raw_datasets['train'][0]

In [None]:
from transformers import AutoTokenizer
checkpoint = "bert-base-uncased"
tk = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
raw_datasets['train'][0]['sentence1']

In [None]:
raw_token_test = tk(raw_datasets['train'][0]['sentence1'], raw_datasets['train'][0]['sentence2'])

In [None]:
raw_token_test

In [None]:
# The tokenizer takes the two sentences and encodes them as one sequence.
# token_type_ids are used to recognize which sentence each token belongs to.

In [None]:
raw_datasets['train']['sentence1'][0:10]

In [None]:
raw_token_test = tk(raw_datasets['train']['sentence1'][0:10], raw_datasets['train']['sentence2'][0:10])

In [None]:
raw_token_test

In [None]:
raw_token_test['input_ids']

In [None]:
tk.convert_ids_to_tokens(raw_token_test["input_ids"][1])

In [None]:
# Tokenize the whole training split at once, padded to the longest pair.
# The columns are materialized with list(): recent `datasets` releases return a
# lazy column object from dataset["col"], which the tokenizer may not accept as
# a batch of strings. On older releases list() is a harmless copy.
tokenized_dataset = tk(
    list(raw_datasets['train']['sentence1']),
    list(raw_datasets['train']['sentence2']),
    padding=True,
    truncation=True,
)

In [None]:
tokenized_dataset.keys()

In [None]:
# This works: you can hold the whole tokenized dataset in memory, but this is not efficient.

In [None]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) held in ``example``.

    The function only reads the two sentence fields, so it can be applied to
    any split (train, validation, test) alike.

    Padding is intentionally left out here. Padding every sample to the
    longest sequence in the entire dataset is wasteful; it is better to pad
    when a batch is assembled, so each batch is only padded to its own
    longest sequence.

    Args:
        example: Mapping with ``sentence1`` and ``sentence2`` entries.

    Returns:
        Whatever the module-level tokenizer ``tk`` returns for the pair,
        with truncation enabled.
    """
    first_sentence = example['sentence1']
    second_sentence = example['sentence2']
    return tk(first_sentence, second_sentence, truncation=True)

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Here is how we apply the tokenization function to all our datasets at once.
# You can even use multiprocessing when
# applying your preprocessing function with map()
# by passing along a num_proc argument. We didn't
# do this here because the 🤗 Tokenizers library already
# uses multiple threads to tokenize our samples faster, but
# if you are not using a fast tokenizer backed by this library,
# this could speed up your preprocessing.

In [None]:
tokenized_datasets

In [None]:
#The last thing we will need to do is pad all the examples to the length of the longest element when we batch elements 
# together — a technique we refer to as dynamic padding.

In [None]:
tokenized_datasets.keys()

In [None]:
tokenized_datasets['train'][0]['input_ids']

In [None]:
# So we have tokenized every split of the dataset using
# raw_datasets.map with batched=True.
# No padding was applied, as we want to pad batch by batch instead.
# This will speed up training by quite a bit, but note that if you're training on a TPU it can cause problems —
# TPUs prefer fixed shapes, even when that requires extra padding.


In [None]:
# To apply padding to each batch separately,
# we should use dynamic padding.
# The function that is responsible for putting
# together samples inside a batch is called a collate function.

In [None]:
# To do this in practice, we have to define a collate function that 
# will apply the correct amount of padding to the 
# items of the dataset we want to batch together


In [None]:
# Transformers library provides us with such a function via DataCollatorWithPadding.

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
data_collator = DataCollatorWithPadding(tk)

In [None]:
data_collator = DataCollatorWithPadding(tk)


In [None]:
data_collator

In [None]:
# lets take some samples from our tokenized data set we batched together
samples = tokenized_datasets['train'][0:10]

In [None]:
samples.keys()

In [None]:
print(type(tokenized_datasets['train']))

In [None]:
tokenized_datasets['train']

In [None]:
# This is a data set object. 

In [None]:
print(type(tokenized_datasets['train'][:8]))

In [None]:
tokenized_datasets['train'][:8].keys()

In [None]:
samples = tokenized_datasets['train'][:8]

In [None]:
samples['sentence1']

In [None]:
samples['sentence2']

In [None]:
# The goal is to keep everything except sentence1, sentence2 and idx,
# as we will pass the token IDs, token type IDs and attention mask for this batch
# through as tensors.
# So we just need a sub-dictionary:
# one that doesn't contain sentence1, sentence2 and idx should work.


In [None]:
# Drop the raw-text and index columns; only the remaining fields are kept.
excluded_columns = ["sentence1", "sentence2", "idx"]
samples = {
    column: values
    for column, values in samples.items()
    if column not in excluded_columns
}

In [None]:
samples.keys()

In [None]:
samples["input_ids"]

In [None]:
[len(v) for v in samples["input_ids"]]

In [None]:
# we see length of input ids is different. we need to make it same by padding
# dynamic padding should help out

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tk)

In [None]:
batch = data_collator(samples)

In [None]:
batch.keys()

In [None]:
[len(x) for x in batch['input_ids']]

In [None]:
# With DataCollatorWithPadding, padding is applied to this specific batch.
# DataCollatorWithPadding must take the tokenizer as input, as it needs the
# tokenizer to know how to pad.

In [None]:
batch["input_ids"].shape

In [None]:
# DataCollatorWithPadding also creates tensors out of the ids.

In [None]:
# lets check the size of each key in the batch

In [None]:
{k:v.shape for k, v in batch.items()}

In [None]:
# DataCollatorWithPadding converts input ids, token type ids, attention mask and label to tensor
# of the same length in a batch


In [None]:
# List the shape of every entry in the collated batch (keys are not needed here).
[tensor.shape for tensor in batch.values()]