# Dataset preparation

1. Importing dataset

In [None]:
from datasets import load_dataset
# Load the "yelp_review_full" dataset; its "train" and "test" splits are used below.
dataset = load_dataset("yelp_review_full")

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Peek at the first three examples of the training split.
dataset["train"][:3]

2. Creating tokenised dataset

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-cased" checkpoint that is loaded as the model later on.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenise the "text" field of `examples` with the module-level tokenizer.

    Inputs are truncated and padded with ``padding="max_length"`` so that
    every tokenised example ends up with the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Apply tokenize_function across the dataset in batches (batched=True).
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Inspect the column schema of the tokenised training split.
tokenized_datasets['train'].features

In [None]:
# Print the first tokenised training example.
print(tokenized_datasets['train'][0])

Processing `tokenized_datasets`

In [None]:
# Raw text cannot be used for training, so drop the "text" column.
# The membership checks make this cell safe to re-run: without them a second
# execution fails because "text" has already been removed and "label" has
# already been renamed.
if "text" in tokenized_datasets["train"].features:
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

# Rename the label column to "labels" because the model expects the argument
# to be named `labels`.
if "label" in tokenized_datasets["train"].features:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Have the dataset return PyTorch tensors when examples are accessed.
tokenized_datasets.set_format("torch")

In [None]:
# Check the schema again: "text" should be gone and "label" should now be "labels".
tokenized_datasets['train'].features

Reduced dataset size for faster training

In [None]:
# Take a 1000-example sample from each split so training and evaluation run
# quickly; the seeded shuffle keeps the sample reproducible.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)

3. Defining DataLoader

In [None]:
from torch.utils.data import DataLoader

# Both loaders yield batches of 8 examples; only the training loader shuffles.
train_dataloader = DataLoader(dataset=small_train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=8)

In [None]:
print(len(train_dataloader)) # number of batches: 1000 examples / batch size 8 = 125

4. Loading model

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
                                            # num_labels refers to the five classes of output

5. Optimizer and learning rate scheduler

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters, starting from a learning rate of 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

Although optimizers such as Adam adapt the step size for each parameter, training usually still benefits from annealing the overall learning rate. A scheduler decreases the learning rate over time, which helps the model stabilise and settle into a better minimum of the loss landscape.

In [None]:
from transformers import get_scheduler

num_epochs = 10

# The scheduler is stepped once per batch, so the total number of steps is
# (batches per epoch) x (epochs).
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

6. GPU code

In [None]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

7. Training Loop

In [22]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps), unit='steps', desc='Training')

# Some layers behave differently during training and inference; this switches
# all of them into training mode.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # **batch unpacks the dictionary's key-value pairs into the matching
        # keyword arguments of model().
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()        # compute gradients
        optimizer.step()       # update the parameters
        lr_scheduler.step()    # advance the learning-rate schedule
        optimizer.zero_grad()  # reset gradients before the next batch

        progress_bar.update(1)

Training:   0%|          | 0/1250 [00:00<?, ?steps/s]

KeyboardInterrupt: 

8. Evaluation

In [None]:
import evaluate

bar = tqdm(range(len(eval_dataloader)))
metric = evaluate.load("accuracy")

# Put the model into evaluation mode before running inference.
model.eval()
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # The predicted class for each example is the index of the largest logit
    # along the last dimension.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)

    # Hand this batch's predictions and ground-truth labels to the metric
    # object; the score itself is computed after the loop.
    metric.add_batch(references=batch["labels"], predictions=predictions)

    bar.update(1)

metric.compute()