In [None]:
from transformers import pipeline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import torch
import tqdm
import tqdm.auto

In [None]:
# Prefer Apple's MPS backend (the original target of this notebook), then CUDA,
# and finally fall back to the CPU so the notebook still runs on other machines.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def batch_encoding_to_device(batch, target_device=None):
    """Return a plain dict with every value of ``batch`` moved to a device.

    Parameters
    ----------
    batch : mapping
        Any object exposing ``.items()`` whose values provide ``.to(...)``
        (for example a dict of tensors).
    target_device : str or torch.device, optional
        Destination device. When omitted, the notebook-level ``device`` is used.

    Returns
    -------
    dict
        Same keys as ``batch``; each value is the result of ``value.to(target_device)``.
    """
    if target_device is None:
        # Notebook-wide default, looked up at call time.
        target_device = device
    return {key: value.to(target_device) for key, value in batch.items()}

## 3. Fine-Tuning

### 3.1 Processing data

In [None]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.to(device);

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

In [None]:
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt').to(device)

In [None]:
# Attach the target classes (both sequences labelled 1), created directly on the target device.
batch['labels'] = torch.tensor([1, 1], device=device)

In [None]:
# PyTorch's AdamW is used here; the AdamW imported from transformers is kept only as a
# commented-out alternative.
#optimizer = AdamW(model.parameters())
optimizer = torch.optim.AdamW(model.parameters())

In [None]:
loss = model(**batch).loss

In [None]:
loss.backward()

In [None]:
optimizer.step()

In [None]:
from datasets import load_dataset

In [None]:
raw_datasets = load_dataset('glue', 'mrpc')

In [None]:
raw_datasets

In [None]:
raw_train_dataset = raw_datasets['train']

In [None]:
raw_train_dataset[0]

In [None]:
raw_train_dataset.features

In [None]:
tokenized_sentences_1 = tokenizer(raw_train_dataset['sentence1'])
tokenized_sentences_2 = tokenizer(raw_train_dataset['sentence2'])

In [None]:
tokenized_sentences_1.keys()

In [None]:
# One [len(sentence1 ids), len(sentence2 ids)] pair per training example.
token_lengths = [
    [len(first_ids), len(second_ids)]
    for first_ids, second_ids in zip(
        tokenized_sentences_1['input_ids'], tokenized_sentences_2['input_ids']
    )
]

In [None]:
token_lengths = np.array(token_lengths)

In [None]:
# Overlay the token-length distributions of the two sentence columns.
# Labels and a legend are required to tell the two semi-transparent histograms apart.
fig, ax = plt.subplots()
ax.hist(token_lengths[:, 0], alpha=0.5, label='sentence1')
ax.hist(token_lengths[:, 1], alpha=0.5, label='sentence2')
ax.set(
    xlabel='tokens per sentence',
    ylabel='number of examples',
    title='MRPC train: tokenized sentence lengths',
)
ax.legend();  # trailing ';' hides the Legend repr in the cell output

In [None]:
inputs = tokenizer('This is the first sentence.', 'This is the second one.')
inputs1 = tokenizer('This is the first sentence.')
inputs2 = tokenizer('This is the second one.')

In [None]:
# token_type_ids --> which tokens belong to which sentences
inputs

In [None]:
inputs1

In [None]:
inputs2

In [None]:
decoded = tokenizer.convert_ids_to_tokens(inputs['input_ids'])

In [None]:
print(decoded)

In [None]:
# Tokenize every training sentence pair in one call, with padding and truncation
# enabled, and get the result back as PyTorch tensors.
tokenized_dataset = tokenizer(
    raw_train_dataset['sentence1'],
    raw_train_dataset['sentence2'],
    padding=True,
    truncation=True,
    return_tensors='pt'
)

In [None]:
# Padding is deliberately left out here: this function is applied to batches, and
# batches will have different lengths.
def tokenize_function(example):
    """Tokenize the 'sentence1'/'sentence2' entries of ``example`` as a pair, with truncation."""
    first, second = example['sentence1'], example['sentence2']
    return tokenizer(first, second, truncation=True)

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
tokenized_datasets

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
data_collator

In [None]:
samples = tokenized_datasets['train'][:8]

In [None]:
# Keep everything except the index and raw-text columns.
samples = {
    name: values
    for name, values in samples.items()
    if name not in ('idx', 'sentence1', 'sentence2')
}

In [None]:
[len(x) for x in samples['input_ids']]

In [None]:
batch = data_collator(samples)

In [None]:
{k: v.shape for k, v in batch.items()}

### 3.2 Fine-tuning a model with the Trainer API

In [None]:
from datasets import load_dataset

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
raw_datasets = load_dataset('glue', 'mrpc')

In [None]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_function(example):
    """Tokenize the sentence pair(s) in ``example`` with truncation and no padding.

    Reads the 'sentence1' and 'sentence2' entries and returns the tokenizer output.
    """
    sentence_a = example['sentence1']
    sentence_b = example['sentence2']
    return tokenizer(sentence_a, sentence_b, truncation=True)

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments

In [None]:
# Only the output directory is set; every other training option keeps its default.
training_args = TrainingArguments(
    'test-trainer',
    # Optional overrides, currently disabled:
    #eval_strategy='epoch', # evaluate at the end of every epoch
    #num_train_epochs=3
)

In [None]:
training_args.per_device_train_batch_size, training_args.per_device_eval_batch_size

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.to(device);

In [None]:
from transformers import Trainer

In [None]:
# Trainer takes care of moving the data to the device.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    # data_collator is left out on purpose: a padding data collator is used by default.
    #data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets['validation'])

In [None]:
predictions.predictions.shape, predictions.label_ids.shape

In [None]:
predictions.metrics

In [None]:
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
preds.shape

In [None]:
import evaluate

In [None]:
metric = evaluate.load('glue', 'mrpc')

In [None]:
metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
def compute_metrics(eval_preds):
    """Score evaluation output with the GLUE/MRPC metric.

    ``eval_preds`` is unpacked into ``(logits, labels)``. The predicted class is the
    arg-max over the last axis of the logits. Returns whatever the metric's
    ``compute`` call returns for those predictions and labels.
    """
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    glue_mrpc = evaluate.load('glue', 'mrpc')
    return glue_mrpc.compute(predictions=predicted_classes, references=labels)

In [None]:
# Evaluate at the end of every epoch. `eval_strategy` is the current name of this
# option (the older `evaluation_strategy` spelling is deprecated) and matches the
# spelling used in the earlier TrainingArguments cell.
training_args = TrainingArguments('test-trainer', eval_strategy='epoch')
# Start again from the pretrained checkpoint so this run does not continue the previous one.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Trainer automatically moves the model to the GPU if one is available.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # adds the GLUE/MRPC scores to each evaluation
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets['validation'])

In [None]:
predictions.metrics

### 3.3 A full training

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:
raw_datasets = load_dataset('glue', 'mrpc')
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize_function(example):
    """Run the tokenizer on the ('sentence1', 'sentence2') pair(s) of ``example``.

    Truncation is enabled; no padding is requested here.
    """
    pair = (example['sentence1'], example['sentence2'])
    return tokenizer(*pair, truncation=True)

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Prepare the dataset for a hand-written PyTorch loop.
# NOTE: this cell rebinds `tokenized_datasets`, so it cannot be re-run on its own:
# the columns are already gone on a second run. Re-run the tokenization cell first.
# Drop the raw-text and index columns.
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
# The model call elsewhere in this notebook passes the targets under the key 'labels'.
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
# Return PyTorch tensors when examples are accessed.
tokenized_datasets.set_format('torch')

In [None]:
tokenized_datasets['train'].column_names

In [None]:
import torch
from torch.utils.data import DataLoader

In [None]:
train_dataloader = DataLoader(
    tokenized_datasets['train'], shuffle=True, batch_size=8, collate_fn=data_collator
)

In [None]:
# Evaluation data is not shuffled: the metric does not depend on example order,
# and a fixed order keeps evaluation runs comparable.
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], batch_size=8, collate_fn=data_collator
)

In [None]:
# Grab the first batch from the training dataloader and stop, leaving it bound to `batch`.
for batch in train_dataloader:
    break
# Show the shape of every entry in that batch.
{k: v.shape for k, v in batch.items()}

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
model.to(device);

In [None]:
batch_device = batch_encoding_to_device(batch)

In [None]:
outputs = model(**batch_device)

In [None]:
outputs.loss, outputs.logits.shape

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

In [None]:
num_epochs = 3
# The training loop steps the scheduler once per batch, so the schedule has to span
# every batch of every epoch.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
num_training_steps

In [None]:
progress_bar = tqdm.auto.tqdm(range(num_training_steps))

In [None]:
# Manual training loop: forward pass, backward pass, optimizer step, scheduler step.
lrs = []  # learning rate(s) in effect for each optimization step
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch_device = batch_encoding_to_device(batch)
        outputs = model(**batch_device)
        loss = outputs.loss
        loss.backward()

        # get_last_lr() is the supported accessor outside of scheduler.step();
        # calling get_lr() directly here makes PyTorch emit a warning.
        lrs.append(lr_scheduler.get_last_lr())
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()  # clear gradients so they do not accumulate across batches
        progress_bar.update(1)

In [None]:
import evaluate

In [None]:
metric = evaluate.load('glue', 'mrpc')

In [None]:
# Evaluation pass: accumulate predictions batch by batch, then compute the metric.
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in tqdm.tqdm(eval_dataloader):
        batch_device = batch_encoding_to_device(batch)
        outputs = model(**batch_device)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()

### Accelerate (distributed GPU training; can only use the one MPS GPU)

In [None]:
import torch

In [None]:
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler

In [None]:
accelerator = Accelerator()

In [None]:
accelerator

In [None]:
accelerator.device

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [None]:
# NOTE(review): probably redundant. The model is handed to accelerator.prepare() in the
# next cell, which is expected to handle device placement itself. Confirm before removing.
model.to(device);

In [None]:
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

In [None]:
num_epochs = 3
# Computed from the dataloader returned by accelerator.prepare().
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
progress_bar = tqdm.auto.tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # No manual device transfer here: the dataloader went through accelerator.prepare().
        outputs = model(**batch)
        loss = outputs.loss
        # The backward pass is driven through the accelerator instead of loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

On the command line:
```
accelerate config
accelerate launch train.py
```