In [1]:
from pprint import pprint
import torch
from torch import optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
import evaluate
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm.auto import tqdm

In [2]:
# Checkpoint name shared by the model and its tokenizer
checkpoint = "bert-base-uncased"
# The classification head on top of the pretrained encoder is newly initialized,
# which is why a "You should probably TRAIN this model" warning is printed
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Single-Step Training

In [4]:
# Build a small example batch: two sentences tokenized together,
# padded to a common length and returned as PyTorch tensors
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True)

# Add labels (one per sentence) so the forward pass can also return a loss
batch["labels"] = torch.tensor([1, 1])

In [None]:
# Train on a single batch
# from_pretrained() hands back the model in evaluation mode (dropout disabled),
# so switch to training mode before taking an optimization step
model.train()
optimizer = optim.AdamW(model.parameters())
# Clear gradients left on the parameters by a previous run of this cell;
# otherwise backward() would accumulate on top of them
optimizer.zero_grad()
loss = model(**batch).loss  # the batch carries "labels", so the output includes a loss
loss.backward()
optimizer.step()

## Paraphrase Classifier

### Exploration

In [23]:
# Load the MRPC configuration of the GLUE dataset
raw_datasets = load_dataset("glue", "mrpc")

# Display the split overview, the training split, and the first training example
(
    raw_datasets,
    raw_datasets["train"],
    raw_datasets["train"][0],
)

(DatasetDict({
     train: Dataset({
         features: ['sentence1', 'sentence2', 'label', 'idx'],
         num_rows: 3668
     })
     validation: Dataset({
         features: ['sentence1', 'sentence2', 'label', 'idx'],
         num_rows: 408
     })
     test: Dataset({
         features: ['sentence1', 'sentence2', 'label', 'idx'],
         num_rows: 1725
     })
 }),
 Dataset({
     features: ['sentence1', 'sentence2', 'label', 'idx'],
     num_rows: 3668
 }),
 {'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  'label': 1,
  'idx': 0})

In [25]:
# Inspect the column schema of the training split;
# the 'label' entry lists the class names behind the integer labels
raw_datasets["train"].features

{'sentence1': Value('string'),
 'sentence2': Value('string'),
 'label': ClassLabel(names=['not_equivalent', 'equivalent']),
 'idx': Value('int32')}

In [49]:
# Tokenize the two sentences of the first training example independently
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentence1, tokenized_sentence2 = (
    tokenizer(raw_datasets["train"][0][column])
    for column in ("sentence1", "sentence2")
)

# One encoding per line
print(tokenized_sentence1, tokenized_sentence2, sep="\n")

{'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [51]:
# Show the first raw training example (both sentences, label and index)
raw_datasets["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [42]:
# Tokenize both sentences together as one pair, the form fed into the model.
# Notice that token_type_ids now contains 0s and 1s, marking which sentence
# each token belongs to.
inputs = tokenizer(
    raw_datasets["train"][0]["sentence1"],
    raw_datasets["train"][0]["sentence2"],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
pprint(inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]]),
 'input_ids': tensor([[  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
          2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
          3436,  2010,  3350,  1012,   102,  7727,  2000,  2032,  2004,  2069,
          1000,  1996,  7409,  1000,  1010,  2572,  3217,  5831,  5496,  2010,
          2567,  1997,  9969,  4487, 23809,  3436,  2010,  3350,  1012,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}


In [None]:
# Map the token IDs of the paired encoding back to their token strings
# Notice the [SEP] token, which marks where the first sentence ends and the second one starts
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))

['[CLS]', 'am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '"', 'the', 'witness', '"', ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]', 'referring', 'to', 'him', 'as', 'only', '"', 'the', 'witness', '"', ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]']


### Full Preprocessing

In [10]:
# Load the tokenizer (the model itself is loaded later, in the training cells)
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenizes sentence pairs; meant to be passed to Dataset.map
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` columns of ``example`` as pairs.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        The output of the module-level ``tokenizer``, called with truncation
        enabled and without padding.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

# The collate function assembles individual examples into batches.
# Examples are tokenized without padding, so padding is applied dynamically
# per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the raw data and tokenize every split in batches
raw_datasets = load_dataset("glue", "mrpc")
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

### Training and Evaluation

#### Using Trainer Class

In [None]:
def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE MRPC metric.

    Args:
        eval_preds: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call on the arg-max class
        predictions versus the reference labels.
    """
    mrpc_metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit on the last axis
    predicted_classes = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predicted_classes, references=labels)

# Train with compute_metrics and per-epoch evaluation, so every epoch reports
# the validation loss plus the metrics (accuracy and F1)
training_args = TrainingArguments(
    output_dir="test-trainer",
    eval_strategy="epoch",
    fp16=True,
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Passing the tokenizer as processing_class already makes Trainer default to
# DataCollatorWithPadding; the collator is still passed explicitly to show its importance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.357098,0.857843,0.902027
2,0.518100,0.554072,0.840686,0.892562
3,0.270400,0.704931,0.852941,0.896552


TrainOutput(global_step=1377, training_loss=0.32329054024224707, metrics={'train_runtime': 34360.0159, 'train_samples_per_second': 0.32, 'train_steps_per_second': 0.04, 'total_flos': 405114969714960.0, 'train_loss': 0.32329054024224707, 'epoch': 3.0})

#### Using Raw PyTorch

In [11]:
# Re-tokenize from the raw data so this cell can be re-run without failing
# (the columns removed/renamed below would otherwise already be gone).
# Drop the columns the model does not take, and rename "label" to "labels"
# because the PyTorch model expects that argument name.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# Return PyTorch tensors instead of Python lists
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [12]:
# Shuffle only the training data; evaluation gains nothing from shuffling,
# and a fixed order keeps validation batches reproducible
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator)
val_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, batch_size=8, collate_fn=data_collator)

# Peek at one training batch to check the tensor shapes; the sequence length
# can differ between batches because the collator pads each batch dynamically
batch = next(iter(train_dataloader))
pprint({k: v.shape for k, v in batch.items()})

{'attention_mask': torch.Size([8, 72]),
 'input_ids': torch.Size([8, 72]),
 'labels': torch.Size([8]),
 'token_type_ids': torch.Size([8, 72])}


In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fresh model with a two-class head, moved onto the selected device
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
num_epochs = 3
# One optimizer step per batch, for every epoch
num_training_steps = len(train_dataloader) * num_epochs
# "linear" schedule with no warmup steps, spanning all training steps
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

num_training_steps

1377

In [15]:
# One progress-bar tick per optimization step
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch includes "labels", so the output carries the loss
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then reset
        # the gradients for the next batch
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
  
# Evaluate on the validation set, accumulating predictions batch by batch
metric = evaluate.load("glue", "mrpc")
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in val_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit
        predictions = torch.argmax(outputs.logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

print(metric.compute())

  0%|          | 0/1377 [00:00<?, ?it/s]

{'accuracy': 0.8504901960784313, 'f1': 0.8939130434782608}
