In [1]:
import transformers
import torch

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import torch.nn as nn

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. First exploration

In [2]:
# Load the tokenizer and a sequence-classification model from the same checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The classifier weights are not part of this checkpoint (see the warning
# printed by this cell), so they are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Two example sentences used as a toy batch in the following cells.
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]

In [4]:
# Tokenize both sentences into a single padded/truncated batch of PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
# One label per sentence; the next cell reads `.loss` from the model output.
batch["labels"] = torch.tensor([1,1])

In [5]:
# One manual optimization step on the toy batch.
#
# from_pretrained() returns the model in evaluation mode, so switch to training
# mode first; otherwise dropout stays disabled during this step.
model.train()
optimizer = torch.optim.Adam(model.parameters())
# Clear any gradients left on the parameters so that re-running this cell does
# not add the new gradients on top of the ones from the previous run.
optimizer.zero_grad()
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [6]:
from datasets import load_dataset

In [7]:
# Load the "mrpc" configuration of the "glue" dataset and display its splits.
raw_dataset = load_dataset("glue","mrpc")
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [8]:
# Inspect the column types of the training split.
raw_dataset["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [9]:
# Look at a single training example (index 4).
raw_dataset["train"][4]

{'sentence1': 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .',
 'sentence2': 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .',
 'label': 1,
 'idx': 4}

# 2. Second exploration - Data processing

In [10]:
# Tokenize each sentence column of the training split on its own
# (the two sentences are not yet combined into pairs here).
tokenized_sentence_1 = tokenizer(raw_dataset['train']["sentence1"])
tokenized_sentence_2 = tokenizer(raw_dataset['train']["sentence2"])

In [11]:
# Passing two sentences in one call encodes them together as a pair; the
# displayed token_type_ids are 0 for the first sentence and 1 for the second.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Map the ids back to tokens to see where [CLS] and [SEP] were inserted.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [13]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` entries of ``example`` together.

    Args:
        example: A mapping providing ``"sentence1"`` and ``"sentence2"``.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on both
        entries with ``truncation=True``.
    """
    first = example['sentence1']
    second = example['sentence2']
    return tokenizer(first, second, truncation=True)

In [14]:
# Apply tokenize_function to every split; as the displayed result shows, each
# split gains input_ids, token_type_ids and attention_mask columns.
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [15]:
from transformers import DataCollatorWithPadding

# NOTE(review): the variable is spelled "collector" although it holds a
# DataCollatorWithPadding (a collator). The name is kept because a later cell
# calls it as `data_collector(samples)`.
data_collector = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# Take the first 8 training examples and drop the columns that are not needed
# for collation, then show the token count of each example.
samples = tokenized_dataset["train"][:8]
samples = {
    column: values
    for column, values in samples.items()
    if column not in ("idx", "sentence1", "sentence2")
}
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [17]:
# Collate the 8 samples into one batch and show the shape of every tensor in it.
batch = data_collector(samples)
dict((name, tensor.shape) for name, tensor in batch.items())

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

# 3. Fine-tuning


In [18]:
# Data processing summary: the steps explored in section 2, gathered in one
# cell. Note that the results here use plural names (`raw_datasets`).
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the two sentence entries of ``example`` as a pair.

    Args:
        example: A mapping with ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        The output of the notebook-level ``tokenizer`` called on both entries
        with truncation enabled.
    """
    sentence_pair = (example["sentence1"], example["sentence2"])
    return tokenizer(*sentence_pair, truncation=True)


# Tokenize every split with tokenize_function and build the padding collator
# that is handed to the Trainer in the cells below.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [19]:
# Training
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [20]:
from transformers import AutoModelForSequenceClassification

# Fresh model for fine-tuning, replacing the `model` used in the section 1
# experiment. num_labels=2 matches the two MRPC classes
# ('not_equivalent', 'equivalent') shown in the dataset features.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import Trainer

# Build the Trainer from the objects prepared in this section.
# The datasets come from `tokenized_datasets` (created in the data-processing
# summary cell above) rather than `tokenized_dataset` from section 2, so this
# section no longer depends on state left behind by the exploration cells.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [22]:
# Train the model. No compute_metrics function was given to this Trainer, so
# the logged output reports only the training loss.
trainer.train()

  0%|          | 0/1377 [00:00<?, ?it/s]

Checkpoint destination directory test-trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.5773, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


Checkpoint destination directory test-trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.3608, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'train_runtime': 135.0117, 'train_samples_per_second': 81.504, 'train_steps_per_second': 10.199, 'train_loss': 0.40138877364458514, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.40138877364458514, metrics={'train_runtime': 135.0117, 'train_samples_per_second': 81.504, 'train_steps_per_second': 10.199, 'train_loss': 0.40138877364458514, 'epoch': 3.0})

In [23]:
# Evaluation: run the trained model over the validation split.
# Uses `tokenized_datasets` from the data-processing summary cell so that this
# section does not rely on the `tokenized_dataset` variable from section 2.
predictions = trainer.predict(tokenized_datasets["validation"])
# Show the shapes of the raw model outputs and of the reference labels.
print(predictions.predictions.shape, predictions.label_ids.shape)

  0%|          | 0/51 [00:00<?, ?it/s]

(408, 2) (408,)


In [24]:
# Turn the two per-class scores of each example into a predicted class index.
preds = predictions.predictions.argmax(axis=-1)
preds

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

## Evaluation

In [25]:
import evaluate

# Load the GLUE/MRPC metric and score the predicted classes against the
# reference labels returned by trainer.predict.
metric = evaluate.load("glue","mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8602941176470589, 'f1': 0.9032258064516129}

In [26]:
def compute_metrics(eval_preds):
    """Score one round of evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever the loaded metric's ``compute`` returns for the arg-max class
        of ``logits`` (taken over the last axis) against ``labels``.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_mrpc.compute(predictions=predicted_classes, references=labels)

In [27]:
# Same setup as the first run, but evaluate at the end of every epoch and
# report the metrics returned by compute_metrics.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Reload the checkpoint so this run starts from a fresh model instead of the
# one that was already fine-tuned above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# The datasets come from `tokenized_datasets` (created in the data-processing
# summary cell) rather than `tokenized_dataset` from section 2, so this
# section does not depend on state left behind by the exploration cells.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Train again; with evaluation_strategy="epoch" the output now also reports
# eval loss, accuracy and F1 after every epoch.
trainer.train()

  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.42281776666641235, 'eval_accuracy': 0.8259803921568627, 'eval_f1': 0.8743362831858407, 'eval_runtime': 3.6186, 'eval_samples_per_second': 112.75, 'eval_steps_per_second': 14.094, 'epoch': 1.0}


Checkpoint destination directory test-trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.5638, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.39905601739883423, 'eval_accuracy': 0.8480392156862745, 'eval_f1': 0.8963210702341137, 'eval_runtime': 3.4901, 'eval_samples_per_second': 116.902, 'eval_steps_per_second': 14.613, 'epoch': 2.0}


Checkpoint destination directory test-trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.3749, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.513420045375824, 'eval_accuracy': 0.8529411764705882, 'eval_f1': 0.8951048951048951, 'eval_runtime': 3.6493, 'eval_samples_per_second': 111.801, 'eval_steps_per_second': 13.975, 'epoch': 3.0}
{'train_runtime': 146.3342, 'train_samples_per_second': 75.198, 'train_steps_per_second': 9.41, 'train_loss': 0.40422961775946636, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.40422961775946636, metrics={'train_runtime': 146.3342, 'train_samples_per_second': 75.198, 'train_steps_per_second': 9.41, 'train_loss': 0.40422961775946636, 'epoch': 3.0})