# Practical classification with pre-trained BERT

In this notebook I download pre-trained BERT model and fine-tune it with high-level HuggingFace tools.

There is another notebook, doing the same with lower-level PyTorch tools only.

## References:
* https://huggingface.co/course/chapter3/4?fw=pt - HuggingFace transformers course reference

In [1]:
# minimal example of using a pre-trained model for classification

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)

torch.nn.functional.softmax(output.logits, dim=1)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

tensor([[0.5461, 0.4539],
        [0.5496, 0.4504]], grad_fn=<SoftmaxBackward0>)

In [2]:
import pandas as pd


essays = pd.read_csv("./data/essays.csv")

essays.loc[essays['cEXT'] == 'n', 'cEXT'] = 0
essays.loc[essays['cEXT'] == 'y', 'cEXT'] = 1

essays.loc[essays['cNEU'] == 'n', 'cNEU'] = 0
essays.loc[essays['cNEU'] == 'y', 'cNEU'] = 1

essays.loc[essays['cAGR'] == 'n', 'cAGR'] = 0
essays.loc[essays['cAGR'] == 'y', 'cAGR'] = 1

essays.loc[essays['cCON'] == 'n', 'cCON'] = 0
essays.loc[essays['cCON'] == 'y', 'cCON'] = 1

essays.loc[essays['cOPN'] == 'n', 'cOPN'] = 0
essays.loc[essays['cOPN'] == 'y', 'cOPN'] = 1

essays.astype({'cEXT': 'int32', 'cNEU': 'int32', 'cAGR': 'int32', 'cCON': 'int32', 'cOPN': 'int32'}).dtypes

essays

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1
...,...,...,...,...,...,...,...
2462,2004_493.txt,I'm home. wanted to go to bed but remembe...,0,1,0,1,0
2463,2004_494.txt,Stream of consiousnesssskdj. How do you s...,1,1,0,0,1
2464,2004_497.txt,"It is Wednesday, December 8th and a lot has be...",0,0,1,0,0
2465,2004_498.txt,"Man this week has been hellish. Anyways, now i...",0,1,0,0,1


In [30]:
import torch
from torch.utils.data import DataLoader, random_split, default_convert
from transformers import AdamW, AutoTokenizer, BertForSequenceClassification
from datasets import Dataset, DatasetDict


# prepare dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(essays):
    return tokenizer(essays["TEXT"], padding="max_length", truncation=True)  # , return_tensors="pt")

essays_dataset = Dataset.from_pandas(essays)
tokenized_dataset = essays_dataset.map(tokenize_function, batched=True, batch_size=8)
tokenized_dataset = tokenized_dataset.rename_column("TEXT", "text")
tokenized_dataset = tokenized_dataset.rename_column("cNEU", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(['#AUTHID', 'text', 'cEXT', 'cAGR', 'cCON', 'cOPN'])

train_dataset, validation_dataset = random_split(tokenized_dataset, [2000, len(tokenized_dataset) - 2000])

ds = DatasetDict()
ds['train'] = train_dataset
ds['validation'] = validation_dataset

# vocab = tokenizer.get_vocab()
# ivocab = {v: k for k, v in vocab.items()}
print(ds['train'][0]['input_ids'])

train_dataloader = DataLoader(ds['train'], shuffle=True, batch_size=8)


  0%|                                                            | 0/309 [00:00<?, ?ba/s][A
  8%|███▉                                              | 24/309 [00:00<00:01, 229.01ba/s][A
 15%|███████▌                                          | 47/309 [00:00<00:01, 229.14ba/s][A
 24%|███████████▊                                      | 73/309 [00:00<00:00, 240.76ba/s][A
 32%|███████████████▊                                  | 98/309 [00:00<00:00, 242.20ba/s][A
 40%|███████████████████▌                             | 123/309 [00:00<00:00, 242.07ba/s][A
 48%|███████████████████████▍                         | 148/309 [00:00<00:00, 242.22ba/s][A
 56%|███████████████████████████▍                     | 173/309 [00:00<00:00, 237.86ba/s][A
 64%|███████████████████████████████▏                 | 197/309 [00:00<00:00, 233.78ba/s][A
 72%|███████████████████████████████████              | 221/309 [00:00<00:00, 226.53ba/s][A
 79%|██████████████████████████████████████▋          | 244/309 [00:0

[101, 2061, 2651, 2038, 2042, 2028, 1997, 1996, 5409, 2420, 2412, 1012, 1045, 2179, 2041, 2008, 1045, 2134, 1005, 1056, 2079, 2008, 2307, 2006, 2026, 7366, 11360, 1015, 1012, 1998, 1045, 28163, 2134, 1005, 1056, 2079, 2008, 2307, 2006, 6370, 19461, 1015, 1012, 1045, 2031, 2042, 2667, 2000, 2079, 2023, 3653, 18182, 2075, 5002, 2005, 2058, 2048, 2847, 2085, 1998, 1045, 2572, 2145, 2025, 2589, 2007, 2009, 1012, 1045, 2572, 7501, 2205, 1012, 1045, 4033, 1005, 1056, 2018, 4596, 3892, 1012, 2092, 2025, 2664, 1012, 2061, 1996, 3653, 18182, 2075, 2291, 2003, 2428, 2025, 2551, 1998, 2009, 2003, 2437, 2033, 5506, 2085, 1012, 2023, 2003, 4689, 1012, 1045, 2123, 1005, 1056, 2113, 2054, 2000, 4339, 1012, 2049, 2524, 2000, 4339, 2005, 2322, 3371, 1012, 2009, 2038, 2069, 2042, 1017, 3371, 1012, 2023, 2003, 4689, 1012, 1045, 4687, 2129, 2146, 1045, 2097, 2131, 2000, 2224, 2023, 3274, 1012, 2821, 1010, 1045, 2123, 1005, 1056, 2113, 2065, 2026, 2465, 3642, 2003, 2157, 2030, 2025, 1010, 2021, 1045, 2228,




In [4]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# I'm running this on Apple Silicon. Activate Metal "mps" device, if available:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")


torch.device("mps")
model.to(mps_device)

model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [61]:
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.optim import AdamW


# parameters
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

cross_entropy_loss = torch.nn.CrossEntropyLoss().to(mps_device)

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


# test on one batch
# batch = next(iter(train_dataloader))

# labels = batch["labels"]
# del batch["labels"]

# batch = {k: torch.transpose(torch.stack(default_convert(v)), 0, 1) for k, v in batch.items()}
# batch = {k: v.to(mps_device) for k, v in batch.items()}

# output = model(**batch)
# labels.to(mps_device)
# mps_labels = torch.as_tensor(labels, device=mps_device)

# loss = cross_entropy_loss(output.logits, mps_labels)
# loss.backward()


# progress bar
progress_bar = tqdm(range(num_training_steps))

# training
for epoch in range(num_epochs):
    for batch in train_dataloader:
        labels = batch["labels"]
        mps_labels = torch.as_tensor(labels, device=mps_device)
        del batch["labels"]
        
        batch = {k: torch.transpose(torch.stack(default_convert(v)), 0, 1) for k, v in batch.items()}
        batch = {k: v.to(mps_device) for k, v in batch.items()}

        output = model(**batch)

        loss = cross_entropy_loss(output.logits, mps_labels)
        loss.backward()        

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|                                                            | 0/750 [02:42<?, ?it/s]
  0%|                                                          | 0/750 [5:06:36<?, ?it/s]
100%|████████████████████████████████████████████████| 750/750 [1:21:52<00:00,  6.35s/it]

In [None]:
from datasets import load_metric


metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [67]:
from datasets import load_metric


validation_dataloader = DataLoader(ds['validation'], shuffle=True, batch_size=8)

metric = load_metric("accuracy")
model.eval()
for batch in validation_dataloader:
    labels = batch["labels"]
    mps_labels = torch.as_tensor(labels, device=mps_device)
    del batch["labels"]

    batch = {k: torch.transpose(torch.stack(default_convert(v)), 0, 1) for k, v in batch.items()}
    batch = {k: v.to(mps_device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    softmax = torch.nn.Softmax()
    for index, item in enumerate(logits):
        print(f"probabilities = {softmax(item)}")
        print(f"label = {labels[index]}")
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=mps_labels)

metric.compute()

  print(f"probabilities = {softmax(item)}")


probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 0
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label = 1
probabilities = tensor([0.4973, 0.5027], device='mps:0')
label

{'accuracy': 0.48394004282655245}