# Fine-tuning BERT-base-uncased for sentiment analysis with the SetFit/imdb dataset
    model: https://huggingface.co/google-bert/bert-base-uncased
    dataset: https://huggingface.co/datasets/SetFit/imdb

## IMDB movie reviews dataset from SetFit/imdb
  - Movie reviews labeled positive/negative

In [1]:
from datasets import load_dataset

# Hugging Face Hub id of the IMDB movie-review sentiment dataset.
dataset_id = "SetFit/imdb"

# Load all available splits; the bare name on the last line displays the result.
dataset = load_dataset(dataset_id)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 25000
    })
})

## Data preprocessing
### Tokenize the raw text reviews using the BERT tokenizer

In [2]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Reviews are truncated and padded to a fixed length
    (``padding="max_length"`` with no explicit ``max_length``), so every
    encoded example has the same number of tokens.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; must
            contain a ``"text"`` entry holding the raw reviews.

    Returns:
        The tokenizer's encoding for the batch.
    """
    review_texts = examples["text"]
    encoded_batch = tokenizer(review_texts, truncation=True, padding="max_length")
    return encoded_batch


# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

### Format the data into the structure BERT expects
  - drop the unnecessary, untokenized columns (`text`, `label_text`)
  - rename the `label` column to `labels`, the name the model expects

In [3]:
# Keep only the model inputs and the target: drop the raw-text columns and
# rename "label" to "labels" in one chained expression.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text", "label_text"])
    .rename_column("label", "labels")
)

# Switch the output format in place so indexing yields PyTorch tensors.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})

### Use a subset of the reviews for faster training

In [4]:
subset_size = 10000  # number of reviews kept from each split
shuffle_seed = 42    # fixed seed so the same subset is drawn on every run


def _shuffled_subset(split_name):
    """Return the first ``subset_size`` rows of a split after a seeded shuffle."""
    shuffled_split = tokenized_datasets[split_name].shuffle(seed=shuffle_seed)
    return shuffled_split.select(range(subset_size))


small_train_dataset = _shuffled_subset("train")
small_eval_dataset = _shuffled_subset("test")

## Train BERT
### Load BERT with 2 labels for binary classification (positive/negative sentiment)

In [5]:
from transformers import AutoModelForSequenceClassification
import torch
import torch_directml

# Seed PyTorch's global RNG before the model is built. The classification head
# (classifier.weight / classifier.bias) is not part of the checkpoint and is
# randomly initialised, so without a seed every run starts from different
# weights and the reported accuracy cannot be reproduced. The value matches the
# seed already used for the dataset shuffles.
# NOTE(review): this covers CPU-side randomness (weight init and, presumably,
# DataLoader shuffling via the default generator); whether DirectML device ops
# such as dropout are also made deterministic is unverified.
torch.manual_seed(42)

# BERT encoder plus a fresh 2-way classification head (positive / negative).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Move the model to the default DirectML device.
device = torch_directml.device(torch_directml.default_device())
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Data loader

In [6]:
from torch.utils.data import DataLoader

batch_size = 12  # shared by the training and evaluation loaders

# Only the training loader shuffles; the evaluation loader uses the default order.
train_dataloader = DataLoader(small_train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=batch_size)

### Optimizer

In [7]:
from torch.optim import AdamW

# Initial learning rate; the scheduler created later adjusts it during training.
learning_rate = 5e-5

# Optimize every parameter of the model (encoder and classification head).
optimizer = AdamW(model.parameters(), lr=learning_rate)

### Learning rate scheduler

In [8]:
from transformers import get_scheduler

num_epochs = 10

# The training loop steps the scheduler once per batch, so the schedule
# length is (batches per epoch) x (epochs).
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * num_epochs

# Linear schedule with no warmup phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

## Training and Testing

### Training loop

In [10]:
from tqdm.auto import tqdm

# One progress tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before iterating.
model.train()
for _epoch in range(num_epochs):
    for cpu_batch in train_dataloader:
        # Move every tensor in the batch onto the training device.
        device_batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}

        # The batch carries "labels", and the model output exposes the loss.
        loss = model(**device_batch).loss
        loss.backward()

        # Update the weights, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

100%|██████████| 8340/8340 [3:44:34<00:00,  1.25s/it]  

### Evaluate

In [11]:
import evaluate

# Accuracy metric, accumulated batch by batch over the evaluation loader.
metric = evaluate.load("accuracy")

# Switch the model to evaluation mode before scoring.
model.eval()
for eval_batch in eval_dataloader:
    eval_batch = {name: tensor.to(device) for name, tensor in eval_batch.items()}

    # No gradients are needed for the forward pass during evaluation.
    with torch.no_grad():
        eval_outputs = model(**eval_batch)

    # Predicted class = index of the largest logit along the last dimension.
    predicted_labels = eval_outputs.logits.argmax(dim=-1)
    metric.add_batch(predictions=predicted_labels, references=eval_batch["labels"])

metric.compute()

{'accuracy': 0.9231}

In [12]:
# Save only the model's state dict (its weights), not the whole model object.
checkpoint_path = './bert-base-uncased_IMDB.pt'
torch.save(model.state_dict(), checkpoint_path)

# BERT performance results

## 10k training reviews, 10 epochs
    ~92.3% accuracy on a 10k-review subset of the test split