Part 2: Text Classification with Transformers

In [1]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from datasets import load_dataset
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

# Step 1: Fetch the IMDb movie reviews dataset.
# The returned object is indexed by split name ('train' / 'test') in the cells below.
dataset = load_dataset(path='imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
# Pull the review texts and their labels out of each split.
# The 'test' split is used as the validation set for this notebook.
#
# Each column is materialised with list(): depending on the installed
# `datasets` version, indexing a split by column name may hand back a lazy
# column object rather than a plain Python list, and the tokenizer used later
# validates that its input is a str / list / tuple. list() is a cheap no-op
# copy when the value is already a list, so this is safe on older versions too.
train_texts = list(dataset['train']['text'])
train_labels = list(dataset['train']['label'])
val_texts = list(dataset['test']['text'])
val_labels = list(dataset['test']['label'])

In [3]:
# Tokenizer matching the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both splits with identical settings (truncation and padding enabled).
# The training texts are tokenized first, then the validation texts.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts)
)



In [4]:
import torch
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field in ``encodings`` is converted to a tensor under its own
        key, and the label is stored under the ``'labels'`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an IMDbDataset (train first, then validation).
train_dataset, val_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [5]:
# Step 2: Build the Transformer-based model (BERT) that will be fine-tuned for
# text classification. The sequence-classification model is configured with
# two output labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Hyperparameters for the fine-tuning run. Values are unchanged from the
# original cell.
# NOTE(review): a per-device train batch size of 1 means one optimizer step per
# training example (per device), which makes an epoch very slow; consider
# raising it if memory allows.
_training_kwargs = dict(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    logging_dir='./logs',
    output_dir='./results',
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
)

# The "evaluate once per epoch" option was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases, and the old keyword was
# later removed. Try the current name first and fall back to the legacy name,
# so the cell runs on both old and new versions. An unknown keyword surfaces
# as TypeError from the TrainingArguments constructor.
try:
    training_args = TrainingArguments(eval_strategy='epoch', **_training_kwargs)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy='epoch', **_training_kwargs)

In [7]:
def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (scores whose last axis is reduced
            with ``argmax``) and ``label_ids`` (the reference labels).

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    references = p.label_ids
    # The predicted class is the index of the highest score along the last axis.
    predicted = p.predictions.argmax(-1)

    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

In [13]:
# Wire the model, training configuration, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning; left as the cell's last expression so the notebook displays
# whatever train() returns.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [14]:
# Evaluate the model on the validation set (the eval_dataset given to the
# Trainer). The result is kept in a variable so it can be inspected later.
eval_results = trainer.evaluate()

# Print evaluation results; the metrics returned by compute_metrics are
# expected to appear here alongside the loss — TODO confirm key names.
print(eval_results)

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 