In [5]:
import os

# This flag must be in the environment before `datasets` / `transformers` are
# imported: huggingface_hub reads its environment flags when it is first
# imported, so setting the variable afterwards does not silence the warning.
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

from datasets import load_dataset, DatasetDict  # noqa: E402  (must follow the env setup)
from transformers import AutoTokenizer  # noqa: E402

In [13]:
def sample(dataset: DatasetDict, n: int = 100, seed=None) -> DatasetDict:
    """Return a shuffled subset of the "train" and "test" splits.

    Args:
        dataset: Mapping that provides a "train" and a "test" split.
        n: Maximum number of rows to keep from each split (default 100, the
            value that used to be hard-coded). A split with fewer than ``n``
            rows is returned whole (still shuffled) instead of raising.
        seed: Optional seed forwarded to ``shuffle`` so the sample can be
            reproduced. ``None`` (the default) keeps the previous unseeded
            behaviour.

    Returns:
        A new ``DatasetDict`` holding only the sampled "train" and "test"
        splits.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    def _take(split):
        # Clamp to the split size so select() never indexes past the end.
        keep = min(n, len(split))
        return split.shuffle(seed=seed).select(range(keep))

    return DatasetDict({name: _take(dataset[name]) for name in ("train", "test")})

# Checkpoint whose vocabulary/tokenisation rules are loaded below.
TOKENIZER_CHECKPOINT = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenise_fn(dataset):
    """Tokenise the "text" field with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is handed back unchanged.
    In this notebook the function is passed to ``map(..., batched=True)``.
    """
    texts = dataset["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

def postprocess(dataset: DatasetDict) -> DatasetDict:
    """Prepare a tokenised dataset for a PyTorch training loop.

    Drops the "text" column, renames "label" to "labels", and switches the
    output format to "torch".

    Returns:
        The dataset produced by the column operations, with the "torch"
        format applied.
    """
    prepared = dataset.remove_columns(["text"]).rename_column("label", "labels")
    # set_format is applied to the object returned by the column operations,
    # and its return value is not used.
    prepared.set_format("torch")
    return prepared

In [14]:
# Load -> subsample -> tokenise -> format. Each stage gets its own name; the
# final result stays bound to `dataset`, which the later cells read.
raw_reviews = load_dataset("yelp_review_full")
sampled_reviews = sample(raw_reviews)
tokenised_reviews = sampled_reviews.map(tokenise_fn, batched=True)
dataset = postprocess(tokenised_reviews)

Map: 100%|██████████| 100/100 [00:00<00:00, 1375.63 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1606.53 examples/s]


In [16]:
from torch.utils.data import DataLoader
# Both loaders use the same batch size; only the training loader shuffles.
BATCH_SIZE = 8
train_dataloader = DataLoader(dataset["train"], batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(dataset["test"], batch_size=BATCH_SIZE)

In [18]:
from transformers import AutoModelForSequenceClassification

# num_labels=5 sizes the classification head.
# NOTE(review): presumably one class per star rating in yelp_review_full — confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from torch.optim import AdamW
from transformers import get_scheduler
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# One scheduler step is taken per batch, so the schedule length is
# (batches per epoch) x (epochs).
num_epochs = 1
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update weights, advance the learning-rate schedule, then clear the
        # gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
import evaluate

metric = evaluate.load("accuracy")

model.eval()
for batch in eval_dataloader:
    # Move every tensor in the batch onto the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    # No gradients are needed for the forward pass during evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last dimension.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell, so the notebook displays the computed metric.
metric.compute()