In [1]:
import torch
import pandas as pd
import numpy as np

In [2]:
from datasets import load_dataset

# Load the "imdb" dataset through the Hugging Face `datasets` loader.
dataset = load_dataset(path="imdb")

# Show the schema of the training split as the cell's output.
dataset["train"].features

Found cached dataset imdb (/Users/louiechou/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [3]:
from transformers import BertTokenizerFast

# Load the fast BERT tokenizer published with the "bert-base-uncased" checkpoint.
# tokenize_function below reads this module-level name.
tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings
            to encode (``dataset.map(..., batched=True)`` supplies this).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts,
        with truncation enabled and padding set to ``"max_length"`` so
        every encoded sequence has the same length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply tokenize_function across the dataset; batched=True passes a batch of
# examples to each call instead of a single example.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /Users/louiechou/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-e0a0342ae289143d.arrow
Loading cached processed dataset at /Users/louiechou/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-fd8ddff947474c37.arrow
Loading cached processed dataset at /Users/louiechou/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-2a73e6194285aadb.arrow


In [4]:
# Subset sizes and the shuffle seed are named so they can be tuned in one place
# instead of being repeated as magic numbers in each call.
SUBSET_SEED = 42
TRAIN_SUBSET_SIZE = 10_000
EVAL_SUBSET_SIZE = 1_000

# Shuffle before selecting so each subset is a random sample rather than the
# first N rows; the fixed seed keeps the sample reproducible across runs.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=SUBSET_SEED)
    .select(range(TRAIN_SUBSET_SIZE))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=SUBSET_SEED)
    .select(range(EVAL_SUBSET_SIZE))
)

Loading cached shuffled indices for dataset at /Users/louiechou/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-95a7ea67f59766e0.arrow
Loading cached shuffled indices for dataset at /Users/louiechou/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-db88639656d75d2e.arrow


<a id='trainer'></a>

In [5]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# Read the class names from the dataset schema so the size of the
# classification head and the label mapping stored in the config always
# match the data, instead of relying on the library's default label count.
label_names = dataset["train"].features["label"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# Fine-tune from the pretrained checkpoint, applying the config built above
# so the label names are saved with the model.
# To train from randomly initialised weights instead, build the model with
# AutoModelForSequenceClassification.from_config(config).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=config,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [7]:
from transformers import TrainingArguments, Trainer

# Training configuration handed to the Trainer:
#   - checkpoints and the saved model go under "bert_pretrained"
#   - the optimizer is selected by name ("adamw_torch")
#   - evaluation runs once per epoch
#   - the Apple MPS backend is requested only when PyTorch reports it available
training_args = TrainingArguments(
    output_dir="bert_pretrained",
    optim="adamw_torch",
    evaluation_strategy="epoch",
    use_mps_device=torch.backends.mps.is_available(),
)

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric from the `evaluate` library; compute_metrics
# below calls metric.compute(...) on each evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted class ids
        (argmax over the last axis of the logits) against the labels.
    """
    raw_scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold_labels,
    )

In [8]:
# Wire the model, training configuration, data subsets and metric callback
# into a single Trainer.
#
# The tokenizer is passed as well so that trainer.save_model() writes the
# tokenizer files next to the model weights, making the output directory
# loadable on its own. Supplying it also makes Trainer pick a padding data
# collator by default; the inputs here are already padded to max length, so
# that padding step should be a no-op.
# NOTE(review): newer transformers releases rename this argument to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [9]:
# Run fine-tuning. Evaluation on the eval subset happens once per epoch, as
# set by evaluation_strategy="epoch" in training_args.
trainer.train()

  0%|          | 0/3750 [00:00<?, ?it/s]

{'loss': 0.4429, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 0.3538, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.2703028619289398, 'eval_accuracy': 0.897, 'eval_runtime': 43.0683, 'eval_samples_per_second': 23.219, 'eval_steps_per_second': 2.902, 'epoch': 1.0}
{'loss': 0.2559, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 0.2102, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.2097, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.31416159868240356, 'eval_accuracy': 0.925, 'eval_runtime': 41.9732, 'eval_samples_per_second': 23.825, 'eval_steps_per_second': 2.978, 'epoch': 2.0}
{'loss': 0.0981, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.0762, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.3839857578277588, 'eval_accuracy': 0.928, 'eval_runtime': 42.8276, 'eval_samples_per_second': 23.349, 'eval_steps_per_second': 2.919, 'epoch': 3.0}
{'train_runtime': 14674.2224, 'train_samples_per_second': 2.044, 'train_steps_per_second': 0.256, 'train_loss': 0.22652693277994793, 'epoch': 3.0}


TrainOutput(global_step=3750, training_loss=0.22652693277994793, metrics={'train_runtime': 14674.2224, 'train_samples_per_second': 2.044, 'train_steps_per_second': 0.256, 'train_loss': 0.22652693277994793, 'epoch': 3.0})

In [10]:
# Persist the fine-tuned model; with no argument it goes to the trainer's
# configured output directory.
trainer.save_model()

# Score the model on the complete test split rather than the smaller eval
# subset used during training; the metrics dict is the cell's displayed value.
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.39693766832351685,
 'eval_accuracy': 0.92648,
 'eval_runtime': 1213.4597,
 'eval_samples_per_second': 20.602,
 'eval_steps_per_second': 2.575,
 'epoch': 3.0}