## 1. Import Libraries

In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

## 2. Choose pretrained model

In [13]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)  #Loads BERT with a classification head for binary output (num_labels=2)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


## 3. Preparing Dataset:

In [14]:
dataset = load_dataset("imdb")

def preprocess_function(examples):             # for tokenization
    return tokenizer(examples["text"], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

## 4. Finetuning the model

In [16]:
training_args = TrainingArguments(
    output_dir = "./results",
    eval_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    num_train_epochs = 3
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.213896,0.195888
2,0.128243,0.242605
3,0.066615,0.268499


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=4689, training_loss=0.14653386701574506, metrics={'train_runtime': 27537.9672, 'train_samples_per_second': 2.724, 'train_steps_per_second': 0.17, 'total_flos': 1.9733329152e+16, 'train_loss': 0.14653386701574506, 'epoch': 3.0})

## 5. Evaluation Model

In [17]:
results = trainer.evaluate()

In [18]:
print(f"Results keys: {results.keys()}")
print(f"Validation loss: {results['eval_loss']}")

Results keys: dict_keys(['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])
Validation loss: 0.2684987187385559
