# Preparing the dataset for finetuning

In [1]:
from datasets import load_dataset

# Load the "imdb" dataset by name with `datasets.load_dataset`. The result is
# tokenized with `.map` in the next cell and then indexed by "train" / "test".
raw_datasets = load_dataset("imdb")

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer once, at cell level. `map(..., batched=True)` calls
# `tokenize_function` once per batch, so building the tokenizer inside the
# function would re-instantiate it for every batch of every split.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Args:
        example: A batch passed in by `Dataset.map`; with `batched=True`,
            `example["text"]` holds the texts of the whole batch.

    Returns:
        The tokenizer's output for the batch, with every sequence padded or
        truncated to exactly 128 tokens so all rows have the same length.
    """
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Apply the tokenizer to every split; the new columns are added to each row.
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)


In [9]:
# Keep handles on the complete tokenized splits.
full_train_ds, full_eval_ds = tokenized_dataset["train"], tokenized_dataset["test"]

# 1000-example subsets for quick experiments; the fixed seed makes the
# shuffled selection reproducible across runs.
small_train_ds = full_train_ds.shuffle(seed=42).select(range(1000))
small_eval_ds = full_eval_ds.shuffle(seed=42).select(range(1000))

# Finetuning in PyTorch with the Trainer API

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained checkpoint to start from (the same name used for tokenization).
checkpoint = "bert-base-cased"

# Build a sequence-classification model with a 2-label head on top of the
# pretrained weights; the head itself is newly initialized and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
import numpy as np
import evaluate

# Accuracy metric object, loaded once and shared by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max predictions
        compared against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )
    
    

# Training configuration: 5 epochs, evaluating at the end of each epoch, with
# checkpoints and logs written under "ft_model".
training_args = TrainingArguments(
    num_train_epochs=5,
    eval_strategy="epoch",
    output_dir="ft_model",
)

# Wire the model, the small train/eval subsets and the accuracy callback
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=small_train_ds,
    eval_dataset=small_eval_ds,
)

In [22]:
# Run fine-tuning with the configuration in `training_args`; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.425249,0.812
2,No log,0.67806,0.771
3,No log,0.83696,0.811
4,0.304200,0.800151,0.821
5,0.304200,0.998374,0.824


TrainOutput(global_step=625, training_loss=0.2533877250671387, metrics={'train_runtime': 64.0433, 'train_samples_per_second': 78.072, 'train_steps_per_second': 9.759, 'total_flos': 328888819200000.0, 'train_loss': 0.2533877250671387, 'epoch': 5.0})

In [23]:
# Evaluate the fine-tuned model on the Trainer's `eval_dataset`
# (`small_eval_ds`); the displayed dict includes the accuracy from `compute_metrics`.
trainer.evaluate()

{'eval_loss': 0.9983739852905273,
 'eval_accuracy': 0.824,
 'eval_runtime': 2.7464,
 'eval_samples_per_second': 364.119,
 'eval_steps_per_second': 45.515,
 'epoch': 5.0}

In [25]:
from fine_tuning import prepare_dataset, fine_tune

# NOTE(review): the saved output of this cell ends with
# "NameError: name 'small_train_ds' is not defined". This notebook defines
# `small_train_ds` in an earlier cell, so the error presumably originates inside
# the `fine_tuning` module (not visible here), which cannot see notebook
# globals. Confirm and fix it there, e.g. by having `fine_tune` take its
# datasets from the `ds` argument.
ds = prepare_dataset()
fine_tune(ds)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'small_train_ds' is not defined