In [2]:
import os
import numpy as np
import torch


In [3]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Checkpoint name handed to from_pretrained() for both the tokenizer and the classifier.
MODEL_NAME = "bert-base-uncased"
# Passed to the tokenizer as max_length; with truncation=True longer inputs are cut to this size.
MAX_LEN = 256
# Seed given to set_seed() and to TrainingArguments(seed=...).
SEED = 42

In [5]:
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    The same value is applied, in order, to ``random``, ``np.random``,
    ``torch`` and ``torch.cuda`` (all devices).

    Args:
        seed: Integer seed applied to every generator.
    """
    import random as py_random

    # Order matches the original sequence of seeding calls.
    seeders = (
        py_random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed the Python, NumPy and PyTorch RNGs once, before data loading and model creation.
set_seed(SEED)

In [6]:
# IMDb dataset: binary sentiment classification.
# Later cells read its "train" and "test" splits and its "text" column.
ds = load_dataset("imdb")

# Tokenizer for the MODEL_NAME checkpoint; use_fast=True requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [7]:
def tokenize_fn(batch):
    """Tokenize the ``"text"`` entry of a batch with the module-level tokenizer.

    Truncation is enabled with ``max_length=MAX_LEN``. No padding argument is
    passed here.

    Args:
        batch: Mapping with a ``"text"`` entry (presumably a list of strings,
            since this function is used with ``ds.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = batch["text"]
    encoding = tokenizer(texts, max_length=MAX_LEN, truncation=True)
    return encoding

In [8]:
# Run tokenize_fn over the dataset in batches and drop the raw "text" column from the result.
ds_tok = ds.map(tokenize_fn, batched=True, remove_columns=["text"])

# Collator built around the same tokenizer. tokenize_fn requests no padding, so padding
# is presumably applied per batch here -- confirm against the collator documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric from the `evaluate` library; compute_metrics calls accuracy.compute().
accuracy = evaluate.load("accuracy")

Map: 100%|██████████| 25000/25000 [00:03<00:00, 7966.26 examples/s]


In [9]:
def compute_metrics(eval_pred):
    """Turn raw model scores into class predictions and score them for accuracy.

    Args:
        eval_pred: Two-item iterable that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions
        (presumably a dict with an ``"accuracy"`` entry -- confirm against
        the ``evaluate`` metric).
    """
    scores, gold_labels = eval_pred
    # The highest-scoring class along the last axis is the predicted label.
    predicted_labels = np.argmax(scores, axis=-1)
    return accuracy.compute(references=gold_labels, predictions=predicted_labels)


In [None]:
def train_full_finetune(output_dir: str = "/Users/ashishsinha/Documents/workspace/bert_full_ft"):
    """Fine-tune the whole ``MODEL_NAME`` classifier and save it to ``output_dir``.

    Loads a two-label sequence-classification model (no layers are frozen
    here), trains it with ``Trainer`` on ``ds_tok["train"]``, evaluates on
    ``ds_tok["test"]`` at the end of every epoch, reloads the checkpoint with
    the best accuracy, prints the final evaluation metrics and writes the
    model to ``output_dir``.

    NOTE(review): checkpoints are compared on ``ds_tok["test"]``, so the
    reported accuracy is measured on the same split that drives model
    selection. Consider a separate validation split.

    Args:
        output_dir: Directory for checkpoints and the final saved model.
            NOTE(review): the default is an absolute path on one developer's
            machine; pass an explicit directory when running anywhere else.

    Returns:
        The metrics returned by ``trainer.evaluate()``.
    """
    import inspect

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
    )

    # Newer transformers releases renamed ``evaluation_strategy`` to
    # ``eval_strategy`` and ``Trainer(tokenizer=...)`` to
    # ``Trainer(processing_class=...)``. Pick whichever keyword the installed
    # version accepts, preferring the new name, so the cell runs on both.
    training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in training_arg_names:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    trainer_arg_names = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_arg_names:
        tokenizer_key = "processing_class"
    else:
        tokenizer_key = "tokenizer"

    args = TrainingArguments(
        output_dir=output_dir,
        # Evaluation and saving share the "epoch" schedule; both are set
        # together because load_best_model_at_end is enabled below.
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=100,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        num_train_epochs=2,
        weight_decay=0.01,
        warmup_ratio=0.06,
        fp16=torch.cuda.is_available(),   # uses mixed precision if CUDA present
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        seed=SEED,
        **{eval_strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    metrics = trainer.evaluate()
    print("Full FT eval:", metrics)

    # Saves full model checkpoint (large, hundreds of MB)
    trainer.save_model(output_dir)
    return metrics


# Run full fine-tuning with the default output_dir; the call returns the evaluation metrics.
train_full_finetune()


