# Instructions to use this notebook
This notebook follows steps from https://huggingface.co/learn/nlp-course/chapter7/7?fw=pt

There are 2 other important files in the same folder:
- model.py (contains our custom model architecture)
- utils.py (contains our preprocessing and metric functions)

If you want to define your own custom dense layers at the end of the model, you only need to change the `CustomDenseLayers` class in `model.py`.

# Install & Import necessary libraries

In [15]:
# Install Hugging Face libraries
! pip install datasets transformers accelerate evaluate



In [1]:
# Import Std and Huggingface libraries
from tqdm.auto import tqdm
import numpy as np

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import default_data_collator, get_scheduler
from transformers import AutoTokenizer
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


Please re-run the cell below whenever you make changes to `model.py` or `utils.py`.

In [2]:
# Import custom models and helper functions
%load_ext autoreload
%autoreload 2
from model import DistilBertCustomDense
import utils

# Set seed and global variables

In [3]:
### Set seed
# Seed the PyTorch and NumPy global random number generators with the same value.
SEED = 42
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(SEED)

In [4]:
### Global configuration
# Every tunable setting used by the cells below lives in this one dictionary.
CONFIG = dict(
    # --- Pretrained weight ---
    model_checkpoint="distilbert-base-uncased",  # Hugging Face checkpoint to start from
    # --- Data preprocessing ---
    max_length=384,
    stride=128,
    # --- Training ---
    batch_size=32,
    num_train_epochs=2,
    output_dir="frozen-distilbert-custom",  # directory the trained model is saved to
    # --- Evaluation ---
    n_best=20,
    max_answer_length=30,
)

In [5]:
### Load tokenizer
# The tokenizer is loaded from the same checkpoint name that CONFIG specifies for the model.
checkpoint_name = CONFIG["model_checkpoint"]
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Data preprocessing and preparation

## Preprocess and prepare the training dataset

In [None]:
# Load the "squad" dataset; the cells below read its "train" and "validation" splits.
dataset_name = "squad"
raw_datasets = load_dataset(dataset_name)

In [6]:
# Run utils.preprocess_training_examples over the training split in batches,
# replacing the original columns with the generated features.
def _featurize_train_batch(examples):
    return utils.preprocess_training_examples(examples, tokenizer, CONFIG)


train_split = raw_datasets["train"]
train_dataset = train_split.map(
    _featurize_train_batch,
    batched=True,
    remove_columns=train_split.column_names,
)
train_dataset.set_format("torch")

# Shuffled batches for the training loop.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=True,
    collate_fn=default_data_collator,
)

Map: 100%|██████████| 87599/87599 [00:31<00:00, 2786.48 examples/s]


## Preprocess and prepare the validation dataset

In [7]:
# Run utils.preprocess_validation_examples over the validation split in batches,
# replacing the original columns with the generated features.
def _featurize_validation_batch(examples):
    return utils.preprocess_validation_examples(examples, tokenizer, CONFIG)


validation_split = raw_datasets["validation"]
validation_dataset = validation_split.map(
    _featurize_validation_batch,
    batched=True,
    remove_columns=validation_split.column_names,
)

# The dataloader is fed a version without "example_id" and "offset_mapping";
# `validation_dataset` is kept as-is for metric computation later on.
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")
eval_dataloader = DataLoader(
    validation_set,
    batch_size=CONFIG["batch_size"],
    collate_fn=default_data_collator,
)

Map: 100%|██████████| 10570/10570 [00:05<00:00, 1949.82 examples/s]


# Load model and freeze/unfreeze bert layers

You can comment out `model.freeze_distilbert()` if you want to fine-tune the DistilBERT layers as well.

In [8]:
# Instantiate our custom architecture from the pretrained checkpoint named in CONFIG
pretrained_name = CONFIG["model_checkpoint"]
model = DistilBertCustomDense.from_pretrained(pretrained_name)

# Freeze the bert part (comment this call out to fine-tune the bert layers as well)
model.freeze_distilbert()

You are using a model of type distilbert to instantiate a model of type distilbertcustom. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertCustomDense were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.fc.weight', 'qa_outputs.fc.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepare optimizer, scheduler, accelerator

In [9]:
# Optimizer
# Only hand the parameters that are still trainable to AdamW. When the backbone is
# frozen (presumably via requires_grad=False in `model.freeze_distilbert()` — confirm
# in model.py) this keeps frozen weights out of the optimizer; when nothing is frozen
# it selects every parameter, exactly as before.
# NOTE(review): lr=2e-5 is kept unchanged; the recorded EM/F1 with the frozen backbone
# are very low, so a larger learning rate for head-only training may be worth trying.
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_parameters, lr=2e-5)

In [10]:
# Accelerator
# Create an fp16 mixed-precision accelerator and pass the model, optimizer and both
# dataloaders through `prepare`, rebinding each name to the prepared object.
accelerator = Accelerator(mixed_precision="fp16")
prepared_objects = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)
model, optimizer, train_dataloader, eval_dataloader = prepared_objects

In [11]:
# Scheduler
# One scheduler step is taken per training batch, so the schedule spans
# (number of epochs) x (batches per epoch) steps, with no warmup.
batches_per_epoch = len(train_dataloader)
total_training_steps = CONFIG["num_train_epochs"] * batches_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Main Training Loop

In [None]:
# A single progress bar covering every training batch of every epoch;
# the training loop advances it manually.
total_progress_steps = CONFIG["num_train_epochs"] * len(train_dataloader)
progress_bar = tqdm(range(total_progress_steps))

In [12]:
# For each epoch: train on every batch, evaluate on the validation set,
# report the metrics and save the model and tokenizer to CONFIG["output_dir"].
for epoch in range(CONFIG["num_train_epochs"]):
    # ---- Training ----
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # The backward pass is delegated to the accelerator instead of loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- Evaluation ----
    model.eval()
    start_logits = []
    end_logits = []
    accelerator.print("Evaluation!")
    for batch in tqdm(eval_dataloader):
        # No gradients are needed for inference
        with torch.no_grad():
            outputs = model(**batch)

        # Collect the logits from all processes and move them to host memory
        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    # Keep at most one row per validation feature.
    # NOTE(review): presumably this drops extra rows introduced when gathering
    # across processes — confirm.
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    metrics = utils.compute_metrics(
        CONFIG, start_logits, end_logits, validation_dataset, raw_datasets["validation"]
    )
    # Use accelerator.print (as for "Evaluation!" above) so the metrics are reported
    # once rather than once per process when launched with several processes.
    accelerator.print(f"epoch {epoch}:", metrics)

    # ---- Save model ----
    # Wait for all processes before writing, then save the unwrapped model;
    # only the main process writes the tokenizer files.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(CONFIG["output_dir"], save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(CONFIG["output_dir"])

  0%|          | 0/5534 [00:00<?, ?it/s]

 50%|████▉     | 2766/5534 [05:39<05:44,  8.04it/s]

Evaluation!


100%|██████████| 337/337 [00:37<00:00,  9.01it/s]
100%|██████████| 10570/10570 [00:14<00:00, 710.06it/s]


epoch 0: {'exact_match': 1.608325449385052, 'f1': 7.281976176451196}


100%|█████████▉| 5533/5534 [12:28<00:00,  7.98it/s]   

Evaluation!


100%|██████████| 337/337 [00:37<00:00,  9.03it/s]
100%|██████████| 10570/10570 [00:14<00:00, 715.89it/s]


epoch 1: {'exact_match': 2.166508987701041, 'f1': 8.142934759430274}


# Inference using trained model

In [13]:
# Import necessary classes for pipeline
from model import DistilBertCustomDense, DistilBertCustomConfig
from transformers import DistilBertTokenizer, AutoModelForQuestionAnswering, pipeline

# Associate our custom config class with our custom model class in the
# AutoModelForQuestionAnswering registry.
# NOTE(review): presumably this is what lets the "question-answering" pipeline
# accept the custom model class — confirm.
AutoModelForQuestionAnswering.register(DistilBertCustomConfig, DistilBertCustomDense)

In [14]:
# Reload the model and tokenizer from the directory written during training
model_checkpoint = CONFIG["output_dir"]
model = DistilBertCustomDense.from_pretrained(model_checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

# Wrap both in a question-answering pipeline
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Sample question/context pair used to try the pipeline out
question = "Which deep learning libraries back 🤗 Transformers?"
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
qa_inputs = {"question": question, "context": context}

# Last expression of the cell, so the prediction is displayed as the cell output
question_answerer(**qa_inputs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'score': 0.004100712016224861, 'start': 83, 'end': 90, 'answer': 'PyTorch'}