<a href="https://colab.research.google.com/github/AanchalA/WeekendProjects/blob/main/Fastest_Model_for_the_Win.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Training Time Challenge (Going from 22.53 minutes to 2.75 minutes)  while maintaining the model’s prediction accuracy.

https://sebastianraschka.com/blog/2023/pytorch-faster.html

- Fine Tuning DistillBERT by replacing / adding an output layer and finetune the all model layers.

In [None]:
! pip install -q watermark transformers datasets torchmetrics lightning torchinfo

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.3/927.3 kB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from watermark import watermark

print(watermark(packages="torch,lightning,transformers", python=True))

Python implementation: CPython
Python version       : 3.11.11
IPython version      : 7.34.0

torch       : 2.5.1+cu124
lightning   : 2.5.0.post0
transformers: 4.48.3



# 1. Hyperparameter Setup

In [None]:
import os
import torch

SEED = 123
torch.manual_seed(SEED)

NUM_EPOCHS = 10
NUM_CLASSES = 2
LEARNING_RATE = 5e-5
NUM_WORKERS = os.cpu_count()

MODEL = "distilbert-base-uncased"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# torch.set_default_device(device=DEVICE)

In [None]:
print(f"Using Device: {DEVICE}")

Using Device: cuda


# 2. Data Setup

 IMDB movie review dataset - downloading and partitioning the dataset into 35,000 training examples, 5,000 validation set records, and 10,000 test records.

In [None]:
import os
import os.path as op

import sys
import time
import urllib
import tarfile
import numpy as np
import pandas as pd
from tqdm import tqdm
from packaging import version

In [None]:
def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return

    duration = time.time() - start_time
    progress_size = int(count * block_size)
    speed = progress_size / (1024.0**2 * duration)
    percent = count * block_size * 100.0 / total_size

    sys.stdout.write(
        f"\r{int(percent)}% | {progress_size / (1024.**2):.2f} MB "
        f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
    )
    sys.stdout.flush()

In [None]:
def download_dataset():
    source = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    target = "aclImdb_v1.tar.gz"

    if os.path.exists(target):
        os.remove(target)

    if not os.path.isdir("aclImdb") and not os.path.isfile("aclImdb_v1.tar.gz"):
        urllib.request.urlretrieve(source, target, reporthook)

    if not os.path.isdir("aclImdb"):

        with tarfile.open(target, "r:gz") as tar:
            tar.extractall()

In [None]:
def load_dataset_into_to_dataframe():
    basepath = "aclImdb"

    labels = {"pos": 1, "neg": 0}

    df = pd.DataFrame()

    with tqdm(total=50000) as pbar:
        for s in ("test", "train"):
            for l in ("pos", "neg"):
                path = os.path.join(basepath, s, l)
                for file in sorted(os.listdir(path)):
                    with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                        txt = infile.read()

                    if version.parse(pd.__version__) >= version.parse("1.3.2"):
                        x = pd.DataFrame(
                            [[txt, labels[l]]], columns=["review", "sentiment"]
                        )
                        df = pd.concat([df, x], ignore_index=False)

                    else:
                        df = df.append([[txt, labels[l]]], ignore_index=True)
                    pbar.update()
    df.columns = ["text", "label"]

    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))

    print("Class distribution:")
    np.bincount(df["label"].values)

    return df

In [None]:
def partition_dataset(df):
    df_shuffled = df.sample(frac=1, random_state=1).reset_index()

    df_train = df_shuffled.iloc[:35_000]
    df_val = df_shuffled.iloc[35_000:40_000]
    df_test = df_shuffled.iloc[40_000:]

    df_train.to_csv("train.csv", index=False, encoding="utf-8")
    df_val.to_csv("val.csv", index=False, encoding="utf-8")
    df_test.to_csv("test.csv", index=False, encoding="utf-8")

## 2.1. Defining PyTorch Dataset Class

In [None]:
from torch.utils.data import Dataset

In [None]:
class IMDBDataset(Dataset):
    def __init__(self, dataset_dict, partition_key="train"):
        self.partition = dataset_dict[partition_key]

    def __getitem__(self, index):
        return self.partition[index]

    def __len__(self):
        return self.partition.num_rows

## 2.2. Loading the Dataset

In [None]:
from datasets import load_dataset

In [None]:
download_dataset()
df = load_dataset_into_to_dataframe()
if not (op.exists("train.csv") and op.exists("val.csv") and op.exists("test.csv")):
    partition_dataset(df)

imdb_dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "val.csv",
        "test": "test.csv",
    },
)

100% | 80.23 MB | 3.71 MB/s | 21.62 sec elapsed

100%|██████████| 50000/50000 [00:46<00:00, 1066.32it/s]


Class distribution:


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## 2.3. Data Tokenization and Numericalization

In [None]:
from transformers import AutoTokenizer

In [None]:
def tokenize_text(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

 - Tokenizer input max length: 512
 - Tokenizer vocabulary size: 30522

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
print("Tokenizer input max length:", tokenizer.model_max_length, flush=True)
print("Tokenizer vocabulary size:", tokenizer.vocab_size, flush=True)

print("Tokenizing ...", flush=True)
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

del imdb_dataset

imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

os.environ["TOKENIZERS_PARALLELISM"] = "false"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522
Tokenizing ...


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

## 2.4. Set Up DataLoaders

In [None]:
from torch.utils.data import DataLoader

In [None]:
train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

In [None]:
print(train_dataset[0].keys())
print(f"label shape: {train_dataset[0]['label'].size(), train_dataset[0]['label'].ndim}")
print(f"input_ids shape: {train_dataset[0]['input_ids'].shape, train_dataset[0]['input_ids'].ndim}")
print(f"attention_mask shape: {train_dataset[0]['attention_mask'].shape, train_dataset[0]['attention_mask'].ndim}")

dict_keys(['label', 'input_ids', 'attention_mask'])
label shape: (torch.Size([]), 0)
input_ids shape: (torch.Size([512]), 1)
attention_mask shape: (torch.Size([512]), 1)


In [None]:
train_dataset[0]['label'], train_dataset[0]['input_ids'][:5], train_dataset[0]['attention_mask'][:5]

(tensor(1), tensor([ 101, 2043, 2057, 2318, 3666]), tensor([1, 1, 1, 1, 1]))

In [None]:
generator = torch.Generator(device=DEVICE)

train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=12,
        shuffle=True,
        # num_workers=NUM_WORKERS,
        drop_last=True,
        # generator=generator
    )

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=12,
    # num_workers=NUM_WORKERS,
    drop_last=True,
    # generator=generator
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=12,
    # num_workers=NUM_WORKERS,
    drop_last=True,
    # generator=generator
)

In [None]:
next(iter(train_loader))


{'label': tensor([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]),
 'input_ids': tensor([[  101,  1996,  6071,  ...,     0,     0,     0],
         [  101,  1045,  2572,  ...,     0,     0,     0],
         [  101,  2045,  2003,  ...,     0,     0,     0],
         ...,
         [  101,  1996, 12665,  ...,     0,     0,     0],
         [  101,  1996,  2466,  ...,     0,     0,     0],
         [  101,  5064,  1010,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

# 3. Loading and Initializing the Model

In [None]:
from torchinfo import summary
from transformers import AutoModelForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_CLASSES)
model.to(DEVICE)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Get a summary of Distil-BERT-uncased feature extractor for Food101 with 101 output classes (uncomment for full output)
# summary(model,
#         input_size=(1, 512, 768),
#         col_names=["input_size", "output_size", "num_params", "trainable"],
#         col_width=20,
#         row_settings=["var_names"])

In [None]:
model.loss_function
# model.loss_type

# 4. Starting with Plain PyTorch Model

In [None]:
import time
import torch
import torchmetrics
from tqdm.auto import tqdm

## 4.1. Defining the Training Function

In [None]:
def train(model, optimizer,
          train_loader, val_loader):

    for epoch in tqdm(range(NUM_EPOCHS)):
        train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES).to(DEVICE)

        for batch_index, batch_data in enumerate(train_loader):
            model.train()

            for data_type in ["input_ids", "attention_mask", "label"]:
                batch_data[data_type] = batch_data[data_type].to(DEVICE)        ## Moving "input_ids", "attention_mask", "label" to device

            ### FORWARD PASS AND BACKPROP

            outputs = model(batch_data["input_ids"],
                           attention_mask=batch_data["attention_mask"],
                           labels=batch_data["label"])

            optimizer.zero_grad()
            outputs["loss"].backward()
            optimizer.step()

            ### LOGGING
            if not batch_index % 300:
                print(f"Epoch: {epoch + 1: 04d} / {NUM_EPOCHS: 04d} | "
                      f"Batch {batch_index: 04d} / {len(train_loader): 04d} | "
                      f"Loss: {outputs['loss']:.4f}"
                )

            model.eval()
            with torch.inference_mode():
                predictions = torch.argmax(outputs["logits"], dim=1)
                train_acc.update(predictions, batch_data["label"])


        # VALIDATION LOOP
        model.eval()
        with torch.inference_mode():
            val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES).to(DEVICE)

            for batch_data in val_loader:
                for data_type in ["input_ids", "attention_mask", "label"]:
                    batch_data[data_type] = batch_data[data_type].to(DEVICE)

                outputs = model(batch_data["input_ids"],
                           attention_mask=batch_data["attention_mask"],
                           labels=batch_data["label"])

                predictions = torch.argmax(outputs["logits"], dim=1)
                val_acc.update(predictions, batch_data["label"])


        print(f"Epoch: {epoch + 1: 04d} / {NUM_EPOCHS: 04d} | "
              f"Train Accuracy: {train_acc.compute() * 100: .2f}% | "
              f"Val Accuracy: {val_acc.compute() * 100: .2f}%")

## 4.2. Finetuning the Model

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
model.to(DEVICE)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
start = time.time()

train(model=model, optimizer=optimizer,
      train_loader=train_loader, val_loader=val_loader)

end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed / 60: .2f} mins")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch:  001 /  010 |Batch  000 /  2916 |Loss: 0.7092
Epoch:  001 /  010 |Batch  300 /  2916 |Loss: 0.3572
Epoch:  001 /  010 |Batch  600 /  2916 |Loss: 0.5735
Epoch:  001 /  010 |Batch  900 /  2916 |Loss: 0.2086
Epoch:  001 /  010 |Batch  1200 /  2916 |Loss: 0.2498
Epoch:  001 /  010 |Batch  1500 /  2916 |Loss: 0.1524
Epoch:  001 /  010 |Batch  1800 /  2916 |Loss: 0.0925
Epoch:  001 /  010 |Batch  2100 /  2916 |Loss: 0.0874
Epoch:  001 /  010 |Batch  2400 /  2916 |Loss: 0.3659
Epoch:  001 /  010 |Batch  2700 /  2916 |Loss: 0.0424
Epoch:  001 /  010 |Train Accuracy:  89.51% |Val Accuracy:  92.73%
Epoch:  002 /  010 |Batch  000 /  2916 |Loss: 0.2697
Epoch:  002 /  010 |Batch  300 /  2916 |Loss: 0.0748
Epoch:  002 /  010 |Batch  600 /  2916 |Loss: 0.0247
Epoch:  002 /  010 |Batch  900 /  2916 |Loss: 0.0669
Epoch:  002 /  010 |Batch  1200 /  2916 |Loss: 0.0080
Epoch:  002 /  010 |Batch  1500 /  2916 |Loss: 0.0632
Epoch:  002 /  010 |Batch  1800 /  2916 |Loss: 0.0188
Epoch:  002 /  010 |Bat

## 4.3 Model Evaluation on Test Dataset

In [None]:
model.eval()
with torch.inference_mode():
    test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES).to(DEVICE)

    for batch_data in test_loader:
        for data_type in ["input_ids", "attention_mask", "label"]:
            batch_data[data_type] = batch_data[data_type].to(DEVICE)

        outputs = model(batch_data["input_ids"],
                    attention_mask=batch_data["attention_mask"],
                    labels=batch_data["label"])

        predictions = torch.argmax(outputs["logits"], dim=1)
        test_acc.update(predictions, batch_data["label"])

print(f"Test Accuracy: {test_acc.compute() * 100: .2f}%")

As we can see above, the `model starts overfitting slightly from epochs 2 to 3`, and the `validation accuracy decreased from 92.09% to 89.88%`. The final `test accuracy is 89.92%`, which we reached after finetuning the model for 21.33 min.

# 5. Using the Trainer Class from PyTorch Lightning

The main change is in the code section where we finetune the model. What’s new is that we are now wrapping the PyTorch model in the LightningModel class and using the `Trainer` class to fit the model.

> Note that if we disable checkpointing and allow PyTorch to run in non-deterministic mode, we would get the same runtime as will plain PyTorch.

In [None]:
import time
import pandas as pd
import torchmetrics
import lightning as L
import matplotlib.pyplot as plt
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

## 5.1. Defining the Lightning Model Class

In [None]:
class LightningModel(L.LightningModule):
    def __init__(self, model, learning_rate):
        super().__init__()

        self.model = model
        self.learning_rate = learning_rate

        self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES)
        self.validation_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES)
        self.test_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES)

    def forward(self, input_ids, attention_mask, labels):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch_data, batch_index):
        outputs = self.model(batch_data["input_ids"],
                             attention_mask=batch_data["attention_mask"],
                             labels=batch_data["label"])

        self.log("train_loss", outputs["loss"])

        with torch.inference_mode():
            logits = outputs["logits"]
            predictions = torch.argmax(logits, dim=1)
            self.train_accuracy(predictions, batch_data["label"])
            self.log("train_accuracy", self.train_accuracy, on_epoch=True, on_step=False)

        return outputs["loss"]          ##  This is passed to the optimizer for training


    def validation_step(self, batch_data, batch_index):
        outputs = self.model(batch_data["input_ids"],
                             attention_mask=batch_data["attention_mask"],
                             labels=batch_data["label"])

        self.log("validation_loss", outputs["loss"], prog_bar=True)

        # with torch.inference_mode():
        logits = outputs["logits"]
        predictions = torch.argmax(logits, dim=1)
        self.validation_accuracy(predictions, batch_data["label"])
        self.log("validation_accuracy", self.validation_accuracy, prog_bar=True)


    def test_step(self, batch_data, batch_index):
        outputs = self.model(batch_data["input_ids"],
                             attention_mask=batch_data["attention_mask"],
                             labels=batch_data["label"])


        # with torch.inference_mode():
        logits = outputs["logits"]
        predictions = torch.argmax(logits, dim=1)
        self.test_accuracy(predictions, batch_data["label"])
        self.log("test_accuracy", self.test_accuracy, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.trainer.model.parameters(), lr=self.learning_rate
        )
        return optimizer

## 5.2. Finetuning the Model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_CLASSES)
lightning_model = LightningModel(model, learning_rate=0.001)

callbacks = [ModelCheckpoint(save_top_k=1, mode="max", monitor="validation_accuracy")]      # save top 1 model

logger = CSVLogger(save_dir="logs/", name="distil_bert_imdb_sentiment_classification")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = L.Trainer(max_epochs=3,
                    callbacks=callbacks,
                    accelerator="gpu",
                    devices=1,
                    logger=logger,
                    log_every_n_steps=10,
                    deterministic=True)         ## To ensure full reproducibility -sets a seed value

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
start = time.time()

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed / 60: .2f} mins")

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name                | Type                                | Params | Mode 
------------------------------------------------------------------------------------
0 | model               | DistilBertForSequenceClassification | 67.0 M | eval 
1 | train_accuracy      | MulticlassAccuracy                  | 0      | train
2 | validation_accuracy | MulticlassAccuracy                  | 0      | train
3 | test_accuracy       | MulticlassAccuracy                  | 0      | train
------------------------------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)
3         Modules in train mode
96        Modules in eval mode
INFO:lightning.pytorch.callbacks.model_summary:
  | Name                | Type                          

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=3` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Time elapsed  81.84 mins


## 5.3 Model Evaluation on Test Dataset

In [None]:
test_accuracy = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
print(f"Test Accuracy: {test_accuracy}%")

INFO: Restoring states from the checkpoint path at logs/distil_bert_imdb_sentiment_classification/version_4/checkpoints/epoch=0-step=2916.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Restoring states from the checkpoint path at logs/distil_bert_imdb_sentiment_classification/version_4/checkpoints/epoch=0-step=2916.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at logs/distil_bert_imdb_sentiment_classification/version_4/checkpoints/epoch=0-step=2916.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Loaded model weights from the checkpoint at logs/distil_bert_imdb_sentiment_classification/version_4/checkpoints/epoch=0-step=2916.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Test Accuracy: [{'test_accuracy': 0.5008003115653992}]%


In [None]:
with open(os.path.join(trainer.logger.log_dir, "outputs.txt"), "w") as fp:
    fp.write(f"Time Elapsed {elapsed / 60: .2f} min\n")
    fp.write(f"Test Accuracy: {test_accuracy}")

# 6. Using Automatic Mixed Precision Training

If our GPU supports mixed precision training, enabling it is often one of the main `ways to boost computational efficiency`. In particular, we use automatic mixed precision training, which `switches between 32-bit and 16-bit floating point representations during training` without sacrificing accuracy.

> FP32 weights are converted to FP16 weights for gradient calculation, then the computed F16 gradients are converted back to FP32 gradients for furthur optimization.

## 6.1 Defining the Trainer Object

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_CLASSES)
lightning_model = LightningModel(model, learning_rate=0.001)

callbacks = [ModelCheckpoint(save_top_k=1, mode="max", monitor="validation_accuracy")]      # save top 1 model

logger = CSVLogger(save_dir="logs/", name="distil_bert_imdb_sentiment_classification")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Using the Trainer class, we can enable automatic mixed precision training with one line of code

trainer = L.Trainer(max_epochs=3,
                    callbacks=callbacks,
                    logger=logger,
                    log_every_n_steps=10,
                    deterministic=True,
                    accelerator="gpu",
                    devices=1,
                    precision="16"                ## Applying AMP!!
                    )

/usr/local/lib/python3.11/dist-packages/lightning/fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


## 6.2 Finetuning the Model

In [None]:
start = time.time()

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed / 60: .2f} mins")

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name                | Type                                | Params | Mode 
------------------------------------------------------------------------------------
0 | model               | DistilBertForSequenceClassification | 67.0 M | eval 
1 | train_accuracy      | MulticlassAccuracy                  | 0      | train
2 | validation_accuracy | MulticlassAccuracy                  | 0      | train
3 | test_accuracy       | MulticlassAccuracy                  | 0      | train
------------------------------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)
3         Modules in train mode
96        Modules in eval mode
INFO:lightning.pytorch.callbacks.model_summary:
  | Name                | Type                          

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=3` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Time elapsed  22.94 mins


## 6.3 Model Evaluation on Test Dataset

In [None]:
test_accuracy = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
print(f"Test Accuracy: {test_accuracy}%")

INFO: Restoring states from the checkpoint path at logs/distil_bert_imdb_sentiment_classification/version_0/checkpoints/epoch=0-step=2916.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Restoring states from the checkpoint path at logs/distil_bert_imdb_sentiment_classification/version_0/checkpoints/epoch=0-step=2916.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: Loaded model weights from the checkpoint at logs/distil_bert_imdb_sentiment_classification/version_0/checkpoints/epoch=0-step=2916.ckpt
INFO:lightning.pytorch.utilities.rank_zero:Loaded model weights from the checkpoint at logs/distil_bert_imdb_sentiment_classification/version_0/checkpoints/epoch=0-step=2916.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Test Accuracy: [{'test_accuracy': 0.5008003115653992}]%


# 7. Using Static Graphs with Torch.Compile

`toch.compile()` function can speed up PyTorch code execution by generating `optimized static graphs` instead of running PyTorch code with dynamic graphs (the so-called eager mode). Under the hood, this is a 3-step process including graph acquisition, graph lowering, and graph compilation.

## 7.1 Applying Two Tricks

- Placing the compilation before the timing starts
- Priming the model with an example batch.

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=NUM_CLASSES)
model.to(torch.device("cuda:0"))
model = torch.compile(model)

In [None]:
for batch_index, batch_data in enumerate(train_loader):
    model.train()

    for data_type in ["input_ids", "attention_mask", "label"]:
        batch_data[data_type] = batch_data[data_type].to(DEVICE)        ## Moving "input_ids", "attention_mask", "label" to device

    break       ## Break dataloader loop. Just taking the 1st batch for priming the model

### FORWARD PASS
outputs = model(batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                labels=batch_data["label"])

## 7.2 Defining the Trainer Object

In [None]:
lightning_model = LightningModel(model)

callbacks = [ModelCheckpoint(save_top_k=1, mode="max", monitor="validation_accuracy")]      # save top 1 model

logger = CSVLogger(save_dir="logs/", name="distil_bert_imdb_sentiment_classification")

In [None]:
# There will be no change here

trainer = L.Trainer(max_epochs=3,
                    callbacks=callbacks,
                    logger=logger,
                    log_every_n_steps=10,
                    deterministic=True,
                    accelerator="gpu",
                    devices=1
                    precision="16"                ## Applying AMP!!
                    )

## 7.3 Finetuning the Model



In [None]:
start = time.time()

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed / 60: .2f} mins")

## 7.3 Model Evaluation on Test Dataset

In [None]:
test_accuracy = trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")
print(f"Test Accuracy: {test_accuracy}%")

# 8. Using Fabric with Plain Old PyTorch Code

In [None]:
from lightning import Fabric

In [None]:
def plot_logs(log_dir):
    metrics = pd.read_csv(op.join(log_dir, "metrics.csv"))

    aggreg_metrics = []
    agg_col = "epoch"
    for i, dfg in metrics.groupby(agg_col):
        agg = dict(dfg.mean())
        agg[agg_col] = i
        aggreg_metrics.append(agg)

    df_metrics = pd.DataFrame(aggreg_metrics)
    df_metrics[["train_loss", "val_loss"]].plot(
        grid=True, legend=True, xlabel="Epoch", ylabel="Loss"
    )
    plt.savefig(op.join(log_dir, "loss.pdf"))

    df_metrics[["train_acc", "val_acc"]].plot(
        grid=True, legend=True, xlabel="Epoch", ylabel="Accuracy"
    )
    plt.savefig(op.join(log_dir, "acc.pdf"))

## 8.2. Defining the Training Function with Fabric

In [None]:
def train(model, optimizer,
          train_loader, val_loader, fabric):

    for epoch in tqdm(range(NUM_EPOCHS)):
        train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES).to(fabric.device)

        for batch_index, batch_data in enumerate(train_loader):
            model.train()

            ### FORWARD PASS AND BACKPROP
            outputs = model(batch_data["input_ids"],
                           attention_mask=batch_data["attention_mask"],
                           labels=batch_data["label"])

            optimizer.zero_grad()
            fabric.backward(outputs["loss"])
            optimizer.step()

            ### LOGGING
            if not batch_index % 300:
                print(f"Epoch: {epoch + 1: 04d} / {NUM_EPOCHS: 04d} |"
                      f"Batch {batch_index: 04d} / {len(train_loader): 04d} |"
                      f"Loss: {outputs['loss']:.4f}"
                )

            model.eval()
            with torch.inference_mode():
                predictions = torch.argmax(outputs["logits"], dim=1)
                train_acc.update(predictions, batch_data["label"])


        # VALIDATION LOOP
        model.eval()
        with torch.inference_mode():
            val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES).to(fabric.device)

            for batch_data in val_loader:
                outputs = model(batch_data["input_ids"],
                           attention_mask=batch_data["attention_mask"],
                           labels=batch_data["label"])

                predictions = torch.argmax(outputs["logits"], dim=1)
                val_acc.update(predictions, batch_data["label"])


        print(f"Epoch: {epoch + 1: 04d} / {NUM_EPOCHS: 04d} |"
              f"Train Accuracy: {train_acc.compute() * 100: .2f}% |"
              f"Val Accuracy: {val_acc.compute() * 100: .2f}%")

## 8.3. Finetuning the Model

In [None]:
# fabric = Fabric(accelerator="cuda", devices=4, strategy="deepspeed_stage_2", precision="16-mixed")
fabric = Fabric(accelerator="cuda", devices=1, precision="16-mixed")
fabric.launch()

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, NUM_CLASSES)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
model, optimizer = fabric.setup(model, optimizer)

In [None]:
train_loader, val_loader, test_loader = fabric.setup_dataloaders(train_loader,
                                                                 val_loader,
                                                                 test_loader)

In [None]:
start = time.time()

train(model=model, optimizer=optimizer,
      train_loader=train_loader, val_loader=val_loader, fabric=fabric)

end = time.time()
elapsed = end - start
print(f"Time elapsed {elapsed / 60: .2f} mins")

## 8.4. Model Evaluation on Test Dataset

In [None]:
model.eval()
with torch.inference_mode():
    test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=NUM_CLASSES).to(fabric.device)

    for batch_data in test_loader:
        outputs = model(batch_data["input_ids"],
                    attention_mask=batch_data["attention_mask"],
                    labels=batch_data["label"])

        predictions = torch.argmax(outputs["logits"], dim=1)
        test_acc.update(predictions, batch_data["label"])

print(f"Test Accuracy: {test_acc.compute() * 100: .2f}%")