In [1]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)
     ---------------------------------------- 0.0/126.8 kB ? eta -:--:--
     --------- ----------------------------- 30.7/126.8 kB 1.4 MB/s eta 0:00:01
     ------------------------------------ - 122.9/126.8 kB 1.2 MB/s eta 0:00:01
     -------------------------------------- 126.8/126.8 kB 1.2 MB/s eta 0:00:00
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.10.3-cp39-cp39-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 42.0/42.0 kB 2.0 MB/s eta 0:00:00
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp39-none-win_amd64.w



In [1]:

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

from transformers import AutoModelForSequenceClassification
import pandas as pd
import torch

from tools import download_dataset, load_dataset_into_to_dataframe, partition_dataset, IMDBDataset

## Load dataset

In [2]:
download_dataset()

data_frame = load_dataset_into_to_dataframe()
partition_dataset(data_frame)

100%|██████████| 50000/50000 [00:47<00:00, 1061.50it/s]


Class distribution:


In [3]:
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

## Tokenization

In [4]:
imdb_ds = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "val.csv",
        "test": "test.csv",
    },
)

print(imdb_ds)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


In [5]:
from transformers import AutoTokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(f"Tokenizer input max length: {tokenizer.model_max_length}")
print(f"Tokenizer vocabulary size: {tokenizer.vocab_size}")

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [7]:
def tokenize_text(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

In [10]:
imdb_tokenized = imdb_ds.map(tokenize_text, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
del imdb_ds

In [12]:
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [13]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Setup Dataloaders

In [14]:
train_ds = IMDBDataset(imdb_tokenized, partition_key="train")
val_ds = IMDBDataset(imdb_tokenized, partition_key="validation")
test_ds = IMDBDataset(imdb_tokenized, partition_key="test")

In [15]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=12,
    shuffle=True,
    num_workers=4,
)

val_loader = DataLoader(
    dataset=val_ds,
    batch_size=12,
    num_workers=4,
)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=12,
    num_workers=4,
)

## Initialize DistilBERT

In [15]:


model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
for param in model.parameters():
    param.requires_grad = False

model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [17]:
# unfreeze 2 last model layers (pre_classifier layer and classifier layer)

for param in model.pre_classifier.parameters():
    param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

## Finetuning DistilBERT

In [17]:
import torchmetrics

class LightningModel(L.LightningModule):
    def __init__(self, model, learning_rate=3e-5):
        super().__init__()
        self._learning_rate = learning_rate
        self._model = model

        self._val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self._test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels):
        return self._model(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]

    def validation_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
        self.log("val_loss", outputs["loss"], prog_bar=True)
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self._val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self._val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["label"])
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self._test_acc(predicted_labels, batch["label"])
        self.log("accuracy", self._test_acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self._learning_rate)
        return optimizer


In [None]:

lt_model = LightningModel(model)

In [18]:
callbacks = [
    ModelCheckpoint(
        save_top_k=1, mode="max", monitor="val_acc"
    )
]
logger = CSVLogger(save_dir="logs/", name="lt_model")

In [None]:
trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="auto",
    precision="16-mixed",
    devices="auto",
    logger=logger,
    log_every_n_steps=10,
)

trainer.fit(model=lt_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader,
            )

In [21]:
trainer.test(lt_model, dataloaders=train_loader, ckpt_path="best")

Restoring states from the checkpoint path at logs/lt_model\version_1\checkpoints\epoch=1-step=5834.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/lt_model\version_1\checkpoints\epoch=1-step=5834.ckpt
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:492: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.8615142703056335}]

In [22]:
trainer.test(lt_model, dataloaders=val_loader, ckpt_path="best")

Restoring states from the checkpoint path at logs/lt_model\version_1\checkpoints\epoch=1-step=5834.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/lt_model\version_1\checkpoints\epoch=1-step=5834.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.8669999837875366}]

In [23]:
trainer.test(lt_model, dataloaders=test_loader, ckpt_path="best")

Restoring states from the checkpoint path at logs/lt_model\version_1\checkpoints\epoch=1-step=5834.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/lt_model\version_1\checkpoints\epoch=1-step=5834.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.8616999983787537}]

## Finetune all layers (better do it on gpu)

In [19]:
all_layers_finetune_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
lt_model2 = LightningModel(all_layers_finetune_model)

trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="auto",
    precision="16-mixed",
    devices="auto",
    logger=logger,
    log_every_n_steps=10,
)

trainer.fit(model=lt_model2,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader,
            )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                                | Params
------------------------------------------------------------------
0 | _model    | DistilBertForSequenceClassification | 67.0 M
1 | _val_acc  | MulticlassAccuracy                  | 0     
2 | _test_acc | MulticlassAccuracy                  | 0     
------------------------------------------------------------------
67.0 M    Trainable

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


In [21]:
trainer.test(lt_model2, dataloaders=train_loader, ckpt_path="best")

Restoring states from the checkpoint path at logs/lt_model\version_4\checkpoints\epoch=2-step=8751.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/lt_model\version_4\checkpoints\epoch=2-step=8751.ckpt
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:492: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'test_dataloader' to speed up the dataloader worker initialization.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.9942285418510437}]

In [22]:
trainer.test(lt_model2, dataloaders=val_loader, ckpt_path="best")

Restoring states from the checkpoint path at logs/lt_model\version_4\checkpoints\epoch=2-step=8751.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/lt_model\version_4\checkpoints\epoch=2-step=8751.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.9336000084877014}]

In [23]:
trainer.test(lt_model2, dataloaders=test_loader, ckpt_path="best")

Restoring states from the checkpoint path at logs/lt_model\version_4\checkpoints\epoch=2-step=8751.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/lt_model\version_4\checkpoints\epoch=2-step=8751.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'accuracy': 0.9312999844551086}]