# Comparing BERT Models

In [None]:
!pip install toxy-bot==0.1.30

Collecting toxy-bot==0.1.30
  Downloading toxy_bot-0.1.30-py3-none-any.whl.metadata (2.6 kB)
Collecting comet-ml>=3.49.7 (from toxy-bot==0.1.30)
  Downloading comet_ml-3.49.7-py3-none-any.whl.metadata (4.1 kB)
Collecting datasets>=3.5.0 (from toxy-bot==0.1.30)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dotenv>=0.9.9 (from toxy-bot==0.1.30)
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting jsonargparse>=4.38.0 (from toxy-bot==0.1.30)
  Downloading jsonargparse-4.38.0-py3-none-any.whl.metadata (12 kB)
Collecting lightning>=2.5.1 (from toxy-bot==0.1.30)
  Downloading lightning-2.5.1-py3-none-any.whl.metadata (39 kB)
Collecting plotly>=6.0.1 (from toxy-bot==0.1.30)
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting pydantic>=2.11.2 (from toxy-bot==0.1.30)
  Downloading pydantic-2.11.2-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m2.4 M

In [None]:
import comet_ml

import torch
from google.colab import userdata

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import CometLogger

import os

from toxy_bot.ml.datamodule import AutoTokenizerDataModule
from toxy_bot.ml.module import SequenceClassificationModule
from toxy_bot.ml.utils import create_dirs, create_experiment_name
from toxy_bot.ml.config import Config, DataModuleConfig, ModuleConfig, TrainerConfig

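First, let's configure some basic settings: the model and dataset hyperparameters, the working directories, and the validation/logging cadence.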

In [None]:
# Model and dataset settings, read from the project's config classes.
lr = ModuleConfig.learning_rate

dataset_name, train_size, train_split = (
    DataModuleConfig.dataset_name,
    DataModuleConfig.train_size,
    DataModuleConfig.train_split,
)
batch_size, max_length = (
    DataModuleConfig.batch_size,
    DataModuleConfig.max_length,
)

# Echo the chosen values so they are recorded with the notebook output.
# The empty string yields the blank line between the model and data settings.
print(
    f"Learning rate: {lr}",
    "",
    f"Dataset: {dataset_name}",
    f"Train size: {train_size}",
    f"Batch size: {batch_size}",
    f"Max length: {max_length}",
    sep="\n",
)

Learning rate: 3e-05

Dataset: anitamaxvim/jigsaw-toxic-comments
Train size: 0.85
Batch size: 64
Max length: 256


In [None]:
# Working directories, all rooted at the current working directory.
this_nb = os.getcwd()

cache_dir, log_dir, ckpt_dir = (
    os.path.join(this_nb, subdir) for subdir in ("data", "logs", "checkpoints")
)
# Performance logs live in a subfolder of the log directory.
perf_dir = os.path.join(log_dir, "perf")

print(
    f"Cache dir: {cache_dir}",
    f"Log dir: {log_dir}",
    f"Checkpoint dir: {ckpt_dir}",
    f"Perf dir: {perf_dir}",
    sep="\n",
)

# Select PyTorch's "medium" float32 matmul precision setting.
torch.set_float32_matmul_precision(precision="medium")

# Hand every directory used below to the project helper that creates them.
create_dirs([cache_dir, log_dir, ckpt_dir, perf_dir])

Cache dir: /content/data
Log dir: /content/logs
Checkpoint dir: /content/checkpoints
Perf dir: /content/logs/perf


In [None]:
# Validation and logging cadence, read from the project's trainer config.
check_val_every_n_epoch, val_check_interval, log_every_n_steps = (
    TrainerConfig.check_val_every_n_epoch,
    TrainerConfig.val_check_interval,
    TrainerConfig.log_every_n_steps,
)

# Echo the values that are passed to the Trainer further down.
print(
    f"Check val every n epoch: {check_val_every_n_epoch}",
    f"Val check interval: {val_check_interval}",
    f"Log every n steps: {log_every_n_steps}",
    sep="\n",
)

Check val every n epoch: 1
Val check interval: None
Log every n steps: 200


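Now we can fine-tune each BERT variant in turn. For every model we define a LightningDataModule (which the Trainer uses for its DataLoaders), the classification module, a checkpoint callback, and a Comet logger, and then run the Trainer.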

In [None]:
# Pretrained BERT checkpoints to compare, ordered from smallest to largest.
models = [
    "google/bert_uncased_L-2_H-128_A-2", # BertTiny (4M)
    "google/bert_uncased_L-8_H-512_A-8", # BertMedium (42M)
    "google/bert_uncased_L-12_H-768_A-12", # BertBase (108M)
]

# The API key is the same for every run, so read it from Colab's secrets once.
comet_api_key = userdata.get("COMET_API_KEY")

# Train every checkpoint with the same data, hyperparameters and trainer
# settings, logging each run as its own experiment in one Comet project.
for model in models:

    # The tokenizer follows the model, so the datamodule is rebuilt per model.
    lit_datamodule = AutoTokenizerDataModule(
        model_name=model,
        dataset_name=dataset_name,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_length=max_length,
        train_split=train_split,
        train_size=train_size,
    )

    lit_model = SequenceClassificationModule(
        model_name=model,
        learning_rate=lr,
    )

    # NOTE(review): `model` contains a "/", so the checkpoint presumably lands
    # in a "google/" subfolder of ckpt_dir -- confirm before loading it later.
    callbacks = [
        ModelCheckpoint(dirpath=ckpt_dir, filename=model),
    ]

    comet_logger = CometLogger(
        api_key=comet_api_key,
        project_name="toxy-bot-compare-models",
        workspace="anitamaxvim",
        name=model,
    )

    try:
        lit_trainer = pl.Trainer(
            accelerator="auto",
            devices="auto",
            strategy="auto",
            precision="16-mixed",
            max_epochs=5,
            logger=comet_logger,
            callbacks=callbacks,
            val_check_interval=val_check_interval,
            log_every_n_steps=log_every_n_steps,
            check_val_every_n_epoch=check_val_every_n_epoch,
        )

        lit_trainer.fit(model=lit_model, datamodule=lit_datamodule)
    finally:
        # Always close the Comet experiment, even if training fails or is
        # interrupted, so it is not left open when the next run starts.
        comet_logger.experiment.end()

config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/anitamaxvim/toxy-bot-compare-models/1ba40a045b0e49abae8cdf0776828ae9

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: Seed set to 42
INFO:lightning.fabric

README.md:   0%|          | 0.00/4.80k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/41.1M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

balanced_train.parquet:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/159571 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/63978 [00:00<?, ? examples/s]

Generating balanced_train split:   0%|          | 0/54083 [00:00<?, ? examples/s]

Map:   0%|          | 0/45970 [00:00<?, ? examples/s]

Map:   0%|          | 0/8113 [00:00<?, ? examples/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name                | Type                          | Params | Mode 
------------------------------------------------------------------------------
0 | model               | BertForSequenceClassification | 4.4 M  | eval 
1 | accuracy            | MultilabelAccuracy            | 0      | train
2 | f1_score            | MultilabelF1Score             | 0      | train
3 | precision           | MultilabelPrecision           | 0      | train
4 | recall              | MultilabelRecall              | 0      | train
5 | macro_avg_accuracy  | MultilabelAccuracy            | 0      | train
6 | macro_avg_f1_score  | MultilabelF1Score             | 0      | train
7 | macro_avg_precision | MultilabelPrecision           | 0      | train
8 | macro_avg_recall    | MultilabelRecall              | 0      | train
-----------------------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : google/bert_uncased_L-2_H-128_A-2
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/anitamaxvim/toxy-bot-compare-models/1ba40a045b0e49abae8cdf0776828ae9
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [360]         : (5170.59228515625, 44697.44921875)
[1;38;5;39mCOMET INFO:[0m     train_loss [17]    : (0.05861299857497215, 0.4067456126213074)
[1;38;5;39mCOMET INFO:[0m     val_loss [5]      

config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-8_H-512_A-8 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/anitamaxvim/toxy-bot-compare-models/af5092832c4640ccafe2cce92b1877fb

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: Seed set to 42
INFO:lightning.fabric.utilities.seed:Seed set to 42
INFO:pytorch_lightning.utilities.rank_zero:[2025-04-07 16:35:53.866805] Dataset anitamaxvim/jigsaw-toxic-comments exists in cache. Loading from cache.


Map:   0%|          | 0/45970 [00:00<?, ? examples/s]

Map:   0%|          | 0/8113 [00:00<?, ? examples/s]

/usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /content/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name                | Type                          | Params | Mode 
------------------------------------------------------------------------------
0 | model               | BertForSequenceClassification | 41.4 M | eval 
1 | accuracy            | MultilabelAccuracy            | 0      | train
2 | f1_score            | MultilabelF1Score             | 0      | train
3 | precision           | MultilabelPrecision           | 0      | train
4 | recall              | MultilabelRecall              | 0      | train
5 | macro_avg_accuracy  | MultilabelAccuracy            | 0      | train
6 | macro_avg_f1_score  | MultilabelF1Score             | 0      | train
7 | macro_avg_precision | MultilabelPre

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : google/bert_uncased_L-8_H-512_A-8
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/anitamaxvim/toxy-bot-compare-models/af5092832c4640ccafe2cce92b1877fb
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [360]         : (701.4452514648438, 51131.0703125)
[1;38;5;39mCOMET INFO:[0m     train_loss [17]    : (0.01760307140648365, 0.1241147369146347)
[1;38;5;39mCOMET INFO:[0m     val_loss [5]      

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/anitamaxvim/toxy-bot-compare-models/b15f99aa2581441892f336dd12cf4c91

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: Seed set to 42
INFO:lightning.fabric.utilities.seed:Seed set to 42
INFO:pytorch_lightning.utilities.rank_zero:[2025-04-07 16:51:16.381696] Dataset anitamaxvim/jigsaw-toxic-comments exists in cache. Loading from cache.


Map:   0%|          | 0/45970 [00:00<?, ? examples/s]

Map:   0%|          | 0/8113 [00:00<?, ? examples/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name                | Type                          | Params | Mode 
------------------------------------------------------------------------------
0 | model               | BertForSequenceClassification | 109 M  | eval 
1 | accuracy            | MultilabelAccuracy            | 0      | train
2 | f1_score            | MultilabelF1Score             | 0      | train
3 | precision           | MultilabelPrecision           | 0      | train
4 | recall              | MultilabelRecall              | 0      | train
5 | macro_avg_accuracy  | MultilabelAccuracy            | 0      | train
6 | macro_avg_f1_score  | MultilabelF1Score             | 0      | train
7 | macro_avg_precision | MultilabelPrecision           | 0      | train
8 | macro_avg_recall    | MultilabelRecall              | 0      | train
-----------------------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Full list of candidate checkpoints, kept for reference (not executed):
# models = [
    # "google/bert_uncased_L-2_H-128_A-2", # BertTiny (4M)
    # "google/bert_uncased_L-4_H-256_A-4", # BertMini (11M)
    # "google/bert_uncased_L-4_H-512_A-8", # BertSmall (29M)
    # "google/bert_uncased_L-8_H-512_A-8", # BertMedium (42M)
    # "google/bert_uncased_L-12_H-768_A-12", # BertBase (108M)
# ]
