# Comparing the original dataset vs balanced dataset

In [10]:
!pip install toxy-bot==0.1.27



In [11]:
import comet_ml

import torch
from google.colab import userdata

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import CometLogger

import os

from toxy_bot.ml.datamodule import AutoTokenizerDataModule
from toxy_bot.ml.module import SequenceClassificationModule
from toxy_bot.ml.utils import create_dirs, create_experiment_name
from toxy_bot.ml.config import Config, DataModuleConfig, ModuleConfig, TrainerConfig

First, let's configure some basic settings: the model and dataset, the working directories, and the validation/logging cadence.

In [12]:
# Model and optimiser settings
model_name = "google/bert_uncased_L-2_H-128_A-2"  # BertTiny (4M)
lr = ModuleConfig.learning_rate

# Dataset settings, read from DataModuleConfig
dataset_name, train_size, batch_size, max_length = (
    DataModuleConfig.dataset_name,
    DataModuleConfig.train_size,
    DataModuleConfig.batch_size,
    DataModuleConfig.max_length,
)

# Echo the resolved settings so they are recorded in the cell output.
# The empty string yields the blank separator line between the two groups.
print(
    f"Model: {model_name}",
    f"Learning rate: {lr}",
    "",
    f"Dataset: {dataset_name}",
    f"Train size: {train_size}",
    f"Batch size: {batch_size}",
    f"Max length: {max_length}",
    sep="\n",
)

Model: google/bert_uncased_L-2_H-128_A-2
Learning rate: 3e-05

Dataset: anitamaxvim/jigsaw-toxic-comments
Train size: 0.85
Batch size: 64
Max length: 256


In [13]:
# Paths: every artifact directory lives under the current working directory
this_nb = os.getcwd()

cache_dir, log_dir, ckpt_dir = (
    os.path.join(this_nb, subdir) for subdir in ("data", "logs", "checkpoints")
)
perf_dir = os.path.join(log_dir, "perf")  # nested inside the log directory

print(
    f"Cache dir: {cache_dir}",
    f"Log dir: {log_dir}",
    f"Checkpoint dir: {ckpt_dir}",
    f"Perf dir: {perf_dir}",
    sep="\n",
)

# Select PyTorch's "medium" float32 matmul precision setting
torch.set_float32_matmul_precision("medium")

# Make sure every directory listed above exists before training starts
create_dirs([cache_dir, log_dir, ckpt_dir, perf_dir])

Cache dir: /content/data
Log dir: /content/logs
Checkpoint dir: /content/checkpoints
Perf dir: /content/logs/perf


In [14]:
# Validation cadence and logging frequency, read from TrainerConfig
check_val_every_n_epoch, val_check_interval, log_every_n_steps = (
    TrainerConfig.check_val_every_n_epoch,
    TrainerConfig.val_check_interval,
    TrainerConfig.log_every_n_steps,
)

# Record the values that will be handed to the Trainer
print(
    f"Check val every n epoch: {check_val_every_n_epoch}",
    f"Val check interval: {val_check_interval}",
    f"Log every n steps: {log_every_n_steps}",
    sep="\n",
)

Check val every n epoch: 1
Val check interval: 0.25
Log every n steps: 200


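Now we train one model per training split (`train` and `balanced_train`). For each split we build a LightningDataModule, which the Trainer uses for its DataLoaders, together with a fresh model, a checkpoint callback, and a Comet logger so the two runs can be compared.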

In [None]:
# Read the Comet API key once from Colab's secret store; it is the same for
# every run, so there is no need to look it up on each iteration.
comet_api_key = userdata.get("COMET_API_KEY")

# Train one model per training split. Everything except `train_split` is held
# fixed so the two Comet experiments differ only in the data they were fit on.
for train_split in ["train", "balanced_train"]:

    # Data: identical settings for both runs; only the training split changes.
    lit_datamodule = AutoTokenizerDataModule(
        model_name=model_name,
        dataset_name=dataset_name,
        cache_dir=cache_dir,
        batch_size=batch_size,
        max_length=max_length,
        train_split=train_split,
        train_size=train_size,
    )

    # Model: a new instance per run so no weights carry over between runs.
    lit_model = SequenceClassificationModule(
        model_name=model_name,
        learning_rate=lr,
    )

    # The checkpoint file is named after the split it was trained on.
    callbacks = [
        ModelCheckpoint(dirpath=ckpt_dir, filename=train_split),
    ]

    # One Comet experiment per split, named after the split.
    comet_logger = CometLogger(
        api_key=comet_api_key,
        project_name="toxy-bot-compare-datasets",
        workspace="anitamaxvim",
        name=train_split,
    )

    try:
        lit_trainer = pl.Trainer(
            accelerator="auto",
            devices="auto",
            strategy="auto",
            precision="16-mixed",
            max_epochs=3,
            logger=comet_logger,
            callbacks=callbacks,
            val_check_interval=val_check_interval,
            log_every_n_steps=log_every_n_steps,
            check_val_every_n_epoch=check_val_every_n_epoch,
        )

        lit_trainer.fit(model=lit_model, datamodule=lit_datamodule)
    finally:
        # Close the Comet experiment even if training fails or is interrupted,
        # so a broken run does not stay open while the next one starts.
        comet_logger.experiment.end()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/anitamaxvim/toxy-bot-compare-datasets/1052613cd9ef4a12a61823442522b8b8

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: Seed set to 42
INFO:lightning.fabr

Map:   0%|          | 0/135635 [00:00<?, ? examples/s]

In [None]:
# models = [
    # "google/bert_uncased_L-2_H-128_A-2", # BertTiny (4M)
    # "google/bert_uncased_L-4_H-256_A-4", # BertMini (?)
    # "google/bert_uncased_L-4_H-512_A-8", # BertSmall (29M)
    # "google/bert_uncased_L-8_H-512_A-8", # BertMedium (42M)
    # "google/bert_uncased_L-12_H-768_A-12", # BertBase (108M)
# ]
