### **Fine-tunning BERT Model for the the comments classification task**
According to the given requirements of the assignment, we took the BERT model and fine tune it for the Toxic comments classification.

In [1]:
#installing the necessory libraries
#Hugging Face Transformers is an open source library that allow us to interact and use the state-of-the art LLM models such as BERT, GPT, DistliBERT, etc.
!pip install transformers
!pip install pytorch-lightning #this library used as to traineer the model effeciently

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.2.1-py3-none-any.whl (801 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m801.6/801.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.3.2-py3-none-any.whl (841 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime

In [2]:
#importing all necessary libraries
import pandas as pd #for loading .csv files
import numpy as np  #using for multi dimensional arraies

from tqdm.auto import tqdm #To show the progress of the model

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Loading and display the first few rows of the toxic_comments dataset from the local direcotry that is already downloaded
#This is toxic comments classification dataset avaliable on Kaggle.
df = pd.read_csv("/content/drive/MyDrive/LLM-Assignment/toxic_comments.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
#splitting the whole data into training and test part in order to train and validate the model performance
train_df, val_df = train_test_split(df, test_size=0.05)
train_df.shape, val_df.shape

((151592, 8), (7979, 8))

In [6]:
#Taking the last six classes for classification which toxic, severe_toxic,	obscene,	threat,	insult, and	identity_hate
LABEL_COLUMNS = df.columns.tolist()[2:]

In [7]:
#Displaying the number of toxic and clean comments for the purpose to analyze the dataset
train_toxic = train_df[train_df[LABEL_COLUMNS].sum(axis=1) > 0]
train_clean = train_df[train_df[LABEL_COLUMNS].sum(axis=1) == 0]
print(train_toxic.shape)
print(train_clean.shape)

(15454, 8)
(136138, 8)


In [8]:
#It is clearly seen from the above value for taxic and clean comments that the number of clean sample is larger than that toxic.
#Therefore, we randomly sample 15000 rows from train_clean set and then concatenate it with toxic one
#We did it because there very large number of samples in the clean set which oftenly crashed the colab when we start the training process.
train_df = pd.concat([
  train_toxic,
  train_clean.sample(15000)
])

train_df.shape, val_df.shape

((30454, 8), (7979, 8))

In [9]:
#Perform tokenizing through BertTokenizer pre-trained tokenizer
#it do to tokenization for bert-based-cased model
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
#Defining the max-number of token that can be input to the Bert.
#We set the Max token size to 512 if any input exceeds this token count, it will be either truncated or will be split into smaller parts
MAX_TOKEN_COUNT = 512

In [11]:
#Inhereting PyTorch Dataset for handling toxic comments data accordingly.
class ToxicCommentsDataset(Dataset):
  def __init__(self, data: pd.DataFrame, tokenizer: BertTokenizer, max_token_len: int = 128):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    comment_text = data_row.comment_text
    labels = data_row[LABEL_COLUMNS]

    encoding = self.tokenizer.encode_plus(
      comment_text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      comment_text=comment_text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels=torch.FloatTensor(labels)
    )

In [12]:
#Initlizing BERT using the pretrained method provided by the BERT Class
#the return_dic is set to true in order to get the output in the form of dictionary instead of tuple
#The initized BERT model is then store in bert_model variable which will be fine-tuned for the purpose of taxic comments classifications
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [13]:
#Derving ToxicCommentDataModule from pytorch lightning in order to prepare dataset for training, and validation
class ToxicCommentDataModule(pl.LightningDataModule):
  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
    super().__init__()
    self.batch_size = batch_size
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    self.train_dataset = ToxicCommentsDataset(
      self.train_df,
      self.tokenizer,
      self.max_token_len
    )

    self.test_dataset = ToxicCommentsDataset(
      self.test_df,
      self.tokenizer,
      self.max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
      self.train_dataset,
      batch_size=self.batch_size,
      shuffle=True,
      num_workers=2
    )

  def val_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=2
    )
  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=self.batch_size,
      num_workers=2
    )

In [31]:
#Derving toxicCommentTagger from pytorch-lighting for the purpose of training, validating and testing the BERT model.
#Actually this facility can be provided by pytorch but the main problem with that is that oftenly crashed the colab beacuse of consuming to much resources (it happened in my case I don't know about others)
class ToxicCommentTagger(pl.LightningModule):

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def on_training_epoch_end(self, outputs):

    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)


  def configure_optimizers(self):

    optimizer = AdamW(self.parameters(), lr=2e-5)

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

In [15]:
# defining basic parameter for the model training
N_EPOCHS = 2
BATCH_SIZE = 8

#initializing the ToxicCommentDataModule by passing the relevant arguments
data_module = ToxicCommentDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

In [16]:
#Calculating the necessary parameters for the model training
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 5
warmup_steps, total_training_steps

(1522, 7612)

In [17]:
#instantiating the ToxicCommentTagger Class with necessary parameters to make it prepare for training
model = ToxicCommentTagger(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

In [18]:
#Defining a callback that save the best model during training based on the validation loss.
checkpoint_callback = ModelCheckpoint(
  dirpath="/content/drive/MyDrive/LLM-Assignment/checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [19]:
#Configuring the pytorch-lightning trainer object with basic parameters necessory for our model training
#this can also be done with pytorch trainer however that consume a lot of resource and therefore oftnely crashed the colab
trainer = pl.Trainer(
  max_epochs=N_EPOCHS,
  callbacks=[checkpoint_callback],
  devices='auto',
  accelerator = 'auto',
  enable_progress_bar=True
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [20]:
#initializing the training process by passing the model (an object of the ToxicCommentTagger Class) to the to the trainer.fit function of pytroch-lightning
trainer.fit(model, data_module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 108 M 
1 | classifier | Linear    | 4.6 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.260   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 3807: 'val_loss' reached 0.04987 (best 0.04987), saving model to '/content/drive/MyDrive/LLM-Assignment/checkpoints/best-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 7614: 'val_loss' reached 0.04620 (best 0.04620), saving model to '/content/drive/MyDrive/LLM-Assignment/checkpoints/best-checkpoint.ckpt' as top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


In [36]:
#loading the best model from the checkpionts
trained_model = ToxicCommentTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=len(LABEL_COLUMNS)
)
trained_model.eval()
trained_model.freeze()
trained_model = trained_model.to('cpu')

In [41]:
#inferencing the model performance on our given comments
toxic_comment = "Hello man, what the hill you are doing. You're such an idiot"
clean_comment = 'Hello this product is not good but I still like it.'

def comment_classification(comments):
  encoding = tokenizer.encode_plus(
      comments,
      add_special_tokens=True,
      max_length=512,
      return_token_type_ids=False,
      padding="max_length",
      return_attention_mask=True,
      return_tensors='pt',
    )
  _, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
  test_prediction = test_prediction.flatten().cpu().numpy()
  return test_prediction

#calling the function for toxic_comment
print("This is the classification of toxic comment\n")
for label, prediction in zip(LABEL_COLUMNS, comment_classification(toxic_comment)):
  print(f"{label}: {prediction}")

#calling the function for clean_comment
print("--------------*******************************************--------------------")
print("This is the classification of clean comment\n")
for label, prediction in zip(LABEL_COLUMNS, comment_classification(clean_comment)):
  print(f"{label}: {prediction}")

This is the classification of toxic comment

toxic: 0.9229974746704102
severe_toxic: 0.012365893460810184
obscene: 0.3287990987300873
threat: 0.0016648900927975774
insult: 0.8973225355148315
identity_hate: 0.00928434543311596
--------------*******************************************--------------------
This is the classification of clean comment

toxic: 0.0028382539749145508
severe_toxic: 0.0023031204473227262
obscene: 0.0017732540145516396
threat: 0.001856049057096243
insult: 0.002518836408853531
identity_hate: 0.0018867546459659934
