# Imports

In [1]:
import pandas as pd

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn import Sigmoid
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from tqdm import tqdm

# Set up 

The goal of this notebook is to find the optimal learning rate for the architecture used in **Model 3** (DistilBERT for multi-label classification with focal loss). The architecture is kept the same; only the learning rate is varied.

Given that training for 5 epochs takes around 8 hours, it's important to optimize the learning rate efficiently without conducting full-length training cycles. To achieve this, we will evaluate 6 different learning rates by training the model for a reduced number of epochs (3 epochs per learning rate). The goal is to identify which learning rate produces the best performance in terms of average loss.

# Model Four

## Model Architecture

In [35]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Sequence-classification model configured for six independent (multi-label) outputs.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=6,
    problem_type="multi_label_classification",
)

class MultiLabelDistilBERT(nn.Module):
    """Wrap a classification model so that its forward pass yields sigmoid outputs.

    Args:
        base_model: Module that is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object
            exposing a ``logits`` attribute.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.sigmoid = Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Run the wrapped model and apply an element-wise sigmoid to its logits.

        Args:
            input_ids: Forwarded unchanged to the wrapped model.
            attention_mask: Forwarded unchanged to the wrapped model
                (``None`` by default).

        Returns:
            The sigmoid of the wrapped model's ``logits``.
        """
        raw_scores = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        return self.sigmoid(raw_scores)

# Rebind `model` to the sigmoid-output wrapper around the pretrained classifier.
model = MultiLabelDistilBERT(base_model=model)

# Show the module tree of the wrapped model.
print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MultiLabelDistilBERT(
  (base_model): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [36]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the wrapped model onto the selected device (last expression, so the cell displays it).
model.to(device)

MultiLabelDistilBERT(
  (base_model): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

## Training Process

In [38]:
class ToxicCommentsDataset(Dataset):
    """Dataset that tokenizes one comment per item and pairs it with its six labels.

    Args:
        dataframe: Frame providing a ``cleaned_comment_text`` column and the
            label columns ``toxic``, ``severe_toxic``, ``obscene``, ``threat``,
            ``insult`` and ``identity_hate``.
        tokenizer: Callable invoked with the comment text plus ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments; its result is indexed by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['cleaned_comment_text']
        self.labels = dataframe[label_columns].values
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Tokenize the comment at positional ``index`` and return it with its labels.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (tokenizer outputs
            with their leading dimension squeezed away) and ``labels`` (a
            float tensor built from the label row).
        """
        encoded = self.tokenizer(
            str(self.comment_text.iloc[index]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer is asked for "pt" tensors for a single text; drop dim 0.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[index], dtype=torch.float),
        }

In [39]:
# NOTE(review): `train_df` is not defined in any cell visible in this notebook —
# confirm where it is loaded so the notebook survives Restart & Run All.
train_dataset = ToxicCommentsDataset(dataframe=train_df, tokenizer=tokenizer, max_len=128)

# Shuffled mini-batches of 16 examples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)

In [40]:
class FocalLoss(nn.Module):
    """Mean focal loss computed from probabilities and binary targets.

    Each element's binary cross-entropy is scaled by
    ``alpha * (1 - pt) ** gamma`` where ``pt = exp(-BCE)``.

    Args:
        alpha: Multiplicative weight applied to every element (default 1).
        gamma: Exponent of the ``(1 - pt)`` modulating factor (default 2).
    """

    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss over all elements.

        Args:
            inputs: Probabilities, passed to ``F.binary_cross_entropy``.
            targets: Target tensor passed alongside ``inputs``.
        """
        per_element_bce = F.binary_cross_entropy(inputs, targets, reduction='none')
        # exp(-BCE) recovers pt, the probability term the cross-entropy took the log of.
        prob_true = (-per_element_bce).exp()
        modulating = (1 - prob_true).pow(self.gamma)
        return (self.alpha * modulating * per_element_bce).mean()

In [42]:
# Learning-rate sweep: for every candidate rate, fine-tune a fresh model for a
# few epochs and record the average training loss of each epoch.
criterion = FocalLoss()

learning_rates = [1e-6, 1e-5, 1e-4, 5e-4, 1e-3, 5e-3]

num_epochs = 3

# Re-applied before every run so the candidates are compared from the same
# random state rather than from whatever the previous run left behind.
seed = 42

results_path = 'learning_rate_results.csv'
result_columns = ['Learning Rate', 'Epoch', 'Avg Loss']

# One dict per (learning rate, epoch). The DataFrame is built from this list
# instead of being grown with pd.concat inside the loop.
records = []

for lr in learning_rates:
    print(f"\nTesting learning rate: {lr}")

    # Seed before the model is created: the classifier head is newly
    # initialized on load and the DataLoader shuffles, both of which are random.
    torch.manual_seed(seed)

    # Fresh model per learning rate so no weights carry over between runs.
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        problem_type="multi_label_classification",
        num_labels=6
    )
    model = MultiLabelDistilBERT(model)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_loss = 0.0
        model.train()

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].float().to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Mean of the per-batch losses over the epoch.
        avg_loss = total_loss / len(train_loader)
        print(f"Learning Rate: {lr}, Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        records.append({
            'Learning Rate': lr,
            'Epoch': epoch+1,
            'Avg Loss': avg_loss
        })
        # The full sweep runs for many hours (see the timing note at the end of
        # this cell); persist after every epoch so an interruption does not
        # discard the results already obtained.
        pd.DataFrame(records, columns=result_columns).to_csv(results_path, index=False)

# Explicit columns keep the header even if no learning rates were tested.
results_df = pd.DataFrame(records, columns=result_columns)
results_df.to_csv(results_path, index=False)

print("\nLearning Rate Testing Complete!")
print(results_df)
#2749m 11.0s


Testing learning rate: 1e-06


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 9974/9974 [2:28:21<00:00,  1.12it/s]  


Learning Rate: 1e-06, Epoch 1/3, Average Loss: 1.2834


Training Epoch 2/3: 100%|██████████| 9974/9974 [2:27:00<00:00,  1.13it/s]  


Learning Rate: 1e-06, Epoch 2/3, Average Loss: 0.4868


Training Epoch 3/3: 100%|██████████| 9974/9974 [2:29:25<00:00,  1.11it/s]  


Learning Rate: 1e-06, Epoch 3/3, Average Loss: 0.4066

Testing learning rate: 1e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 9974/9974 [2:29:56<00:00,  1.11it/s]  


Learning Rate: 1e-05, Epoch 1/3, Average Loss: 0.5185


Training Epoch 2/3: 100%|██████████| 9974/9974 [2:30:05<00:00,  1.11it/s]  


Learning Rate: 1e-05, Epoch 2/3, Average Loss: 0.3200


Training Epoch 3/3: 100%|██████████| 9974/9974 [2:29:52<00:00,  1.11it/s]  


Learning Rate: 1e-05, Epoch 3/3, Average Loss: 0.2621

Testing learning rate: 0.0001


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 9974/9974 [2:29:02<00:00,  1.12it/s]  


Learning Rate: 0.0001, Epoch 1/3, Average Loss: 0.7394


Training Epoch 2/3: 100%|██████████| 9974/9974 [2:29:19<00:00,  1.11it/s]  


Learning Rate: 0.0001, Epoch 2/3, Average Loss: 0.9385


Training Epoch 3/3: 100%|██████████| 9974/9974 [2:35:11<00:00,  1.07it/s]  


Learning Rate: 0.0001, Epoch 3/3, Average Loss: 1.0982

Testing learning rate: 0.0005


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 9974/9974 [2:39:00<00:00,  1.05it/s]  


Learning Rate: 0.0005, Epoch 1/3, Average Loss: 1.1334


Training Epoch 2/3: 100%|██████████| 9974/9974 [2:42:09<00:00,  1.03it/s]  


Learning Rate: 0.0005, Epoch 2/3, Average Loss: 1.1205


Training Epoch 3/3: 100%|██████████| 9974/9974 [2:38:34<00:00,  1.05it/s]  


Learning Rate: 0.0005, Epoch 3/3, Average Loss: 1.1174

Testing learning rate: 0.001


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 9974/9974 [2:33:01<00:00,  1.09it/s]  


Learning Rate: 0.001, Epoch 1/3, Average Loss: 1.1424


Training Epoch 2/3: 100%|██████████| 9974/9974 [2:33:18<00:00,  1.08it/s]  


Learning Rate: 0.001, Epoch 2/3, Average Loss: 1.1312


Training Epoch 3/3: 100%|██████████| 9974/9974 [2:33:30<00:00,  1.08it/s]  


Learning Rate: 0.001, Epoch 3/3, Average Loss: 1.1239

Testing learning rate: 0.005


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 9974/9974 [2:35:04<00:00,  1.07it/s]  


Learning Rate: 0.005, Epoch 1/3, Average Loss: 1.1903


Training Epoch 2/3: 100%|██████████| 9974/9974 [2:30:17<00:00,  1.11it/s]  


Learning Rate: 0.005, Epoch 2/3, Average Loss: 1.1140


Training Epoch 3/3: 100%|██████████| 9974/9974 [2:35:55<00:00,  1.07it/s]  

Learning Rate: 0.005, Epoch 3/3, Average Loss: 1.0886

Learning Rate Testing Complete!
    Learning Rate Epoch  Avg Loss
0        0.000001     1  1.283355
1        0.000001     2  0.486803
2        0.000001     3  0.406599
3        0.000010     1  0.518489
4        0.000010     2  0.319968
5        0.000010     3  0.262056
6        0.000100     1  0.739433
7        0.000100     2  0.938457
8        0.000100     3  1.098237
9        0.000500     1  1.133377
10       0.000500     2  1.120493
11       0.000500     3  1.117374
12       0.001000     1  1.142442
13       0.001000     2  1.131222
14       0.001000     3  1.123862
15       0.005000     1  1.190259
16       0.005000     2  1.113955
17       0.005000     3  1.088575





### Results for the model with `lr=5e-5`:
- **Epoch 1**: Average Loss = 0.0129
- **Epoch 2**: Average Loss = 0.0099
- **Epoch 3**: Average Loss = 0.0082
- **Epoch 4**: Average Loss = 0.0070
- **Epoch 5**: Average Loss = 0.0059

### Results from your learning rate testing:

| Learning Rate | Epoch 1 Avg Loss | Epoch 2 Avg Loss | Epoch 3 Avg Loss |
|---------------|------------------|------------------|------------------|
| `1e-6`        | 1.283355         | 0.486803         | 0.406599         |
| `1e-5`        | 0.518489         | 0.319968         | 0.262056         |
| `1e-4`        | 0.739433         | 0.938457         | 1.098237         |
| `5e-4`        | 1.133377         | 1.120493         | 1.117374         |
| `1e-3`        | 1.142442         | 1.131222         | 1.123862         |
| `5e-3`        | 1.190259         | 1.113955         | 1.088575         |

### Comparison:

- The model with `lr=5e-5` shows **significantly lower loss** compared to the learning rate tests, especially when compared to `1e-6` and `1e-5`. 
- The lowest loss achieved during testing was for **`1e-5`** with a loss of **0.262056** in epoch 3. However, with `5e-5`, your model achieved an even lower loss of **0.0059** in epoch 5, which indicates a much better performance.

The model trained with **`lr=5e-5`** provided significantly better results (lower loss) than any of the learning rates you tested earlier. Given the excellent results across all epochs, it seems that this learning rate is highly effective for this model.

# Conclusion 

Summary of Learning Rate Findings:

In the previous experiments, I conducted a search for the optimal learning rate by testing multiple values: `[1e-6, 1e-5, 1e-4, 5e-4, 1e-3, 5e-3]`. These learning rates were tested across 3 epochs each, and the average loss for each learning rate and epoch was tracked to determine the best-performing rate. The goal was to identify a learning rate that minimizes the loss and improves model performance.

However, when comparing these results with **Model 3**, which was trained as a benchmark with a learning rate of **5e-5**, the findings showed that the model trained with **5e-5** consistently achieved significantly lower losses across all epochs compared to the tested learning rates.

# Recommendations for Improvement

   - **Model Improvements:** 
     - Using more advanced models 
     - Hyperparameter tuning
     - Implementing different preprocessing techniques
     - Using ensemble methods
   - **Data Augmentation:** Apply data augmentation techniques that could help with class imbalance or improve the model’s generalization.