In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import f1_score
import numpy as np
from transformers import get_linear_schedule_with_warmup

# Select the compute device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the Dataset class
class GoEmotionsDataset(Dataset):
    """Dataset that pairs raw texts with labels and tokenizes on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` whose result
            exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it together with its label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` (the label as a ``torch.long`` tensor).
        """
        text, label = self.texts[idx], self.labels[idx]

        tokenizer_options = {
            'truncation': True,
            'padding': 'max_length',
            'max_length': self.max_len,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(text, **tokenizer_options)

        # Drop the leading size-1 dimension the tokenizer output carries
        # (e.g. `input_ids` of shape (1, N) becomes (N,)).
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# Define collate_fn for DataLoader to handle padding in a batch
def collate_fn(batch):
    """Merge a list of per-example dicts into one batched dict of tensors.

    Args:
        batch: Sequence of dicts, each with ``'input_ids'`` and
            ``'attention_mask'`` 1-D tensors and a ``'labels'`` entry
            (a tensor or a plain number). All ``'labels'`` entries must
            share the same shape.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` right-padded with
        zeros to the longest sequence in the batch (batch dimension first),
        and ``'labels'`` stacked along a new leading batch dimension.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # Pad sequences to the same length (max length in batch).
    # NOTE(review): padding_value=0 for input_ids assumes the tokenizer's pad
    # token id is 0 -- confirm if the tokenizer is ever swapped.
    padded_input_ids = pad(
        [item['input_ids'] for item in batch], batch_first=True, padding_value=0
    )
    padded_attention_mask = pad(
        [item['attention_mask'] for item in batch], batch_first=True, padding_value=0
    )

    # torch.stack keeps dtype and works for both scalar labels and label
    # vectors; torch.tensor(list_of_tensors) only accepts 0-d tensors and
    # goes through Python scalars. as_tensor lets plain numbers through too.
    stacked_labels = torch.stack([torch.as_tensor(item['labels']) for item in batch])

    return {
        'input_ids': padded_input_ids,
        'attention_mask': padded_attention_mask,
        'labels': stacked_labels
    }

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=28)  # Adjust num_labels based on GoEmotions dataset
model = model.to(device)  # `device` is selected once, near the top of the file

# Example data (replace with your GoEmotions dataset)
texts = ["I feel great!", "I am so sad."]
labels = [1, 0]  # Example labels (0 = sad, 1 = happy, etc.)

# Split dataset into train and validation sets (adjust accordingly)
# NOTE: train and validation currently alias the same placeholder data, so
# the validation metrics below say nothing about generalization.
train_texts = texts
train_labels = labels
val_texts = texts
val_labels = labels

# Parameters
max_len = 64
batch_size = 8
# NOTE(review): EPOCHS is not read anywhere in this cell; the training loop
# and the scheduler use `epochs` (defined below). Kept in case other cells
# rely on it -- confirm and remove one of the two.
EPOCHS = 1000

# Create datasets and dataloaders
train_dataset = GoEmotionsDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = GoEmotionsDataset(val_texts, val_labels, tokenizer, max_len)

# Shuffle only the training data so each epoch sees a different batch order;
# validation stays in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Optimizer and loss function
# NOTE(review): the AdamW class exported by transformers is deprecated
# upstream in favour of torch.optim.AdamW -- consider migrating.
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=True)  # Lower learning rate
# train_epoch/eval_model take a `criterion` argument, so it must exist before
# the training loop runs. The model computes its own loss from `labels`
# (outputs.loss), so this object only satisfies those signatures.
criterion = torch.nn.CrossEntropyLoss()
epochs = 3

# Training function
def train_epoch(model, data_loader, optimizer, criterion, device, scheduler=None):
    """Run one optimization pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``.loss`` tensor.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Unused; the loss comes from ``outputs.loss``. Kept so
            existing call sites keep working.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Optional learning-rate scheduler. When given, it is stepped
            once after every optimizer step (i.e. once per batch).

    Returns:
        Mean of the per-batch losses as a float, or ``0.0`` when
        ``data_loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in data_loader:
        # Move the data to the correct device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            # Step after optimizer.step() so the schedule advances per update.
            scheduler.step()

        num_batches += 1

    # Count batches instead of calling len(): avoids ZeroDivisionError on an
    # empty loader and works for iterables that do not define __len__.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Evaluation function
def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Sized iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Not used by this function; the loss is read from the
            model output.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, f1)``: the summed batch losses divided by
        ``len(data_loader)``, and the weighted F1 score of the argmax
        predictions against the labels.
    """
    model.eval()
    running_loss = 0
    predicted_classes, true_classes = [], []

    with torch.no_grad():
        for batch in data_loader:
            # Transfer every tensor of the batch to the target device
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            result = model(ids, attention_mask=mask, labels=targets)
            running_loss += result.loss.item()

            # Predicted class = index of the largest logit on the last axis
            batch_predictions = torch.argmax(result.logits, dim=-1)

            # Bring results back to the CPU for the F1 computation
            predicted_classes.extend(batch_predictions.cpu().numpy())
            true_classes.extend(targets.cpu().numpy())

    weighted_f1 = f1_score(true_classes, predicted_classes, average='weighted')
    return running_loss / len(data_loader), weighted_f1

# Training loop with scheduler
# The scheduler is stepped once per epoch at the bottom of the loop, so its
# horizon must be the number of epochs. Using len(train_loader) * epochs with
# per-epoch stepping would leave the linear decay mostly unapplied whenever
# an epoch has more than one batch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # No warmup steps
    num_training_steps=epochs
)

for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_f1 = eval_model(model, val_loader, criterion, device)

    print(f"Epoch {epoch+1}: Train Loss: {train_loss}, Val Loss: {val_loss}, Val F1: {val_f1}")

    # Step the scheduler (once per epoch, after that epoch's optimizer steps)
    scheduler.step()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss: 4.002641201019287, Val Loss: 3.41208553314209, Val F1: 0.0
Epoch 2: Train Loss: 3.3801236152648926, Val Loss: 3.2364683151245117, Val F1: 0.0
Epoch 3: Train Loss: 3.1905314922332764, Val Loss: 3.1388280391693115, Val F1: 0.0
