In [None]:
import pandas as pd
from transformers import RobertaTokenizer
from sklearn.model_selection import train_test_split

# Load the CSV file
def load_data(file_path):
    df = pd.read_csv(file_path)
    texts = df["text"].tolist()
    labels = df[["anger", "fear", "joy", "sadness", "surprise"]].values
    return texts, labels

# Load train and test data
train_file = "Emotion_data/public_data_test/track_a/train/eng.csv"
test_file = "Emotion_data/public_data_test/track_a/dev/eng.csv"

train_texts, train_labels = load_data(train_file)
test_texts, test_labels = load_data(test_file)

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the text data
def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

train_encodings = tokenize_texts(train_texts, tokenizer)
test_encodings = tokenize_texts(test_texts, tokenizer)


In [4]:
import torch
from torch.utils.data import Dataset

class EmotionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.float),
        }

# Create datasets
train_dataset = EmotionDataset(train_encodings, train_labels)
test_dataset = EmotionDataset(test_encodings, test_labels)


In [5]:
from transformers import RobertaModel, RobertaTokenizer
import torch

class RobertaClass(torch.nn.Module):
    def __init__(self, num_labels=5):
        super(RobertaClass, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, num_labels)  # 768 is the output dimension of Roberta

    def forward(self, input_ids, attention_mask):
        # Get the output from Roberta
        output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        
        # We only care about the [CLS] token output for classification (i.e., first token in the sequence)
        cls_output = output[0][:, 0, :]  # (batch_size, hidden_size) - [CLS] token representation
        
        # Apply dropout for regularization
        cls_output = self.dropout(cls_output)
        
        # Pass through the classifier to get the logits (raw output scores for each class)
        logits = self.classifier(cls_output)
        
        return logits


In [13]:
from torch.utils.data import DataLoader
from transformers import AdamW


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Example model initialization
model = RobertaClass(num_labels=5)  # Change num_labels to match your task
model.to(device)  # Ensure the model is on the correct device (CPU/GPU)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()  # BCEWithLogitsLoss for multi-label tasks
epochs = 5

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Example training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask)
        
        # Compute the loss
        loss = criterion(outputs, labels)  # Use BCEWithLogitsLoss for multi-label
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 0.5070696824547872
Epoch 2/5, Loss: 0.35576086564560155
Epoch 3/5, Loss: 0.2621441572215516
Epoch 4/5, Loss: 0.19024471710355295
Epoch 5/5, Loss: 0.14003596573292865


In [14]:
from sklearn.metrics import f1_score
import numpy as np

# Example for multi-label classification evaluation
model.eval()
all_preds = []
all_labels = []
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask)
        
        # Apply sigmoid and threshold to get predictions
        preds = torch.sigmoid(outputs) > 0.5  # For multi-label
        all_preds.append(preds.cpu().numpy())
        all_labels.append(labels.cpu().numpy())
    # Calculate average loss and accuracy
    test_loss /= len(test_loader)
    accuracy = correct / total

# Flatten lists and calculate F1 score
all_preds = np.vstack(all_preds)
all_labels = np.vstack(all_labels)

f1 = f1_score(all_labels, all_preds, average='micro')  # Change to 'macro' if needed
print(f"F1 Score: {f1}")


  return x.astype(dtype, copy=copy, casting=casting)
  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.