In this notebook we fine-tune a RoBERTa model for the classification task:

In [1]:
# Install the third-party packages used by this notebook (IPython shell escape).
!pip install transformers datasets torch scikit-learn pandas



In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the labelled corpus from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv"
)

# Give every distinct `label_vector` value an integer class id. The fitted
# encoder is kept because its `classes_` attribute is read again further down.
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Seeded hold-out split: 20% of the rows become the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].to_list(),
    df["label_id"].to_list(),
    random_state=42,
    test_size=0.2,
)

# Tokenizer matching the "roberta-base" checkpoint (RoBERTa rather than BERT).
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` (the
    tokenizer output with its first dimension squeezed away) and ``labels``
    (a ``torch.long`` tensor built from the label at the same index).

    Args:
        texts: Indexable collection of texts.
        labels: Class ids aligned index-for-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors=...)``.
        max_len: Length passed to the tokenizer as ``max_length``; texts are
            truncated and padded to it.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One example per text.
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # Drop the first dimension of each encoded tensor so that a single
        # example is returned rather than a batch of one.
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split in a TextDataset so examples are tokenized on access.
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Batches of 16; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

# RoBERTa (rather than BERT) with one output class per encoded label.
num_labels = len(label_encoder.classes_)
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Optimizer and Loss
# Use PyTorch's own AdamW: the `AdamW` class exported by `transformers` is
# deprecated in favour of it (and is no longer shipped by recent releases).
# `eps` and `weight_decay` are pinned to the defaults of the transformers
# implementation (1e-6 and 0.0) so the optimizer stays close to what this
# notebook was trained with; PyTorch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)
# Cross-entropy over the class logits; passed to train() in the training loop.
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and report the mean batch loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels=`` is
            passed as well).
        dataloader: Iterable of batches, each a dict with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer for ``model``; stepped once per batch.
        criterion: Callable ``criterion(logits, labels)`` returning the loss
            to back-propagate. Pass ``None`` to use the loss the model
            computes itself when it is given the labels.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the average of the per-batch losses
        and the fraction of examples whose arg-max prediction equals the
        label, both as Python floats.

    Raises:
        ValueError: If ``dataloader`` yields no examples.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    num_correct = 0
    num_examples = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        if criterion is None:
            # No external loss supplied: let the model compute its own.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            # Honour the caller's loss function. It used to be accepted but
            # ignored in favour of the model's built-in loss.
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Training accuracy from the logits of this forward pass; counted with
        # tensor ops so no per-example Python lists are accumulated.
        preds = outputs.logits.argmax(dim=1)
        num_correct += (preds == labels).sum().item()
        num_examples += labels.size(0)

    if num_batches == 0 or num_examples == 0:
        # Fail with a clear message instead of a ZeroDivisionError.
        raise ValueError("train() received an empty dataloader")

    return total_loss / num_batches, num_correct / num_examples

# Evaluation function
def evaluate(model, dataloader, device):
    """Return the accuracy of ``model`` over ``dataloader``.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled for the whole pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: Iterable of batches, each a dict with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        Fraction of examples whose arg-max prediction equals the label, as a
        Python float.

    Raises:
        ValueError: If ``dataloader`` yields no examples.
    """
    model.eval()
    num_correct = 0
    num_examples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Labels are only compared on the host, so they are not copied to
            # `device` and back as before.
            labels = batch["labels"].cpu()

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1).cpu()

            num_correct += (preds == labels).sum().item()
            num_examples += labels.size(0)

    if num_examples == 0:
        # Accuracy is undefined without examples; say so explicitly.
        raise ValueError("evaluate() received an empty dataloader")

    return num_correct / num_examples

# Training loop
# Fine-tune for a fixed number of epochs and score the validation split after
# every epoch.
epochs = 10
for epoch in range(epochs):
    # One pass over train_loader; train() returns (mean batch loss, training accuracy).
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    # Accuracy on the held-out validation batches.
    val_acc = evaluate(model, val_loader, device)
    # NOTE(review): in the logged run below, training accuracy climbs to ~0.96
    # while validation accuracy stays around 0.49-0.54 from the first epoch
    # (best 0.5427 at epoch 3); the later epochs only add overfitting.
    # Consider early stopping or keeping the best-scoring checkpoint.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda




Epoch 1/10, Loss: 1.7562, Train Accuracy: 0.3549, Val Accuracy: 0.4892
Epoch 2/10, Loss: 1.3002, Train Accuracy: 0.5297, Val Accuracy: 0.5304
Epoch 3/10, Loss: 1.0166, Train Accuracy: 0.6443, Val Accuracy: 0.5427
Epoch 4/10, Loss: 0.7273, Train Accuracy: 0.7517, Val Accuracy: 0.5273
Epoch 5/10, Loss: 0.5124, Train Accuracy: 0.8357, Val Accuracy: 0.4943
Epoch 6/10, Loss: 0.3340, Train Accuracy: 0.8996, Val Accuracy: 0.5304
Epoch 7/10, Loss: 0.2501, Train Accuracy: 0.9227, Val Accuracy: 0.5335
Epoch 8/10, Loss: 0.1521, Train Accuracy: 0.9598, Val Accuracy: 0.5273
Epoch 9/10, Loss: 0.1302, Train Accuracy: 0.9652, Val Accuracy: 0.5118
Epoch 10/10, Loss: 0.1182, Train Accuracy: 0.9642, Val Accuracy: 0.5314


Here we ran into the same overfitting problem as with our own BERT model (training accuracy reaches about 96% while validation accuracy stays around 53%), but this time we do not repeat the process we went through before.
Instead, we now try to understand the dataset itself, because the different techniques we have tried so far did not improve accuracy.
So we create new embeddings based on the dataset and try to improve accuracy by combining the BERT embeddings with these new embeddings.
We now concatenate the new embeddings with the BERT embeddings, so that we have the features of both, and then train DNN models on them to perform the classification task.
I first build TF-IDF-based embeddings, then concatenate them with our BERT embeddings, and then run a feed-forward neural network (FFNN) architecture on the combined embeddings.