In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load dataset
# Reads the sexism-classification CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv")

# Encode labels as integers
# Each distinct value of the "label_vector" column is mapped to an integer id stored
# in "label_id"; label_encoder.classes_ is read later to size the classifier head.
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Load tokenizer
# Uses the same "bert-base-uncased" checkpoint name as the models built further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result must expose ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        Keys are ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
        (the latter as a ``torch.long`` scalar).
        """
        tokenize_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenize_kwargs)

        # squeeze(0) removes the leading axis when it has size 1, so each
        # sample is returned without a per-item batch dimension.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Training function
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and report the mean loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` (and ``.loss`` when ``labels`` is
            passed).
        dataloader: Iterable of batches holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss callable applied as ``criterion(logits, labels)``.
            Pass ``None`` to use the loss the model computes itself from
            ``labels``.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(mean_batch_loss, accuracy)``. The accuracy is computed from
        the predictions made during the training forward passes, i.e. with
        the model in train mode and before each batch's weight update.
    """
    model.train()
    total_loss = 0.0
    predictions, true_labels = [], []

    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        if criterion is None:
            # No explicit loss supplied: let the model derive it from labels.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        else:
            # Honor the caller's loss instead of silently ignoring it.
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        # Get predictions (argmax output carries no gradient, so numpy() is safe)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Calculate accuracy
    train_accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), train_accuracy

# Evaluation function
def evaluate(model, dataloader, device):
    """Return the accuracy of ``model`` over all batches of ``dataloader``.

    The model is put in eval mode and gradient tracking is disabled while
    predicting. Each batch must hold ``"input_ids"``, ``"attention_mask"``
    and ``"labels"`` tensors, which are moved to ``device``.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            logits = model(ids, attention_mask=mask).logits
            # Highest-scoring class per row is the prediction.
            batch_preds = torch.argmax(logits, dim=1)

            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return accuracy_score(expected, predicted)

# K-Fold Cross Validation Implementation
def _new_classifier(num_labels):
    """Build a fresh ``bert-base-uncased`` classifier with the dropout used for every fold."""
    return BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels,
        hidden_dropout_prob=0.3,
        attention_probs_dropout_prob=0.3
    )


def _snapshot_state(model):
    """Return a CPU copy of the model weights that later training steps cannot alter."""
    return {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }


def k_fold_cross_validation(df, num_folds=5, epochs=5, batch_size=16, learning_rate=1e-5, weight_decay=0.01, patience=3):
    """Fine-tune a new BERT classifier on every fold and keep the best checkpoint.

    Args:
        df: DataFrame with a ``"text"`` column and an integer ``"label_id"``
            column.
        num_folds: Number of K-fold splits (shuffled, ``random_state=42``).
        epochs: Maximum number of epochs per fold; must be at least 1.
        batch_size: Batch size of the training and validation loaders.
        learning_rate: Learning rate passed to AdamW.
        weight_decay: Weight decay passed to AdamW.
        patience: Number of consecutive epochs without a new fold-best
            validation accuracy after which the fold stops early.

    Returns:
        Tuple ``(final_model, best_val_acc)``: a model loaded with the
        weights of the single best epoch across all folds, and that epoch's
        validation accuracy.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.

    NOTE(review): ``best_val_acc`` is the maximum over every epoch of every
    fold, measured on the data used for selection, so it is an optimistic
    figure; the printed average validation accuracy is the cross-validation
    estimate.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    # Prepare data
    texts = df["text"].tolist()
    labels = df["label_id"].tolist()
    # Computed once up front because it is also needed after the fold loop.
    num_labels = len(label_encoder.classes_)

    # Initialize k-fold
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Track metrics across folds
    fold_train_losses = []
    fold_train_accs = []
    fold_val_accs = []

    # Best checkpoint seen across all folds and epochs
    best_val_acc = 0
    best_model_state = None

    # Run K-fold cross validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(texts)):
        print(f"\nFold {fold+1}/{num_folds}")

        # Split data
        fold_train_texts = [texts[i] for i in train_idx]
        fold_train_labels = [labels[i] for i in train_idx]
        fold_val_texts = [texts[i] for i in val_idx]
        fold_val_labels = [labels[i] for i in val_idx]

        # Create datasets and dataloaders
        train_dataset = TextDataset(fold_train_texts, fold_train_labels, tokenizer)
        val_dataset = TextDataset(fold_val_texts, fold_val_labels, tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model for this fold
        model = _new_classifier(num_labels)
        model.to(device)

        # Optimizer
        optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        criterion = torch.nn.CrossEntropyLoss()

        # Training loop for this fold
        fold_best_val_acc = 0
        no_improve_count = 0

        for epoch in range(epochs):
            train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
            val_acc = evaluate(model, val_loader, device)

            print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

            # Track best model overall. The first evaluated epoch is always
            # recorded so a state exists even if accuracy never exceeds 0.
            if best_model_state is None or val_acc > best_val_acc:
                best_val_acc = val_acc
                # state_dict() tensors alias the live parameters, so they
                # must be cloned or later epochs would overwrite the snapshot.
                best_model_state = _snapshot_state(model)

            # Check for improvement within this fold
            if val_acc > fold_best_val_acc:
                fold_best_val_acc = val_acc
                no_improve_count = 0
            else:
                no_improve_count += 1

            # Early stopping check
            if no_improve_count >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

        # Record metrics for this fold (last-epoch train metrics, best val accuracy)
        fold_train_losses.append(train_loss)
        fold_train_accs.append(train_acc)
        fold_val_accs.append(fold_best_val_acc)

        # Drop references to this fold's model and optimizer state before
        # the next fold allocates its own.
        del model, optimizer

    # Print cross-validation results
    print("\nCross-Validation Results:")
    print(f"Average Train Loss: {np.mean(fold_train_losses):.4f}")
    print(f"Average Train Accuracy: {np.mean(fold_train_accs):.4f}")
    print(f"Average Validation Accuracy: {np.mean(fold_val_accs):.4f}")
    print(f"Best Validation Accuracy: {best_val_acc:.4f}")

    # Load best model
    final_model = _new_classifier(num_labels)
    final_model.load_state_dict(best_model_state)
    final_model.to(device)

    return final_model, best_val_acc

# Run k-fold cross validation
# Hyperparameters for this run; each one is passed by keyword to
# k_fold_cross_validation below.
num_folds = 5  # Number of cross-validation folds; you can adjust this
epochs_per_fold = 5  # Fewer epochs per fold to manage computational resources
batch_size = 16
learning_rate = 5e-6  # Lower than the function's default of 1e-5
weight_decay = 0.01

# Returns the model loaded with the best checkpoint and its validation accuracy.
best_model, best_acc = k_fold_cross_validation(
    df,
    num_folds=num_folds,
    epochs=epochs_per_fold,
    batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay
)

print(f"\nTraining complete! Best validation accuracy: {best_acc:.4f}")

# Save the best model's weights (state_dict only) to the working directory
model_save_path = "best_bert_model.pt"
torch.save(best_model.state_dict(), model_save_path)
print(f"Best model saved to {model_save_path}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using device: cuda

Fold 1/5


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 2.2038, Train Acc: 0.2037, Val Acc: 0.2987
Epoch 2/5, Loss: 1.9841, Train Acc: 0.3013, Val Acc: 0.3296
Epoch 3/5, Loss: 1.8723, Train Acc: 0.3309, Val Acc: 0.3841
Epoch 4/5, Loss: 1.7702, Train Acc: 0.3701, Val Acc: 0.4099
Epoch 5/5, Loss: 1.6892, Train Acc: 0.3966, Val Acc: 0.4408

Fold 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 2.1953, Train Acc: 0.2024, Val Acc: 0.2626
Epoch 2/5, Loss: 1.9865, Train Acc: 0.2895, Val Acc: 0.3234
Epoch 3/5, Loss: 1.8406, Train Acc: 0.3407, Val Acc: 0.3811
Epoch 4/5, Loss: 1.7149, Train Acc: 0.3804, Val Acc: 0.3893
Epoch 5/5, Loss: 1.6185, Train Acc: 0.4182, Val Acc: 0.4089

Fold 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 2.2057, Train Acc: 0.2063, Val Acc: 0.2719
Epoch 2/5, Loss: 2.0312, Train Acc: 0.2704, Val Acc: 0.3223
Epoch 3/5, Loss: 1.8897, Train Acc: 0.3240, Val Acc: 0.3821
Epoch 4/5, Loss: 1.7675, Train Acc: 0.3685, Val Acc: 0.4222
Epoch 5/5, Loss: 1.6654, Train Acc: 0.4133, Val Acc: 0.4501

Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 2.2025, Train Acc: 0.1926, Val Acc: 0.2667
Epoch 4/5, Loss: 1.7636, Train Acc: 0.3546, Val Acc: 0.3563
Epoch 5/5, Loss: 1.6563, Train Acc: 0.3953, Val Acc: 0.4295

Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 2.1856, Train Acc: 0.2070, Val Acc: 0.2309
Epoch 2/5, Loss: 2.0089, Train Acc: 0.2752, Val Acc: 0.2979
Epoch 3/5, Loss: 1.8796, Train Acc: 0.3296, Val Acc: 0.3165
Epoch 4/5, Loss: 1.7624, Train Acc: 0.3610, Val Acc: 0.3557
Epoch 5/5, Loss: 1.6622, Train Acc: 0.4032, Val Acc: 0.4144

Cross-Validation Results:
Average Train Loss: 1.6583
Average Train Accuracy: 0.4053
Average Validation Accuracy: 0.4287
Best Validation Accuracy: 0.4501


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training complete! Best validation accuracy: 0.4501
Best model saved to best_bert_model.pt


In this experiment we tried k-fold cross-validation, but it also did not improve the accuracy. We therefore turn to dimensionality-reduction techniques, since they might reduce the overfitting and give us better accuracy.
So we implemented PCA for dimensionality reduction and then trained a feed-forward neural network (FFNN) architecture on the reduced features.
