In [1]:
# this code works but does it really work?
!pip install transformers datasets torch scikit-learn pandas



In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# Load dataset
df = pd.read_csv("/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv")

# Encode labels as integers
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label_id"].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# PART 1: FINE-TUNE BERT FIRST (as in original code)
# Dataset class for BERT fine-tuning
class BertDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets and dataloaders for BERT fine-tuning
bert_train_dataset = BertDataset(train_texts, train_labels, tokenizer)
bert_val_dataset = BertDataset(val_texts, val_labels, tokenizer)
bert_train_loader = DataLoader(bert_train_dataset, batch_size=16, shuffle=True)
bert_val_loader = DataLoader(bert_val_dataset, batch_size=16, shuffle=False)

# Load BERT model for fine-tuning
num_labels = len(label_encoder.classes_)
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
bert_model.to(device)

# Optimizer and Loss
optimizer = AdamW(bert_model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training function for BERT
def train_bert(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["labels"].to(device)
        )
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Evaluation function for BERT
def evaluate_bert(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["labels"].to(device)
            )
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions), predictions, true_labels

# BERT fine-tuning loop
print("Step 1: Fine-tuning BERT for 3 epochs")
bert_epochs = 3
for epoch in range(bert_epochs):
    train_loss = train_bert(bert_model, bert_train_loader, optimizer, criterion, device)
    val_acc, _, _ = evaluate_bert(bert_model, bert_val_loader, device)
    print(f"BERT Fine-tuning - Epoch {epoch+1}/{bert_epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_acc:.4f}")

# Save the fine-tuned BERT model
torch.save(bert_model.state_dict(), "fine_tuned_bert.pt")

# PART 2: CREATE SVD EMBEDDINGS
# Create co-occurrence matrix and SVD embeddings
def create_svd_embeddings(texts, svd_dim=300):
    # Create co-occurrence matrix using CountVectorizer
    vectorizer = CountVectorizer(max_features=10000)
    count_matrix = vectorizer.fit_transform(texts)
    
    # Apply SVD for dimensionality reduction
    svd = TruncatedSVD(n_components=svd_dim, random_state=42)
    svd_embeddings = svd.fit_transform(count_matrix)
    
    return vectorizer, svd, svd_embeddings

# Create SVD embeddings with higher dimensions (300)
svd_dim = 300
print(f"Step 2: Creating SVD embeddings with dimension {svd_dim}")
vectorizer, svd_model, train_svd_embeddings = create_svd_embeddings(train_texts, svd_dim)
val_count_matrix = vectorizer.transform(val_texts)
val_svd_embeddings = svd_model.transform(val_count_matrix)

# PART 3: COMBINED MODEL WITH CNN
# Extract pre-trained BERT model without classification head
bert_base = BertModel.from_pretrained("bert-base-uncased")

# Copy weights from fine-tuned BERT to the base model
bert_base.load_state_dict({k.replace('bert.', ''): v for k, v in bert_model.state_dict().items() 
                          if 'bert.' in k}, strict=False)

# Dataset class for combined model
class CombinedDataset(Dataset):
    def __init__(self, texts, labels, svd_embeddings, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.svd_embeddings = svd_embeddings
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "svd_embeddings": torch.tensor(self.svd_embeddings[idx], dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# CNN model that applies convolutions after concatenating BERT and SVD embeddings
class BertSvdCnnModel(nn.Module):
    def __init__(self, bert_model, num_labels, svd_dim=300):
        super(BertSvdCnnModel, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        
        # Freeze BERT parameters to use pre-fine-tuned weights
        for param in self.bert.parameters():
            param.requires_grad = False
        
        # Dimensions
        self.bert_dim = 768
        self.svd_dim = svd_dim
        self.combined_dim = self.bert_dim + self.svd_dim
        
        # CNN layers for classification
        # 1D convolutions with different kernel sizes
        self.conv1 = nn.Conv1d(1, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(256, 128, kernel_size=3, padding=1)
        
        # Pooling
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        
        # Calculate the output size after convolutions and pooling
        # For combined_dim of 1068 (768 + 300):
        # After conv1: 1068 -> 1068 (same padding)
        # After pool: 1068 -> 534
        # After conv2: 534 -> 534 (same padding)
        # After pool: 534 -> 267
        # After conv3: 267 -> 267 (same padding)
        # After pool: 267 -> 133
        final_cnn_output_size = 128 * 133
        
        # Linear layers
        self.fc1 = nn.Linear(final_cnn_output_size, 256)
        self.fc2 = nn.Linear(256, num_labels)
        
    def forward(self, input_ids, attention_mask, svd_embeddings):
        # Get BERT embeddings for the CLS token
        with torch.no_grad():  # Don't compute gradients for BERT
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            bert_cls_emb = bert_outputs.last_hidden_state[:, 0, :]  # [batch_size, bert_dim]
        
        # Concatenate BERT CLS embedding with SVD embeddings
        combined_emb = torch.cat((bert_cls_emb, svd_embeddings), dim=1)  # [batch_size, combined_dim]
        
        # Reshape for CNN (batch_size, channels, sequence_length)
        combined_emb = combined_emb.unsqueeze(1)  # [batch_size, 1, combined_dim]
        
        # Apply CNN layers
        x = self.pool(torch.relu(self.conv1(combined_emb)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Apply linear layers
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        logits = self.fc2(x)
        
        return logits

# Create datasets and dataloaders for combined model
combined_train_dataset = CombinedDataset(train_texts, train_labels, train_svd_embeddings, tokenizer)
combined_val_dataset = CombinedDataset(val_texts, val_labels, val_svd_embeddings, tokenizer)
combined_train_loader = DataLoader(combined_train_dataset, batch_size=16, shuffle=True)
combined_val_loader = DataLoader(combined_val_dataset, batch_size=16, shuffle=False)

# Initialize the combined model
combined_model = BertSvdCnnModel(bert_base, num_labels, svd_dim=svd_dim)
combined_model.to(device)

# Optimizer for combined model
combined_optimizer = AdamW(combined_model.parameters(), lr=2e-5)
combined_criterion = nn.CrossEntropyLoss()

# Training function for combined model
def train_combined(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        svd_embeddings = batch["svd_embeddings"].to(device)
        labels = batch["labels"].to(device)
        
        logits = model(input_ids, attention_mask, svd_embeddings)
        loss = criterion(logits, labels)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    return total_loss / len(dataloader)

# Evaluation function for combined model
def evaluate_combined(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            svd_embeddings = batch["svd_embeddings"].to(device)
            labels = batch["labels"].to(device)
            
            logits = model(input_ids, attention_mask, svd_embeddings)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
            
    return accuracy_score(true_labels, predictions), predictions, true_labels

# Training loop for combined model
print("Step 3: Training the CNN-based combined model")
combined_epochs = 3
for epoch in range(combined_epochs):
    train_loss = train_combined(combined_model, combined_train_loader, combined_optimizer, combined_criterion, device)
    val_acc, _, _ = evaluate_combined(combined_model, combined_val_loader, device)
    print(f"Combined Model - Epoch {epoch+1}/{combined_epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_acc:.4f}")

# Save the combined model
torch.save(combined_model.state_dict(), "bert_svd_cnn_model.pt")

# Evaluate the combined model on the test set and generate detailed metrics
print("\nStep 4: Evaluating the combined model on the test set")
test_acc, test_predictions, test_true_labels = evaluate_combined(combined_model, combined_val_loader, device)
print(f"Test Accuracy: {test_acc:.4f}")

# Print detailed classification report
class_names = label_encoder.classes_
print("\nClassification Report:")
print(classification_report(test_true_labels, test_predictions, target_names=class_names))

# Convert predictions to original labels
test_pred_labels = label_encoder.inverse_transform(test_predictions)
test_true_orig_labels = label_encoder.inverse_transform(test_true_labels)

# Create a dataframe with test results
test_results = pd.DataFrame({
    'Text': [val_texts[i] for i in range(len(val_texts))],
    'True Label': test_true_orig_labels,
    'Predicted Label': test_pred_labels,
    'Correct': test_true_orig_labels == test_pred_labels
})

# Save test results to CSV
test_results.to_csv('test_results.csv', index=False)
print("Test results saved to 'test_results.csv'")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda




Step 1: Fine-tuning BERT for 3 epochs
BERT Fine-tuning - Epoch 1/3, Loss: 1.8581, Val Accuracy: 0.4459
BERT Fine-tuning - Epoch 2/3, Loss: 1.3827, Val Accuracy: 0.5366
BERT Fine-tuning - Epoch 3/3, Loss: 0.9850, Val Accuracy: 0.5355
Step 2: Creating SVD embeddings with dimension 300




Step 3: Training the CNN-based combined model
Combined Model - Epoch 1/3, Loss: 1.6608, Val Accuracy: 0.5232
Combined Model - Epoch 2/3, Loss: 0.7394, Val Accuracy: 0.5458
Combined Model - Epoch 3/3, Loss: 0.6102, Val Accuracy: 0.5427

Step 4: Evaluating the combined model on the test set
Test Accuracy: 0.5427

Classification Report:
                          precision    recall  f1-score   support

       Broad Gender Bias       0.63      0.43      0.51        72
   Dismissive Addressing       0.00      0.00      0.00        16
     Everyday Derogation       0.68      0.58      0.63       190
Fixed Gender Perceptions       0.52      0.56      0.54       118
     Harmful Provocation       0.67      0.65      0.66        69
          Hostile Speech       0.49      0.63      0.55       186
    Masked Disparagement       0.25      0.08      0.12        13
         Menacing Speech       0.25      0.07      0.11        14
    Singular Gender Bias       0.24      0.19      0.21        21
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


This lead to a good increase in the accuracy but we have to still go a long way