In [1]:
# this code works but does it really work?
!pip install transformers datasets torch scikit-learn pandas



In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Load dataset
df = pd.read_csv("/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv")

# Encode labels
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label_id"].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Create TF-IDF vectors with more features
print("Creating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=3000,  # Increased from 1000 to 3000
    min_df=2,           # Minimum document frequency
    ngram_range=(1, 2)  # Include both unigrams and bigrams
)
tfidf_vectorizer.fit(train_texts)
train_tfidf = tfidf_vectorizer.transform(train_texts).toarray()
val_tfidf = tfidf_vectorizer.transform(val_texts).toarray()

print(f"Original TF-IDF features shape: {train_tfidf.shape}")

# Apply PCA to reduce dimensionality while preserving 95% variance
print("Applying PCA...")
pca = PCA(n_components=0.95)  # Keep 95% of variance
pca.fit(train_tfidf)
train_tfidf_pca = pca.transform(train_tfidf)
val_tfidf_pca = pca.transform(val_tfidf)

print(f"After PCA - features reduced to {train_tfidf_pca.shape[1]} components")
print(f"Variance explained: {sum(pca.explained_variance_ratio_):.4f}")

# Dataset class for BERT + TF-IDF-PCA
class CombinedDataset(Dataset):
    def __init__(self, texts, tfidf_vectors, labels, tokenizer, max_len=128):
        self.texts = texts
        self.tfidf_vectors = tfidf_vectors
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "tfidf_vector": torch.tensor(self.tfidf_vectors[idx], dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create dataset objects
train_dataset = CombinedDataset(train_texts, train_tfidf_pca, train_labels, tokenizer)
val_dataset = CombinedDataset(val_texts, val_tfidf_pca, val_labels, tokenizer)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Define the combined model
class CombinedModel(nn.Module):
    def __init__(self, bert_model, tfidf_dim, hidden_dim, num_classes):
        super(CombinedModel, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        
        # Combined dimensions: BERT (768) + TF-IDF-PCA dimensions
        combined_dim = 768 + tfidf_dim
        
        # Feed-forward layers with an additional hidden layer for better representation
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim // 2, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, tfidf_vector):
        # Get BERT embeddings (last hidden state)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the [CLS] token representation
        bert_embeddings = bert_output.last_hidden_state[:, 0, :]
        
        # Concatenate BERT embeddings with TF-IDF-PCA vector
        combined_features = torch.cat((bert_embeddings, tfidf_vector), dim=1)
        combined_features = self.dropout(combined_features)
        
        # Pass through classifier
        logits = self.classifier(combined_features)
        return logits

# Initialize BERT model
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Initialize the combined model
tfidf_dim = train_tfidf_pca.shape[1]  # Dimension of PCA-reduced TF-IDF vectors
hidden_dim = 512
num_labels = len(label_encoder.classes_)
model = CombinedModel(bert_model, tfidf_dim, hidden_dim, num_labels)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

# Set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Phase 1: Fine-tune BERT first
def train_bert_only(model, dataloader, optimizer, criterion, device, bert_only=True):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        tfidf_vector = batch["tfidf_vector"].to(device)
        labels = batch["labels"].to(device)
        
        # If bert_only is True, only update BERT parameters
        if bert_only:
            # Freeze the non-BERT parameters
            for name, param in model.named_parameters():
                if "bert" not in name:
                    param.requires_grad = False
        else:
            # Unfreeze all parameters
            for param in model.parameters():
                param.requires_grad = True
        
        # Forward pass
        logits = model(input_ids, attention_mask, tfidf_vector)
        loss = criterion(logits, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

# Phase 2: Train the combined model with full parameters
def train_combined(model, dataloader, optimizer, criterion, device):
    return train_bert_only(model, dataloader, optimizer, criterion, device, bert_only=False)

# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            tfidf_vector = batch["tfidf_vector"].to(device)
            labels = batch["labels"].to(device)
            
            logits = model(input_ids, attention_mask, tfidf_vector)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(true_labels, predictions)
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    weighted_f1 = f1_score(true_labels, predictions, average='weighted')
    
    return {
        'accuracy': acc,
        'macro_f1': macro_f1,
        'weighted_f1': weighted_f1,
        'predictions': predictions,
        'true_labels': true_labels
    }

# For comparison, implement BERT-only model evaluation
def evaluate_bert_only(texts, labels, device):
    # Create a standard BERT model
    bert_only_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
    bert_only_model.to(device)
    
    # Create dataset and dataloader
    class TextDataset(Dataset):
        def __init__(self, texts, labels, tokenizer, max_len=128):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            encoding = self.tokenizer(
                self.texts[idx],
                truncation=True,
                padding='max_length',
                max_length=self.max_len,
                return_tensors='pt'
            )
            return {
                "input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
                "labels": torch.tensor(self.labels[idx], dtype=torch.long)
            }
    
    bert_dataset = TextDataset(texts, labels, tokenizer)
    bert_loader = DataLoader(bert_dataset, batch_size=16, shuffle=True)
    
    # Optimizer
    bert_optimizer = AdamW(bert_only_model.parameters(), lr=2e-5)
    
    # Training function for BERT-only
    def train_bert(model, dataloader, optimizer, device):
        model.train()
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        return total_loss / len(dataloader)
    
    # Evaluation function for BERT-only
    def eval_bert(model, dataloader, device):
        model.eval()
        predictions, true_labels = [], []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                
                predictions.extend(preds)
                true_labels.extend(labels.cpu().numpy())
        
        acc = accuracy_score(true_labels, predictions)
        macro_f1 = f1_score(true_labels, predictions, average='macro')
        weighted_f1 = f1_score(true_labels, predictions, average='weighted')
        
        return {
            'accuracy': acc,
            'macro_f1': macro_f1,
            'weighted_f1': weighted_f1
        }
    
    # Train BERT-only for 3 epochs
    print("\nTraining BERT-only model for comparison...")
    for epoch in range(3):
        train_loss = train_bert(bert_only_model, bert_loader, bert_optimizer, device)
        eval_metrics = eval_bert(bert_only_model, bert_loader, device)
        print(f"Epoch {epoch+1}/3, Loss: {train_loss:.4f}, Accuracy: {eval_metrics['accuracy']:.4f}, Macro F1: {eval_metrics['macro_f1']:.4f}")
    
    # Final evaluation
    final_metrics = eval_bert(bert_only_model, bert_loader, device)
    return final_metrics

# Training loop - Phase 1: Fine-tune BERT
print("Phase 1: Fine-tuning BERT...")
bert_epochs = 3
for epoch in range(bert_epochs):
    train_loss = train_bert_only(model, train_loader, optimizer, criterion, device)
    val_metrics = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}/{bert_epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f}, Macro F1: {val_metrics['macro_f1']:.4f}")

# Training loop - Phase 2: Train the combined model with increased epochs
print("\nPhase 2: Training the combined model...")
combined_epochs = 5  # Increased from 3 to 5
for epoch in range(combined_epochs):
    train_loss = train_combined(model, train_loader, optimizer, criterion, device)
    val_metrics = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}/{combined_epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f}, Macro F1: {val_metrics['macro_f1']:.4f}")

# Final evaluation on validation data
print("\nFinal Evaluation of Combined Model on Validation Data:")
final_metrics = evaluate(model, val_loader, device)
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Macro F1 Score: {final_metrics['macro_f1']:.4f}")
print(f"Weighted F1 Score: {final_metrics['weighted_f1']:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(
    final_metrics['true_labels'], 
    final_metrics['predictions'], 
    target_names=label_encoder.classes_
))

# Compare with BERT-only approach
# print("\nComparing with BERT-only approach:")
# bert_only_metrics = evaluate_bert_only(train_texts, train_labels, device)
# print("\nBERT-only Final Metrics:")
# print(f"Accuracy: {bert_only_metrics['accuracy']:.4f}")
# print(f"Macro F1 Score: {bert_only_metrics['macro_f1']:.4f}")
# print(f"Weighted F1 Score: {bert_only_metrics['weighted_f1']:.4f}")

print("\nCombined Model Final Metrics:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Macro F1 Score: {final_metrics['macro_f1']:.4f}")
print(f"Weighted F1 Score: {final_metrics['weighted_f1']:.4f}")

if final_metrics['accuracy'] > bert_only_metrics['accuracy']:
    improvement = ((final_metrics['accuracy'] - bert_only_metrics['accuracy']) / bert_only_metrics['accuracy']) * 100
    print(f"\nCombined model improves accuracy by {improvement:.2f}%")
else:
    decrease = ((bert_only_metrics['accuracy'] - final_metrics['accuracy']) / bert_only_metrics['accuracy']) * 100
    print(f"\nCombined model decreases accuracy by {decrease:.2f}%")

# Feature importance analysis
print("\nFeature Importance Analysis:")
print(f"BERT embedding dimension: 768")
print(f"TF-IDF PCA dimensions: {tfidf_dim}")
print(f"Total feature dimensions: {768 + tfidf_dim}")
print(f"Top 10 PCA components explain {sum(pca.explained_variance_ratio_[:10])*100:.2f}% of variance")

# Save the model and preprocessing components
torch.save({
    'model_state_dict': model.state_dict(),
    'tfidf_vectorizer': tfidf_vectorizer,
    'pca': pca,
    'label_encoder': label_encoder
}, "combined_bert_tfidf_pca_model.pt")
print("\nModel and preprocessing components saved as 'combined_bert_tfidf_pca_model.pt'")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Creating TF-IDF vectors...
Original TF-IDF features shape: (3883, 3000)
Applying PCA...
After PCA - features reduced to 1780 components
Variance explained: 0.9501


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using device: cuda




Phase 1: Fine-tuning BERT...
Epoch 1/3, Loss: 2.2536, Val Accuracy: 0.3903, Macro F1: 0.1336
Epoch 2/3, Loss: 2.1265, Val Accuracy: 0.4408, Macro F1: 0.1898
Epoch 3/3, Loss: 2.0589, Val Accuracy: 0.4552, Macro F1: 0.2058

Phase 2: Training the combined model...
Epoch 1/5, Loss: 1.5614, Val Accuracy: 0.4809, Macro F1: 0.2507
Epoch 2/5, Loss: 1.0858, Val Accuracy: 0.5180, Macro F1: 0.3283
Epoch 3/5, Loss: 0.8069, Val Accuracy: 0.5314, Macro F1: 0.3430
Epoch 4/5, Loss: 0.5574, Val Accuracy: 0.5046, Macro F1: 0.3638
Epoch 5/5, Loss: 0.3881, Val Accuracy: 0.5160, Macro F1: 0.3727

Final Evaluation of Combined Model on Validation Data:
Accuracy: 0.5160
Macro F1 Score: 0.3727
Weighted F1 Score: 0.5116

Classification Report:
                          precision    recall  f1-score   support

       Broad Gender Bias       0.51      0.54      0.52        72
   Dismissive Addressing       0.00      0.00      0.00        16
     Everyday Derogation       0.63      0.56      0.59       190
Fixed G

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


NameError: name 'bert_only_metrics' is not defined

Here we did not compare our model with Bert_only metrics so that is the reason of error I have commented out the code for that.