In [1]:
# this code works but does it really work?
!pip install transformers datasets torch scikit-learn pandas



In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the labelled corpus from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv")

# Turn every distinct `label_vector` value into an integer class id.
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label_id"].tolist(),
    test_size=0.2,
    random_state=42,
)

# WordPiece tokenizer matching the pretrained encoder used further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# TF-IDF features: the vocabulary is fitted on the training texts only and is
# capped at 1000 terms (adjust max_features as needed); both splits are then
# projected with that vocabulary and densified.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectorizer.fit(train_texts)
train_tfidf, val_tfidf = (
    tfidf_vectorizer.transform(split_texts).toarray()
    for split_texts in (train_texts, val_texts)
)

# Dataset class for BERT + TF-IDF
class CombinedDataset(Dataset):
    """Dataset yielding tokenized text together with a TF-IDF vector and a label.

    Args:
        texts: Indexable collection of raw strings.
        tfidf_vectors: Indexable collection of per-text feature vectors,
            aligned with ``texts`` by position.
        labels: Indexable collection of integer class ids, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            whose result is indexable by ``"input_ids"`` and
            ``"attention_mask"``.
        max_len: ``max_length`` value forwarded to the tokenizer.
    """

    def __init__(self, texts, tfidf_vectors, labels, tokenizer, max_len=128):
        self.texts, self.tfidf_vectors, self.labels = texts, tfidf_vectors, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` on the fly and bundle it with its features.

        Returns:
            dict with keys ``input_ids``, ``attention_mask``, ``tfidf_vector``
            (float tensor) and ``labels`` (long tensor).
        """
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # squeeze(0) drops the leading axis of the tokenizer output
        # (assumed to be a batch axis of size 1 — the tokenizer gets one text).
        sample = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["tfidf_vector"] = torch.tensor(self.tfidf_vectors[idx], dtype=torch.float)
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split (texts, dense TF-IDF rows, label ids) in a CombinedDataset.
train_dataset = CombinedDataset(
    texts=train_texts,
    tfidf_vectors=train_tfidf,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_dataset = CombinedDataset(
    texts=val_texts,
    tfidf_vectors=val_tfidf,
    labels=val_labels,
    tokenizer=tokenizer,
)

# Batches of 16; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

# Define the combined model
class CombinedModel(nn.Module):
    """Classifier over the concatenation of a BERT [CLS] vector and a TF-IDF vector.

    Args:
        bert_model: Encoder module. It is called as
            ``bert_model(input_ids=..., attention_mask=...)`` and must return an
            object exposing ``last_hidden_state``.
        tfidf_dim: Width of the TF-IDF vector concatenated to the BERT vector.
        hidden_dim: Width of the hidden layer of the feed-forward head.
        num_classes: Number of output logits.
    """

    # Encoder width used when the encoder does not expose `config.hidden_size`.
    DEFAULT_BERT_DIM = 768

    def __init__(self, bert_model, tfidf_dim, hidden_dim, num_classes):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)

        # Take the encoder width from its config instead of hard-coding 768, so
        # encoders with another hidden size do not hit a shape mismatch in the
        # first Linear layer. Falls back to 768 when no config is available.
        bert_config = getattr(bert_model, "config", None)
        bert_dim = getattr(bert_config, "hidden_size", self.DEFAULT_BERT_DIM)
        combined_dim = bert_dim + tfidf_dim

        # Feed-forward head: Linear -> ReLU -> Dropout -> Linear.
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask, tfidf_vector):
        """Return class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            tfidf_vector: TF-IDF features, concatenated to the encoder vector
                along dim 1 (so it must share the batch dimension).

        Returns:
            Tensor of logits produced by the feed-forward head.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sentence representation
        # (the [CLS] token for BERT tokenizers).
        bert_embeddings = bert_output.last_hidden_state[:, 0, :]

        # Dropout is applied to the joint feature vector, TF-IDF part included.
        combined_features = torch.cat((bert_embeddings, tfidf_vector), dim=1)
        combined_features = self.dropout(combined_features)

        return self.classifier(combined_features)

# Pretrained encoder that the combined model wraps.
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Head sizes come from the data: TF-IDF width and number of encoded classes.
tfidf_dim = train_tfidf.shape[1]
hidden_dim = 512
num_labels = len(label_encoder.classes_)
model = CombinedModel(
    bert_model=bert_model,
    tfidf_dim=tfidf_dim,
    hidden_dim=hidden_dim,
    num_classes=num_labels,
)

# Run on CUDA when it is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Cross-entropy loss and a single optimizer over every model parameter.
# NOTE(review): `AdamW` here is the one imported from `transformers`; newer
# transformers releases appear to deprecate it in favour of
# `torch.optim.AdamW` — confirm against the installed version.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# Phase 1: Fine-tune BERT first
def train_bert_only(model, dataloader, optimizer, criterion, device, bert_only=True):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask, tfidf_vector)``
            and returning logits.
        dataloader: Sized iterable of batches; each batch is a mapping with keys
            ``input_ids``, ``attention_mask``, ``tfidf_vector`` and ``labels``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        bert_only: When True, every parameter whose name does not contain
            ``"bert"`` gets ``requires_grad = False`` (parameters whose name does
            contain it are left untouched). When False, every parameter gets
            ``requires_grad = True``.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` has length 0.
    """
    model.train()

    # Which parameters are trainable depends only on `bert_only`, so set it once
    # per epoch rather than walking every parameter again for each batch.
    if bert_only:
        for name, param in model.named_parameters():
            if "bert" not in name:
                param.requires_grad = False
    else:
        for param in model.parameters():
            param.requires_grad = True

    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        tfidf_vector = batch["tfidf_vector"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        logits = model(input_ids, attention_mask, tfidf_vector)
        loss = criterion(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

# Phase 2: Train the combined model with full parameters
def train_combined(model, dataloader, optimizer, criterion, device):
    """Run one epoch via ``train_bert_only`` with ``bert_only=False``.

    All arguments are forwarded unchanged; the value returned by
    ``train_bert_only`` is returned as-is.
    """
    epoch_loss = train_bert_only(
        model,
        dataloader,
        optimizer,
        criterion,
        device,
        bert_only=False,
    )
    return epoch_loss

# Evaluation function
def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader`` without gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask, tfidf_vector)``
            and returning logits; it is switched to eval mode.
        dataloader: Iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``tfidf_vector`` and ``labels``.
        device: Device the batch tensors are moved to.

    Returns:
        dict with ``accuracy``, ``macro_f1``, ``weighted_f1`` plus the raw
        ``predictions`` and ``true_labels`` lists they were computed from.
    """
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in dataloader:
            targets = batch["labels"].to(device)
            logits = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["tfidf_vector"].to(device),
            )
            # The predicted class is the index of the largest logit in each row.
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    return {
        'accuracy': accuracy_score(all_targets, all_preds),
        'macro_f1': f1_score(all_targets, all_preds, average='macro'),
        'weighted_f1': f1_score(all_targets, all_preds, average='weighted'),
        'predictions': all_preds,
        'true_labels': all_targets,
    }


    
    # NOTE(review): from here down to `return final_metrics` the code is indented
    # like a function body, but no enclosing `def` line is present in the visible
    # code. `TextDataset`, `texts`, `labels` and `bert_only_model` are likewise
    # not defined anywhere in the visible code — confirm against the original
    # notebook before running this section.
    # Wrap the texts and labels in a dataset and a shuffled loader (batch size 16).
    bert_dataset = TextDataset(texts, labels, tokenizer)
    bert_loader = DataLoader(bert_dataset, batch_size=16, shuffle=True)
    
    # Optimizer
    bert_optimizer = AdamW(bert_only_model.parameters(), lr=2e-5)
    
    # Training function for BERT-only
    def train_bert(model, dataloader, optimizer, device):
        """Train ``model`` for one pass over ``dataloader``; return the mean loss.

        The model is called as ``model(input_ids, attention_mask=..., labels=...)``
        and must return an object exposing ``loss``. Each batch is a mapping with
        keys ``input_ids``, ``attention_mask`` and ``labels``. The result is the
        sum of the per-batch losses divided by ``len(dataloader)``.
        """
        model.train()
        running_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            outputs = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )
            batch_loss = outputs.loss

            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        return running_loss / len(dataloader)
    
    # Evaluation function for BERT-only
    def eval_bert(model, dataloader, device):
        """Score ``model`` on ``dataloader`` with gradients disabled.

        The model is called as ``model(input_ids, attention_mask=...)`` and must
        return an object exposing ``logits``. Each batch is a mapping with keys
        ``input_ids``, ``attention_mask`` and ``labels``.

        Returns:
            dict with ``accuracy``, ``macro_f1`` and ``weighted_f1``.
        """
        model.eval()
        all_preds = []
        all_targets = []

        with torch.no_grad():
            for batch in dataloader:
                targets = batch["labels"].to(device)
                outputs = model(
                    batch["input_ids"].to(device),
                    attention_mask=batch["attention_mask"].to(device),
                )
                # The predicted class is the index of the largest logit per row.
                all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
                all_targets.extend(targets.cpu().numpy())

        return {
            'accuracy': accuracy_score(all_targets, all_preds),
            'macro_f1': f1_score(all_targets, all_preds, average='macro'),
            'weighted_f1': f1_score(all_targets, all_preds, average='weighted'),
        }
    
    # Train BERT-only for 3 epochs
    # NOTE(review): `bert_loader` is passed both to `train_bert` and to every
    # `eval_bert` call below, so the accuracy / F1 printed and returned here are
    # measured on the same batches the model was trained on, not on a held-out
    # split.
    print("\nTraining BERT-only model for comparison...")
    for epoch in range(3):
        train_loss = train_bert(bert_only_model, bert_loader, bert_optimizer, device)
        eval_metrics = eval_bert(bert_only_model, bert_loader, device)
        print(f"Epoch {epoch+1}/3, Loss: {train_loss:.4f}, Accuracy: {eval_metrics['accuracy']:.4f}, Macro F1: {eval_metrics['macro_f1']:.4f}")
    
    # Final evaluation
    final_metrics = eval_bert(bert_only_model, bert_loader, device)
    return final_metrics

# Training loop - Phase 1: Fine-tune BERT
# `train_bert_only` is called with its default `bert_only=True`, which sets
# `requires_grad = False` on every parameter whose name does not contain "bert";
# the classifier head is therefore not updated during this phase.
print("Phase 1: Fine-tuning BERT...")
bert_epochs = 3
for epoch in range(bert_epochs):
    train_loss = train_bert_only(model, train_loader, optimizer, criterion, device)
    val_metrics = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}/{bert_epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f}, Macro F1: {val_metrics['macro_f1']:.4f}")

# Training loop - Phase 2: Train the combined model with increased epochs
# `train_combined` re-enables gradients on all parameters; the same `optimizer`
# instance from phase 1 is reused.
print("\nPhase 2: Training the combined model...")
combined_epochs = 5  # Increased from 3 to 5
for epoch in range(combined_epochs):
    train_loss = train_combined(model, train_loader, optimizer, criterion, device)
    val_metrics = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}/{combined_epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f}, Macro F1: {val_metrics['macro_f1']:.4f}")

# Final evaluation on validation data
print("\nFinal Evaluation of Combined Model on Validation Data:")
final_metrics = evaluate(model, val_loader, device)
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Macro F1 Score: {final_metrics['macro_f1']:.4f}")
print(f"Weighted F1 Score: {final_metrics['weighted_f1']:.4f}")

# Print detailed classification report
# `target_names` maps the integer class ids back to the original label strings.
print("\nClassification Report:")
print(classification_report(
    final_metrics['true_labels'], 
    final_metrics['predictions'], 
    target_names=label_encoder.classes_
))


# Prints the same three values from `final_metrics` again under a different heading.
print("\nCombined Model Final Metrics:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Macro F1 Score: {final_metrics['macro_f1']:.4f}")
print(f"Weighted F1 Score: {final_metrics['weighted_f1']:.4f}")


# Save the model
# NOTE(review): the weights written here are those after the last phase-2 epoch;
# no per-epoch checkpoint is kept, so an earlier epoch with better validation
# metrics cannot be recovered from this file.
torch.save(model.state_dict(), "combined_bert_tfidf_model.pt")
print("\nModel saved as 'combined_bert_tfidf_model.pt'")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using device: cuda




Phase 1: Fine-tuning BERT...
Epoch 1/3, Loss: 1.9689, Val Accuracy: 0.4346, Macro F1: 0.1905
Epoch 2/3, Loss: 1.6497, Val Accuracy: 0.4902, Macro F1: 0.2589
Epoch 3/3, Loss: 1.4203, Val Accuracy: 0.4943, Macro F1: 0.2934

Phase 2: Training the combined model...
Epoch 1/5, Loss: 0.9913, Val Accuracy: 0.5180, Macro F1: 0.3607
Epoch 2/5, Loss: 0.5931, Val Accuracy: 0.5324, Macro F1: 0.3941
Epoch 3/5, Loss: 0.3651, Val Accuracy: 0.5057, Macro F1: 0.3891
Epoch 4/5, Loss: 0.2541, Val Accuracy: 0.5067, Macro F1: 0.3721
Epoch 5/5, Loss: 0.1833, Val Accuracy: 0.5139, Macro F1: 0.4061

Final Evaluation of Combined Model on Validation Data:
Accuracy: 0.5139
Macro F1 Score: 0.4061
Weighted F1 Score: 0.5052

Classification Report:
                          precision    recall  f1-score   support

       Broad Gender Bias       0.75      0.25      0.38        72
   Dismissive Addressing       0.00      0.00      0.00        16
     Everyday Derogation       0.60      0.63      0.61       190
Fixed G

Here we used a feed-forward neural network (FFNN) trained on the concatenation of the BERT and TF-IDF embeddings, with an input dimension of 768 + (dimension of the TF-IDF vectors).
However, this architecture reduces our accuracy, so next we apply PCA dimensionality reduction to the TF-IDF embeddings and then train an FFNN-based architecture with more layers.
Notebook name: tf-idf+BERT+nn_v2.ipynb

For comparison and exploration purposes, let's also use
a co-occurrence matrix with SVD reduction on a CNN-based model.
CNNs are also used for text classification (as in Kim's 2014 paper; link: https://arxiv.org/pdf/1408.5882).
Notebook name: coocc-svd-bert-cnn.ipynb


