In [1]:

!pip install transformers datasets torch scikit-learn pandas



As in the previous notebook we tried to reduce overfitting through- regularisation and k-fold cross validation 
Here in this code we try the dimensionality reduction technique to remove overfitting.
we particularly use PCA in here because this has proved to be better than svd because as it saves variance in the reduced dimensions

In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv")

# Encode labels
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label_id"].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create Dataloaders
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load BERT model for fine-tuning
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune BERT
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def extract_bert_embeddings(model, dataloader, device):
    model.eval()
    embeddings, labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, label = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["labels"]
            outputs = model.bert(input_ids, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Mean pooling
            embeddings.extend(hidden_states)
            labels.extend(label.numpy())
    return embeddings, labels

# Train BERT
epochs = 3
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}")

# Extract BERT embeddings for train and val sets
train_embeddings, train_labels = extract_bert_embeddings(model, train_loader, device)
val_embeddings, val_labels = extract_bert_embeddings(model, val_loader, device)

# Apply PCA
scaler = StandardScaler()
pca = PCA(n_components=300)  # Dimensionality reduction
train_embeddings = pca.fit_transform(scaler.fit_transform(train_embeddings))
val_embeddings = pca.transform(scaler.transform(val_embeddings))

# Define simple neural classifier
class NeuralClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(NeuralClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_classes)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.softmax(self.fc3(x))
        return x

# Convert numpy arrays to tensors
train_embeddings = torch.tensor(train_embeddings, dtype=torch.float32)
val_embeddings = torch.tensor(val_embeddings, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)
val_labels = torch.tensor(val_labels, dtype=torch.long)

# Create DataLoader for Neural Classifier
train_data = torch.utils.data.TensorDataset(train_embeddings, train_labels)
val_data = torch.utils.data.TensorDataset(val_embeddings, val_labels)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16, shuffle=False)

# Train the Neural Classifier
input_dim = 300  # Same as PCA output
dnn_model = NeuralClassifier(input_dim, num_labels).to(device)
optimizer = optim.Adam(dnn_model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

def train_nn(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        inputs, labels = batch[0].to(device), batch[1].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_nn(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch[0].to(device), batch[1].to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions)

# Train DNN Classifier
for epoch in range(5):
    train_loss = train_nn(dnn_model, train_loader, optimizer, criterion, device)
    val_acc = evaluate_nn(dnn_model, val_loader, device)
    print(f"Epoch {epoch+1}/5, Loss: {train_loss:.4f}, Val Accuracy: {val_acc:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 1.8520
Epoch 2/3, Loss: 1.3665
Epoch 3/3, Loss: 0.9752
Epoch 1/5, Loss: 1.7889, Val Accuracy: 0.5294
Epoch 2/5, Loss: 1.7030, Val Accuracy: 0.5273
Epoch 3/5, Loss: 1.6893, Val Accuracy: 0.5242
Epoch 4/5, Loss: 1.6742, Val Accuracy: 0.5221
Epoch 5/5, Loss: 1.6664, Val Accuracy: 0.5129


I did not think that 300 dimensions were capturing enough information so what I did was use the variance component of PCA 
format in which we set the dimensions to include 98% variance 
Lets see if that help us in getting better accuracy

In [6]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv("/kaggle/input/sexism-classification-dataset-csv/sexism_classification_dataset.csv")

# Encode labels
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label_vector"])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label_id"].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Load BERT model
bert_model = BertModel.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# Extract BERT embeddings
def extract_embeddings(texts, tokenizer, model, device):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)
            output = model(input_ids, attention_mask=attention_mask)
            embeddings.append(output.last_hidden_state[:, 0, :].cpu().numpy())
    return torch.tensor(embeddings).squeeze()

train_embeddings = extract_embeddings(train_texts, tokenizer, bert_model, device)
val_embeddings = extract_embeddings(val_texts, tokenizer, bert_model, device)

# Apply PCA to retain 98% variance
pca = PCA(n_components=0.98)
train_pca = pca.fit_transform(train_embeddings)
val_pca = pca.transform(val_embeddings)

# Convert to torch tensors
train_pca = torch.tensor(train_pca, dtype=torch.float32)
val_pca = torch.tensor(val_pca, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)
val_labels = torch.tensor(val_labels, dtype=torch.long)

# Custom Dataset for NN class
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

train_dataset = EmbeddingDataset(train_pca, train_labels)
val_dataset = EmbeddingDataset(val_pca, val_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define deeper NN model
class DeepNeuralNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(DeepNeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Model setup
input_dim = train_pca.shape[1]
num_classes = len(label_encoder.classes_)
model = DeepNeuralNet(input_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training function
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions)

# Training loop
epochs = 10
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    val_acc = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, Val Accuracy: {val_acc:.4f}")


  return torch.tensor(embeddings).squeeze()


Epoch 1/10, Loss: 2.2170, Val Accuracy: 0.3223
Epoch 2/10, Loss: 1.8979, Val Accuracy: 0.3357
Epoch 3/10, Loss: 1.7745, Val Accuracy: 0.3574
Epoch 4/10, Loss: 1.6664, Val Accuracy: 0.3656
Epoch 5/10, Loss: 1.5936, Val Accuracy: 0.3656
Epoch 6/10, Loss: 1.5240, Val Accuracy: 0.3800
Epoch 7/10, Loss: 1.4583, Val Accuracy: 0.3893
Epoch 8/10, Loss: 1.3869, Val Accuracy: 0.3852
Epoch 9/10, Loss: 1.3235, Val Accuracy: 0.4006
Epoch 10/10, Loss: 1.2642, Val Accuracy: 0.3986


But this certainly not helped and we saw a very dramatic change in the accuracy
So what do we do now? So I tried to implement a Roberta Model instead of Bert to get better accuracy as:
1) It is trained on 10 times larger datset than BERT
2) When the roberta model was trained in that, the researchers used masking techniques at training time that means whenever the sentence is taken for training we mask random words which helps it in generalising for not only next word prediction tasks but others as well
3) On the otherhand bert is trained in such a way that it is efficient for only next word prediction because of non-dynamic masking. 