In [None]:
pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transfor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch import cuda
import torch
from transformers import DistilBertConfig

# Load and preprocess the data
# Location of the labelled CSV; change this single constant when the file lives elsewhere.
DATA_PATH = '/content/edos_labelled_aggregated.csv'
# Columns read below; checked up front so a wrong file fails with a clear message.
REQUIRED_COLUMNS = ['text', 'label_sexist', 'label_category', 'label_vector']

data = pd.read_csv(DATA_PATH)

missing_columns = [column for column in REQUIRED_COLUMNS if column not in data.columns]
if missing_columns:
    raise KeyError(f'{DATA_PATH} is missing expected columns: {missing_columns}')

labels = data['label_sexist'].values
categories = data['label_category'].values
vectors = data['label_vector'].values
texts = data['text'].values

# Split the data into train and test sets.
# All four arrays are passed in one call so they are shuffled with the same permutation
# and stay row-aligned; the fixed random_state keeps the split reproducible.
(
    train_texts, test_texts,
    train_labels, test_labels,
    train_categories, test_categories,
    train_vectors, test_vectors,
) = train_test_split(
    texts, labels, categories, vectors, test_size=0.2, random_state=42
)

# Define the label mappings: each one turns an annotation string into an integer id.

# Binary target (values of the `label_sexist` column).
label_mapping = {'sexist': 1, 'not sexist': 0}

# Category names, listed in id order (position in the tuple == numeric id).
_CATEGORY_NAMES = (
    'none',
    '1. threats, plans to harm and incitement',
    '2. derogation',
    '3. animosity',
    '4. prejudiced discussions',
    # Append further categories here; ids follow the tuple order
)
category_mapping = {name: idx for idx, name in enumerate(_CATEGORY_NAMES)}

# Fine-grained vector names, listed in id order (position in the tuple == numeric id).
_VECTOR_NAMES = (
    'none',
    '1.1 threats of harm',
    '1.2 incitement and encouragement of harm',
    '2.1 descriptive attacks',
    '2.2 aggressive and emotive attacks',
    '2.3 dehumanising attacks & overt sexual objectification',
    '3.1 casual use of gendered slurs, profanities, and insults',
    '3.2 immutable gender differences and gender stereotypes',
    '3.3 backhanded gendered compliments',
    '3.4 condescending explanations or unwelcome advice',
    '4.1 supporting mistreatment of individual women',
    '4.2 supporting systemic discrimination against women as a group',
    # Append further vectors here; ids follow the tuple order
)
vector_mapping = {name: idx for idx, name in enumerate(_VECTOR_NAMES)}

In [None]:
# Define the custom dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and encodes its three annotations.

    Each item is a dict with the token ids and attention mask for the text
    (truncated / padded to ``max_len``) plus the integer ids of its label,
    category and vector, looked up in the module-level ``label_mapping``,
    ``category_mapping`` and ``vector_mapping`` dictionaries.
    """

    def __init__(self, texts, labels, categories, vectors, tokenizer, max_len):
        """Store the parallel annotation sequences, the tokenizer and the length limit."""
        self.texts, self.labels = texts, labels
        self.categories, self.vectors = categories, vectors
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and encoded annotations for sample ``idx``.

        Raises ``KeyError`` when an annotation string is absent from its mapping.
        """
        # str() guards against non-string cells in the text sequence
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]
        raw_category = self.categories[idx]
        raw_vector = self.vectors[idx]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )

        # The tokenizer output is flattened so each item carries 1-D tensors
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(label_mapping[raw_label])
        item['category'] = torch.tensor(category_mapping[raw_category])
        item['vector'] = torch.tensor(vector_mapping[raw_vector])
        return item

In [None]:
# Set device: use the GPU when torch can see one, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Set hyperparameters
MAX_LEN = 128          # token length every text is truncated / padded to
BATCH_SIZE = 16        # samples per DataLoader batch
EPOCHS = 5             # full passes over the training set
LEARNING_RATE = 2e-5   # step size handed to the optimizer

from transformers import DistilBertForSequenceClassification, DistilBertModel
import torch.nn as nn

class CustomDistilBertForSequenceClassification(DistilBertForSequenceClassification):
    """DistilBERT classifier whose linear head also receives two scalar side features.

    The head is applied to the dropout-regularised first-token ([CLS]) embedding
    concatenated with one scalar for ``category`` and one for ``vector``.

    The ground-truth ``labels`` are deliberately NOT fed to the head: using the
    target as an input feature is target leakage and makes every reported metric
    meaningless.

    NOTE(review): ``category`` and ``vector`` appear to be finer-grained versions
    of the same annotation as the target, so they may leak it as well — confirm
    whether they are really available at inference time.
    """

    # Number of scalar side features appended to the [CLS] embedding (category, vector)
    NUM_EXTRA_FEATURES = 2

    def __init__(self, config):
        """Build the model; the parent constructor already creates ``self.distilbert``."""
        super().__init__(config)
        self.dropout = nn.Dropout(config.dropout)
        # One additional input unit for each extra feature
        self.classifier = nn.Linear(config.hidden_size + self.NUM_EXTRA_FEATURES, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, category=None, labels=None, vector=None, **kwargs):
        """Run the encoder and classify the [CLS] embedding plus side features.

        Args:
            input_ids: token ids, forwarded to the DistilBERT encoder.
            attention_mask: attention mask, forwarded to the encoder.
            category: optional 1-D tensor with one category id per sample.
            labels: accepted so existing call sites keep working; intentionally
                unused (no loss is computed here and it is never an input feature).
            vector: optional 1-D tensor with one vector id per sample.
            **kwargs: forwarded unchanged to the encoder.

        Returns:
            A tuple whose first element is the logits tensor, followed by any
            additional encoder outputs (hidden states / attentions) if present.
        """
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        # Integer indexing works for both the ModelOutput and the plain-tuple return form
        hidden_state = distilbert_output[0][:, 0, :]  # Extract the [CLS] token embedding
        hidden_state = self.dropout(hidden_state)

        features = [hidden_state]
        for extra in (category, vector):
            if extra is None:
                # A missing feature is zero-filled so the head always sees the width it was built for
                extra = hidden_state.new_zeros(hidden_state.size(0))
            # Match dtype/device of the embedding, then add a feature dimension: (batch,) -> (batch, 1)
            features.append(extra.to(hidden_state).unsqueeze(1))

        logits = self.classifier(torch.cat(features, dim=1))
        return (logits,) + distilbert_output[1:]  # Add hidden states and attention if they are present


In [None]:
# Seed torch so weight initialisation and DataLoader shuffling are repeatable
SEED = 42  # same value as the random_state used for the train/test split
torch.manual_seed(SEED)

# Load the tokenizer and model
PRETRAINED_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
config = DistilBertConfig.from_pretrained(PRETRAINED_NAME, num_labels=2)
# from_pretrained loads the pretrained encoder weights. Constructing the class
# directly from `config` leaves the whole network randomly initialised, which
# means fine-tuning would start from scratch. The classification head is not
# part of the checkpoint and is still freshly initialised (transformers warns
# about this; that is expected).
model = CustomDistilBertForSequenceClassification.from_pretrained(PRETRAINED_NAME, config=config)

# Move model to device
model = model.to(device)

# Create data loaders (only the training data is shuffled)
train_dataset = TextDataset(train_texts, train_labels, train_categories, train_vectors, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TextDataset(test_texts, test_labels, test_categories, test_vectors, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Set optimizer and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

criterion = nn.CrossEntropyLoss()

# Training loop
def _run_batch(batch):
    """Move one batch to ``device`` and run ``model`` on it.

    Returns a ``(logits, labels, category, vector)`` tuple of tensors on ``device``.
    Keeping these tensors local to the helper stops a batch from overwriting the
    module-level ``labels`` / ``categories`` / ``vectors`` arrays, and stops the
    evaluation pass from accidentally reading tensors left over from training.

    NOTE(review): the gold ``labels`` are handed to the model's forward pass.
    Confirm the model does not use them as an input feature; if it does, every
    metric below is inflated by target leakage.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    batch_labels = batch['label'].to(device)
    batch_category = batch['category'].to(device)
    batch_vector = batch['vector'].to(device)

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=batch_labels,
        category=batch_category,
        vector=batch_vector
    )
    return outputs[0], batch_labels, batch_category, batch_vector


def _micro_scores(y_true, y_pred):
    """Return ``(accuracy, precision, recall, f1)`` using micro averaging."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='micro'),
        recall_score(y_true, y_pred, average='micro'),
        f1_score(y_true, y_pred, average='micro'),
    )


def _print_scores(title, scores):
    """Print one four-line score section under a ``Scores for <title>:`` heading."""
    accuracy, precision, recall, f1 = scores
    print(f"\nScores for {title}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1: {f1:.4f}")


for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        logits, batch_labels, _, _ = _run_batch(batch)
        loss = criterion(logits, batch_labels)  # Calculate the loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError on an empty loader
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch+1}/{EPOCHS}, Average Loss: {avg_loss:.4f}')

    # ---- evaluation on the test set ----
    model.eval()
    test_loss = 0.0
    predictions = []
    true_labels = []
    true_categories = []
    true_vectors = []

    with torch.no_grad():
        for batch in test_loader:
            logits, batch_labels, batch_category, batch_vector = _run_batch(batch)

            # Accumulate the loss so the reported test loss is not a constant 0.0
            test_loss += criterion(logits, batch_labels).item()

            # argmax over logits equals argmax over softmax probabilities
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            true_labels.extend(batch_labels.cpu().tolist())
            # Ground truth must come from the current test batch
            true_categories.extend(batch_category.cpu().tolist())
            true_vectors.extend(batch_vector.cpu().tolist())

    # Binary metrics for the positive class (1), computed over the whole test set.
    # zero_division=0 keeps the epoch running when no positive is predicted.
    overall_accuracy = accuracy_score(true_labels, predictions)
    overall_precision = precision_score(true_labels, predictions, zero_division=0)
    overall_recall = recall_score(true_labels, predictions, zero_division=0)
    overall_f1 = f1_score(true_labels, predictions, zero_division=0)

    print('Overall score')
    print(f'Accuracy: {overall_accuracy:.4f}')
    print(f'Precision: {overall_precision:.4f}')
    print(f'Recall: {overall_recall:.4f}')
    print(f'F1-score: {overall_f1:.4f}')
    print('---')

    avg_test_loss = test_loss / max(len(test_loader), 1)
    print(f'Test Loss: {avg_test_loss}')

    # NOTE(review): the model only predicts the binary label, so scoring those
    # predictions against multi-class vector / category ids only measures
    # agreement on ids 0 and 1. Kept for output compatibility; consider a
    # per-category breakdown of the binary predictions instead.
    _print_scores('Vectors', _micro_scores(true_vectors, predictions))
    _print_scores('Labels', _micro_scores(true_labels, predictions))
    _print_scores('Categories', _micro_scores(true_categories, predictions))

Epoch 1/5, Average Loss: 0.4462
Overall acore
Accuracy: 0.8413
Precision: 0.8053
Recall: 0.4840
F1-score: 0.6047
---
Test Loss: 0.0

Scores for Vectors:
Accuracy: 0.4885
Precision: 0.4885
Recall: 0.4885
F1: 0.4885

Scores for Labels:
Accuracy: 0.8458
Precision: 0.8458
Recall: 0.8458
F1: 0.8458

Scores for Categories:
Accuracy: 0.4973
Precision: 0.4973
Recall: 0.4973
F1: 0.4973
Epoch 2/5, Average Loss: 0.3119
Overall acore
Accuracy: 0.8573
Precision: 0.7700
Recall: 0.6144
F1-score: 0.6834
---
Test Loss: 0.0

Scores for Vectors:
Accuracy: 0.6068
Precision: 0.6068
Recall: 0.6068
F1: 0.6068

Scores for Labels:
Accuracy: 0.8560
Precision: 0.8560
Recall: 0.8560
F1: 0.8560

Scores for Categories:
Accuracy: 0.6068
Precision: 0.6068
Recall: 0.6068
F1: 0.6068
Epoch 3/5, Average Loss: 0.2369
Overall acore
Accuracy: 0.8553
Precision: 0.7751
Recall: 0.5957
F1-score: 0.6737
---
Test Loss: 0.0

Scores for Vectors:
Accuracy: 0.6635
Precision: 0.6635
Recall: 0.6635
F1: 0.6635

Scores for Labels:
Accura