# This notebook compares the test set results of the Vidgen et al. (2021) model against the three reclaimed hate speech models.

# Evaluation of RHS Model with No Cross-Validation on Test Dataset

Importing the dataset

In [None]:
import pandas as pd

In [None]:
# Load the test-set CSV; the path is relative, so the file must be in the working directory.
dataset = pd.read_csv('Notebook_11_Finetune_testset.csv')

In [None]:
# Display the loaded DataFrame to inspect its columns and a sample of its rows.
dataset

Unnamed: 0,sentences,hate_label,gold_label,dataset,bitch,drag,fag,gay,homo,other,puss,queen,queer,sissy,slay,slut,whore,RHS Model Prediction,Vidgen Model Prediction
0,BbBB...b.b..b.bb but OP's a faggot,1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
1,"it's a fucking crab, nobody cares faggot",1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
2,"&gt;Tasteless ""politically incorrect"" dark, of...",1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,Mass tagger seems pretty useful for knowing wh...,1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
4,Wild! Faggot was my nickname in high school! W...,0,NDG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2534,the devils been sending his workers to get at ...,1,DEG,kaggle,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
2535,the jews gon be laughin at a nigga on Judgemen...,1,DEG,kaggle,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2536,"this fat bitch was preaching like ""we have too...",1,DEG,kaggle,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2537,those random kids who sit at the plaza all day...,1,DEG,kaggle,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1


In [None]:
!pip install transformers



Importing and initializing the model and tokenizer

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Identifiers of the fine-tuned RHS model (no cross-validation) and its tokenizer.
model_repo = "EZiisk/NEW_EZ_finetune_Vidgen_model_reclaimed_language"
tokenizer_repo = "EZiisk/NEW_EZ_finetune_Vidgen_model_reclaimed_language_tokenizer"

# Instantiate the sequence classifier and the tokenizer from those identifiers.
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)

In [None]:
# Show the test-set DataFrame again (it has not been modified since it was loaded).
dataset

Unnamed: 0,sentences,hate_label,gold_label,dataset,bitch,drag,fag,gay,homo,other,puss,queen,queer,sissy,slay,slut,whore,RHS Model Prediction,Vidgen Model Prediction
0,BbBB...b.b..b.bb but OP's a faggot,1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
1,"it's a fucking crab, nobody cares faggot",1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
2,"&gt;Tasteless ""politically incorrect"" dark, of...",1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,Mass tagger seems pretty useful for knowing wh...,1,DEG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
4,Wild! Faggot was my nickname in high school! W...,0,NDG,kurrek,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2534,the devils been sending his workers to get at ...,1,DEG,kaggle,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
2535,the jews gon be laughin at a nigga on Judgemen...,1,DEG,kaggle,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2536,"this fat bitch was preaching like ""we have too...",1,DEG,kaggle,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
2537,those random kids who sit at the plaza all day...,1,DEG,kaggle,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1


Preparing the sentences and labels.

In [None]:
# Pull the input texts and their hate labels out of the DataFrame as plain Python lists.
sentences = dataset.loc[:, 'sentences'].tolist()
labels = dataset.loc[:, 'hate_label'].tolist()

In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers[torch]



In [None]:
import torch

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on that device and switch it to evaluation mode
# (both calls return the module itself, so they can be chained).
model = model.to(device).eval()

# Encode every sentence in one call: padded, truncated at max_length=512,
# and returned as PyTorch tensors.
encode_options = dict(return_tensors="pt", padding=True, truncation=True, max_length=512)
tokenized_inputs = tokenizer(sentences, **encode_options)

Defining the Custom Dataloader

In [None]:
# Import necessary modules from the PyTorch library.
from torch.utils.data import Dataset, DataLoader

class HateSpeechDataset(Dataset):
    """Dataset pairing pre-tokenized inputs with their labels.

    Args:
        tokenized_inputs: Mapping from field name to an indexable collection
            holding one entry per sample.
        labels: Indexable sequence of labels, one per sample.
    """

    def __init__(self, tokenized_inputs, labels):
        self.tokenized_inputs, self.labels = tokenized_inputs, labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every tokenized field at ``idx`` plus its label under ``'labels'``."""
        sample = {}
        for field, values in self.tokenized_inputs.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Number of samples per batch served by the DataLoader.
batch_size = 32

# Wrap the tokenized inputs and labels. A dedicated name is used so that the
# `dataset` DataFrame loaded earlier is not overwritten; this keeps the earlier
# cells that read from `dataset` re-runnable.
test_dataset = HateSpeechDataset(tokenized_inputs, labels)

# shuffle=False keeps the batches in row order, so predictions collected from
# this loader line up with the rows of the source DataFrame.
dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
!pip install tqdm



Executing the evaluation loop to get accuracy and F1 metrics

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Pick the compute device (GPU when available) and make sure the model is on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

all_preds, all_labels = [], []

# Inference only: gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for raw_batch in tqdm(dataloader):
        # Every tensor in the batch has to sit on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit in each row.
        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(batch_preds)
        all_labels.extend(batch['labels'].cpu().numpy())

# Score the collected predictions against the labels served by the loader.
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


100%|██████████| 80/80 [00:23<00:00,  3.34it/s]

Accuracy: 0.8625
F1 Score: 0.8804





In [None]:
# Re-read the test-set CSV into a separate DataFrame that collects each model's predictions.
df = pd.read_csv('Notebook_11_Finetune_testset.csv')

In [None]:
# Remove the prediction columns that ship with the CSV so they can be replaced
# by the predictions computed in this notebook. errors='ignore' makes the cell
# safe to re-run once the columns are already gone (otherwise drop raises KeyError).
df = df.drop(columns=['RHS Model Prediction', 'Vidgen Model Prediction'], errors='ignore')

Add the predictions of the RHS model to the dataframe.

In [None]:
# Predictions were collected in DataLoader order (shuffle=False), so they align row-for-row with df.
df['RHS Model (No Cross_Validation)'] = all_preds

# Evaluation of RHS Model with Best Fold from Cross-Validation on Test Dataset

In [None]:
# Identifiers of the best cross-validation fold model and its tokenizer.
model_repo = "EZiisk/EZ_finetune_Vidgen_model_RHS_Best"
tokenizer_repo = "EZiisk/EZ_finetune_Vidgen_model_RHS_Best_Tokenizer"

# Replace the previously loaded model and tokenizer with the best-fold ones.
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)

Downloading (…)lve/main/config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Move the newly loaded model to the device and put it in evaluation mode.
model = model.to(device)
model.eval()

# Encode the sentences with the tokenizer that belongs to this model.
tokenized_inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Rebuild the DataLoader around the fresh encodings. The existing `dataloader`
# still wraps the encodings produced by the previously loaded tokenizer, so
# without this step the `tokenized_inputs` computed above would never reach the model.
dataloader = DataLoader(HateSpeechDataset(tokenized_inputs, labels), batch_size=batch_size, shuffle=False)

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Pick the compute device (GPU when available) and make sure the model is on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

all_preds2, all_labels2 = [], []

# Inference only: gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for raw_batch in tqdm(dataloader):
        # Every tensor in the batch has to sit on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit in each row.
        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds2.extend(batch_preds)
        all_labels2.extend(batch['labels'].cpu().numpy())

# Score the collected predictions against the labels served by the loader.
accuracy = accuracy_score(all_labels2, all_preds2)
f1 = f1_score(all_labels2, all_preds2)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


100%|██████████| 80/80 [00:24<00:00,  3.31it/s]

Accuracy: 0.8795
F1 Score: 0.8968





In [None]:
# Store the best-fold model's predictions; they follow DataLoader order (shuffle=False), i.e. df row order.
df['RHS Model (Best Fold)'] = all_preds2

# Evaluation of RHS Model with Averaged Weights from Cross-Validation on Test Dataset

In [None]:
# Identifiers of the averaged-weights (ensemble) model and its tokenizer.
model_repo = "EZiisk/EZ_finetune_Vidgen_model_RHS_ensemble"
tokenizer_repo = "EZiisk/EZ_finetune_Vidgen_model_RHS_ensemble_tokenizer"

# Replace the previously loaded model and tokenizer with the ensemble ones.
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)

Downloading (…)lve/main/config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Move the newly loaded model to the device and put it in evaluation mode.
model = model.to(device)
model.eval()

# Encode the sentences with the tokenizer that belongs to this model.
tokenized_inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Rebuild the DataLoader around the fresh encodings. The existing `dataloader`
# still wraps the encodings produced by a previously loaded tokenizer, so
# without this step the `tokenized_inputs` computed above would never reach the model.
dataloader = DataLoader(HateSpeechDataset(tokenized_inputs, labels), batch_size=batch_size, shuffle=False)

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Pick the compute device (GPU when available) and make sure the model is on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

all_preds3, all_labels3 = [], []

# Inference only: gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for raw_batch in tqdm(dataloader):
        # Every tensor in the batch has to sit on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit in each row.
        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds3.extend(batch_preds)
        all_labels3.extend(batch['labels'].cpu().numpy())

# Score the collected predictions against the labels served by the loader.
accuracy = accuracy_score(all_labels3, all_preds3)
f1 = f1_score(all_labels3, all_preds3)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


100%|██████████| 80/80 [00:25<00:00,  3.19it/s]

Accuracy: 0.8629
F1 Score: 0.8806





In [None]:
# Store the averaged-weights model's predictions; they follow DataLoader order (shuffle=False), i.e. df row order.
df['RHS Model (Averaged Weights)'] = all_preds3

# Evaluation of Vidgen Model on Test Dataset

In [None]:
# Load the Vidgen model used as the comparison baseline. Unlike the RHS models
# above, the model and its tokenizer are loaded from the same identifier.
model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
import torch

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the Vidgen model to the device and put it in evaluation mode.
model = model.to(device)
model.eval()

# Encode the sentences with the Vidgen model's own tokenizer.
tokenized_inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Rebuild the DataLoader around the fresh encodings. The existing `dataloader`
# still wraps encodings produced by an RHS tokenizer, so without this step the
# Vidgen model would be evaluated on inputs its own tokenizer never produced.
dataloader = DataLoader(HateSpeechDataset(tokenized_inputs, labels), batch_size=batch_size, shuffle=False)

In [None]:
Vidgen_preds, Vidgen_labels = [], []

# Inference only: gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for raw_batch in tqdm(dataloader):
        # Every tensor in the batch has to sit on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        outputs = model(**batch)
        # The predicted class is the index of the largest logit in each row.
        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        Vidgen_preds.extend(batch_preds)
        Vidgen_labels.extend(batch['labels'].cpu().numpy())

# Score the collected predictions against the labels served by the loader.
accuracy = accuracy_score(Vidgen_labels, Vidgen_preds)
f1 = f1_score(Vidgen_labels, Vidgen_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


100%|██████████| 80/80 [00:25<00:00,  3.11it/s]

Accuracy: 0.7271
F1 Score: 0.8036





In [None]:
# Store the Vidgen model's predictions; they follow DataLoader order (shuffle=False), i.e. df row order.
df['Vidgen Model Prediction'] = Vidgen_preds

In [None]:
from google.colab import files

# Write the test set together with the four prediction columns to CSV; index=False omits the row index.
df.to_csv('4_TESTSET_Comparison_Vidgen_RHS_Models.csv', index = False)


In [None]:
# Hand the exported CSV to the browser for download; `files` comes from google.colab, so this needs the Colab runtime.
files.download('4_TESTSET_Comparison_Vidgen_RHS_Models.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Display the final comparison table: the test set plus one prediction column per model.
df

Unnamed: 0,sentences,hate_label,gold_label,dataset,bitch,drag,fag,gay,homo,other,...,queen,queer,sissy,slay,slut,whore,RHS Model (No Cross_Validation),RHS Model (Best Fold),RHS Model (Averaged Weights),Vidgen Model Prediction
0,BbBB...b.b..b.bb but OP's a faggot,1,DEG,kurrek,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,1
1,"it's a fucking crab, nobody cares faggot",1,DEG,kurrek,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,1
2,"&gt;Tasteless ""politically incorrect"" dark, of...",1,DEG,kurrek,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,Mass tagger seems pretty useful for knowing wh...,1,DEG,kurrek,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,1
4,Wild! Faggot was my nickname in high school! W...,0,NDG,kurrek,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2534,the devils been sending his workers to get at ...,1,DEG,kaggle,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
2535,the jews gon be laughin at a nigga on Judgemen...,1,DEG,kaggle,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
2536,"this fat bitch was preaching like ""we have too...",1,DEG,kaggle,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
2537,those random kids who sit at the plaza all day...,1,DEG,kaggle,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,1
