In [1]:
# Install necessary packages
!pip install transformers
!pip install datasets
!pip install scikit-learn
!pip install kaggle

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [2]:
# upload personal kaggle.json file, get dataset from kaggle
# dataset link: https://www.kaggle.com/datasets/usharengaraju/dynamically-generated-hate-speech-dataset
from google.colab import files
files.upload()  # Upload kaggle.json file

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d usharengaraju/dynamically-generated-hate-speech-dataset
!unzip dynamically-generated-hate-speech-dataset.zip -d hate_speech_data

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/usharengaraju/dynamically-generated-hate-speech-dataset
License(s): other
Archive:  dynamically-generated-hate-speech-dataset.zip
  inflating: hate_speech_data/2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv  
  inflating: hate_speech_data/2020-12-31-DynamicallyGeneratedHateDataset-targets-v0.1.csv  


In [3]:
# Data processing to generate intersectional labels
# credit: Andy Liang

import pandas as pd
from sentence_transformers import SentenceTransformer

df1 = pd.read_csv('hate_speech_data/2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv')
df2 = pd.read_csv('hate_speech_data/2020-12-31-DynamicallyGeneratedHateDataset-targets-v0.1.csv')

df1['id'] = df1['id'].astype(str).str.strip()
df2['id'] = df2['id'].astype(str).str.strip()

target_columns = df2.columns.drop(['id'])
target_counts = df2[target_columns].sum(axis=1)
is_multiple_targets = target_counts > 1
is_other_flagged = df2['other'] == 1
is_gaywom = df2['gaywom'] == 1
is_blawom = df2['blawom'] == 1
is_asiwom = df2['asiwom'] == 1
is_muswom = df2['muswom'] == 1
is_indigwom = df2['indigwom'] == 1
is_mixed = df2['mixed'] == 1
df2['is_intersectional'] = (is_multiple_targets | is_other_flagged | is_gaywom | is_blawom | is_asiwom | is_muswom | is_indigwom | is_mixed).astype(int)

intersectional_count = (df2['is_intersectional'] == 1).sum()
print(f"Number of intersectional rows: {intersectional_count}")

merged_df = pd.merge(df1, df2, on='id', how='inner')
merged_df = merged_df.dropna(subset=['text', 'is_intersectional'])
merged_df.to_csv('processed_data.csv', index=False)

texts = merged_df['text'].astype(str)
labels = merged_df['is_intersectional'].astype(int)

  df2 = pd.read_csv('hate_speech_data/2020-12-31-DynamicallyGeneratedHateDataset-targets-v0.1.csv')


Number of intersectional rows: 1993


In [5]:
# Read processed data into a pandas dataframe
import pandas as pd

df = pd.read_csv("processed_data.csv")
print(df.head())
print(df['is_intersectional'].value_counts())

   Unnamed: 0     id                                               text label  \
0           1  10000  It should be legal for a husband to rape his wife  hate   
1           2  10001                            dalits are lowlife scum  hate   
2           3  10002                                dalits are lowlives  hate   
3           4  10003  It was a better world when women dare not ques...  hate   
4           5  10004  I don't work this hard so that those immigrant...  hate   

       type model_wrong  db.model_preds           status round  split  ...  \
0  notgiven        True         0.97457  dynabench entry     1  train  ...   
1  notgiven       False         0.08233  dynabench entry     1   test  ...   
2  notgiven        True         0.92319  dynabench entry     1  train  ...   
3  notgiven        True         0.99006  dynabench entry     1   test  ...   
4  notgiven        True         0.98836  dynabench entry     1  train  ...   

  eastern european  working  african  russia

In [6]:
# rename and encode labels, split dataframe into train/test
from sklearn.model_selection import train_test_split

df = df.rename(columns={'is_intersectional': 'labels', 'text': 'text'})

label_map = {label: idx for idx, label in enumerate(df['labels'].unique())}
df['labels'] = df['labels'].map(label_map)

train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].tolist(), df['labels'].tolist(), test_size=0.2, random_state=42)

In [8]:
#tokenize
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
#convert to dataset
import torch

class HateSpeechDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)


In [22]:
# load pretrained BERT model and set up training args
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    warmup_steps=0,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# train BERT classifier!
import os
os.environ["WANDB_DISABLED"] = "true" # disable WANDB
os.environ["WANDB_MODE"] = "disabled"

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0941,0.147921


TrainOutput(global_step=4063, training_loss=0.17073573838115297, metrics={'train_runtime': 473.8249, 'train_samples_per_second': 68.587, 'train_steps_per_second': 8.575, 'total_flos': 2137645769272320.0, 'train_loss': 0.17073573838115297, 'epoch': 1.0})

In [24]:
# evaluate results using trainer evaluation function
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.1479213684797287, 'eval_runtime': 27.7704, 'eval_samples_per_second': 292.578, 'eval_steps_per_second': 4.573, 'epoch': 1.0}


In [25]:
# get more eval metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

model.eval()

preds = []
labels = []

for batch in torch.utils.data.DataLoader(val_dataset, batch_size=32):
    batch = {k: v.to(model.device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    preds.extend(predictions.cpu().numpy())
    labels.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(labels, preds)
precision = precision_score(labels, preds, average='weighted', zero_division=0)
recall = recall_score(labels, preds, average='weighted', zero_division=0)
f1 = f1_score(labels, preds, average='weighted', zero_division=0)

print(f"Validation Accuracy:  {accuracy:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall:    {recall:.4f}")
print(f"Validation F1 Score:  {f1:.4f}")

Validation Accuracy:  0.9536
Validation Precision: 0.9399
Validation Recall:    0.9536
Validation F1 Score:  0.9418


**Code below this is to evaluate single samples for sanity testing and proof-of-concept examples**

In [29]:
import torch

# Example input list
texts = [
    "I hate Black women",             # Intersectional
    "I hate Indigenous women",        # Intersectional
    "I hate Jewish women",            # Intersectional
    "I hate Asian women",             # Intersectional
    "I hate Muslim women",            # Intersectional
    "Black women are terrible",       # Intersectional
    "Indigenous women are terrible",  # Intersectional
    "Jewish women are terrible",      # Intersectional
    "Asian women are terrible",       # Intersectional
    "Muslim women are terrible",      # Intersectional
    "I hate white women",             # NOT Intersectional
    "I hate Black men",               # NOT Intersectional
    "I hate white men",               # NOT Intersectional
    "I hate Indigenous men",          # NOT Intersectional
    "I hate Jewish men",              # NOT Intersectional
    "I hate Asian men",               # NOT Intersectional
    "I hate Muslim men",              # NOT Intersectional
    "White women are terrible",       # NOT Intersectional
    "Black men are terrible",         # NOT Intersectional
    "White men are terrible",         # NOT Intersectional
    "Indigenous men are terrible",    # NOT Intersectional
    "Jewish men are terrible",        # NOT Intersectional
    "Asian men are terrible",         # NOT Intersectional
    "Muslim men are terrible",        # NOT Intersectional

]

encodings = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=64)
encodings = {k: v.to(model.device) for k, v in encodings.items()}

model.eval()
with torch.no_grad():
    outputs = model(**encodings)
    logits = outputs.logits
    predicted_class_ids = torch.argmax(logits, dim=-1).cpu().numpy()

inv_label_map = {v: k for k, v in label_map.items()}
predicted_labels = [inv_label_map[i] for i in predicted_class_ids]

for text, label in zip(texts, predicted_labels):
    print(f"'{text}': Predicted label: {label}")

'I hate Black women': Predicted label: 1
'I hate Indigenous women': Predicted label: 1
'I hate Jewish women': Predicted label: 0
'I hate Asian women': Predicted label: 0
'I hate Muslim women': Predicted label: 0
'Black women are terrible': Predicted label: 1
'Indigenous women are terrible': Predicted label: 1
'Jewish women are terrible': Predicted label: 0
'Asian women are terrible': Predicted label: 0
'Muslim women are terrible': Predicted label: 0
'I hate white women': Predicted label: 0
'I hate Black men': Predicted label: 0
'I hate white men': Predicted label: 0
'I hate Indigenous men': Predicted label: 0
'I hate Jewish men': Predicted label: 0
'I hate Asian men': Predicted label: 0
'I hate Muslim men': Predicted label: 0
'White women are terrible': Predicted label: 0
'Black men are terrible': Predicted label: 0
'White men are terrible': Predicted label: 0
'Indigenous men are terrible': Predicted label: 0
'Jewish men are terrible': Predicted label: 0
'Asian men are terrible': Predi