In [1]:
import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


# HateXplain Dataset

### Content
- **Textual Data**: The dataset contains social media posts from Twitter and Gab.
- **Labels**: Each post is labeled as "normal," "offensive," or "hate," indicating the severity of the language.

### Annotations
- **Label Annotations**: Posts are annotated by multiple human annotators to ensure consistent labeling.
- **Rationales**: Annotators provide explanations highlighting specific parts of the text that influenced their labeling decision.
- **Target Communities**: Annotations include information on which communities or groups are targeted by the hate speech.

In [2]:
# Load the HateXplain dataset through `datasets.load_dataset`.
dataset = load_dataset("hatexplain")
# Bare expression so the notebook displays the splits and their features.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 15383
    })
    validation: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['id', 'annotators', 'rationales', 'post_tokens'],
        num_rows: 1924
    })
})

# Preprocessing
- I'll classify the offensiveness of each sentence in an ordinal fashion
- and also the targets of the sentence

In [3]:
# Pull the training split once and reuse it for every derived list.
train_split = dataset["train"]

# Each post is stored as a list of tokens; rebuild it as one space-separated string.
X_train = [" ".join(tokens) for tokens in train_split["post_tokens"]]

# The "annotators" column carries, per post, the labels and targets given by its annotators.
train_annotators = train_split["annotators"]
y_train_lbl = [annotation["label"] for annotation in train_annotators]
y_train_targets = [annotation["target"] for annotation in train_annotators]

In [4]:
# Index window of the training posts to inspect.
id1 = 140
id2 = 145

# Show each sentence followed by its raw per-annotator labels and targets.
for sample in zip(X_train[id1:id2], y_train_lbl[id1:id2], y_train_targets[id1:id2]):
    print(*sample, sep="\n", end="\n\n")

lesson of the week from the holy book the sovereign individual once commerce flows over the internet suppression of speech becomes unstoppable devolution inevitable and the monopoly of violence by governments will end paraphrased from j d davidson rhys mogg
[1, 1, 1]
[['None'], ['None'], ['None']]

watch bbc on muslims abandoning eu <number> syria the father honestly says it bcs they do not like girls being told they have freewill by supposedly educated pple bcs women don t in islam sharia sharialaw realitycheck fgm misogyny hijab rapeculture abuse violence woman women girl girls womansrights womensrights brexit
[1, 1, 1]
[['Islam', 'Women'], ['Arab', 'Islam', 'Women'], ['Arab', 'Islam']]

<user> whatchu are <user> is a self hating homosexual who has to pay for your homosexual desires
[2, 2, 2]
[['Homosexual'], ['Homosexual'], ['Homosexual']]

the parade needed at least one severed head in honor of muslim culture
[0, 2, 0]
[['Islam'], ['Islam'], ['Islam']]

i look at the mudslime news 

I will average and normalize the label values. Each sentence has three annotations, and each annotation takes one of three values:
- 0 - hatespeech
- 1 - normal
- 2 - offensive

I'll first reorder them, so that

- 2 = hate
- 1 = offensive
- 0 = normal

Then, since each sentence has three annotations, I'll average them and normalize the result to the range [0, 1].

(e.g., "they playing a lot of ethnic music at this white ass wedding" has labels [2, 1, 1] -> [1, 0, 0] -> mean 0.333 -> divided by the maximum label 2 -> 0.167)

In [5]:
# Re-order the raw label ids into an ordinal severity scale:
#   raw 0 (hatespeech) -> 2, raw 1 (normal) -> 0, raw 2 (offensive) -> 1
lbl_map = {
    0: 2,
    1: 0,
    2: 1,
}

def avg3(ls):
    """Return the mean ordinal severity of one post, normalized to [0, 1].

    Each raw label is first re-ordered through ``lbl_map``; the mapped values
    are then averaged and divided by the largest severity in ``lbl_map``.
    For three labels this is exactly ``(a + b + c) / 6``.

    Args:
        ls: Sequence of raw label ids (keys of ``lbl_map``), one per annotator.
            Any non-zero number of annotators is accepted.

    Returns:
        float in [0, 1]; 0.0 when every annotator said "normal", 1.0 when
        every annotator said "hatespeech".

    Raises:
        ValueError: If ``ls`` is empty.
        KeyError: If a label is not a key of ``lbl_map``.
    """
    if len(ls) == 0:
        raise ValueError("avg3() requires at least one annotator label")
    # Derive the normalizer from the data instead of hard-coding 3 * 2 = 6.
    max_severity = max(lbl_map.values())
    return sum(lbl_map[label] for label in ls) / (len(ls) * max_severity)

# Replace each post's raw annotator labels with its normalized severity score.
# NOTE: this rebinds y_train_lbl to a list of floats, so the cell cannot be
# re-run as-is; rebuild y_train_lbl from the dataset before running it again.
y_train_lbl = list(map(avg3, y_train_lbl))

A similar procedure is applied to the targets: if at least two of the three annotators mention the same target, I keep it.

In [6]:
# Done by ChatGPT 4o

# Prompt:
# Suppose you have three lists of classes,
# if the same class is in at least 2 classes
# you add it to a new list to return. give me the code

from collections import Counter

def find_common_classes(list1, list2, list3):
    """Return the classes that appear in at least two of the three lists.

    A class repeated inside a single list counts only once for that list, so
    it cannot reach the threshold without a second list also containing it.

    Args:
        list1, list2, list3: Iterables of hashable class labels.

    Returns:
        List of the classes present in two or more of the inputs, ordered by
        first appearance when reading list1, then list2, then list3.
    """
    counter = Counter()
    for annotator_classes in (list1, list2, list3):
        # dict.fromkeys de-duplicates while preserving first-seen order; the
        # list() wrapper makes Counter.update treat it as items to count
        # rather than as a mapping of pre-computed counts.
        counter.update(list(dict.fromkeys(annotator_classes)))

    # Each list contributes at most 1 per class, so count == number of lists.
    return [item for item, count in counter.items() if count >= 2]

In [7]:
# Collapse each post's three per-annotator target lists into the targets
# returned by find_common_classes for that post.
y_train_targets = [
    find_common_classes(per_annotator[0], per_annotator[1], per_annotator[2])
    for per_annotator in y_train_targets
]

Sample of the transformed dataset

In [8]:
# Same index window as the raw preview, now showing the processed values.
id1, id2 = 140, 145

# One block per post: sentence, aggregated label score, aggregated targets.
for sen, lbl, targets in zip(X_train[id1:id2], y_train_lbl[id1:id2], y_train_targets[id1:id2]):
    print(sen, lbl, targets, sep="\n", end="\n\n")

lesson of the week from the holy book the sovereign individual once commerce flows over the internet suppression of speech becomes unstoppable devolution inevitable and the monopoly of violence by governments will end paraphrased from j d davidson rhys mogg
0.0
['None']

watch bbc on muslims abandoning eu <number> syria the father honestly says it bcs they do not like girls being told they have freewill by supposedly educated pple bcs women don t in islam sharia sharialaw realitycheck fgm misogyny hijab rapeculture abuse violence woman women girl girls womansrights womensrights brexit
0.0
['Islam', 'Women', 'Arab']

<user> whatchu are <user> is a self hating homosexual who has to pay for your homosexual desires
0.5
['Homosexual']

the parade needed at least one severed head in honor of muslim culture
0.8333333333333334
['Islam']

i look at the mudslime news from europe and think i wish some goat fucker would try that shit down here in louisiana just get within <number> feet of my daugh

In [9]:
# Collect every distinct target label present in y_train_targets.
classes = {target for post_targets in y_train_targets for target in post_targets}

print(classes)
print(len(classes))

{'Caucasian', 'Islam', 'Jewish', 'Men', 'Hindu', 'Christian', 'Indigenous', 'Asian', 'Economic', 'Homosexual', 'Indian', 'Hispanic', 'Women', 'African', 'Arab', 'Other', 'Buddhism', 'Refugee', 'Disability', 'None'}
20


# Tokenizer


In [10]:
# Pretrained DistilBERT tokenizer and encoder, both from the same checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize the whole training set in a single call and get PyTorch tensors back.
texts = X_train
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

# Keep the two tensors under their own names.
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']



In [11]:
# Peek at the first two training sentences.
print(X_train[:2])

['u really think i would not have been raped by feral hindu or muslim back in india or bangladesh and a neo nazi would rape me as well just to see me cry', 'the uk has threatened to return radioactive waste to the eu if an agreement cannot be reached can not we keep the radioactive waste and send back all the paki migrants instead']


In [12]:
# Inspect the token ids produced for the second training sentence.
input_ids[1]

tensor([  101,  1996,  2866,  2038,  5561,  2000,  2709, 17669,  5949,  2000,
         1996,  7327,  2065,  2019,  3820,  3685,  2022,  2584,  2064,  2025,
         2057,  2562,  1996, 17669,  5949,  1998,  4604,  2067,  2035,  1996,
        22190,  2072, 16836,  2612,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [13]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
import gc

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, at access time.

    Args:
        texts: Indexable collection of raw strings.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the text at ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) drops the leading dimension when it has size 1, which is
        # what a single-text call is expected to produce.
        input_ids, attention_mask = (
            encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')
        )
        return input_ids, attention_mask

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Move model to GPU (stays on the CPU when CUDA is unavailable)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example data
# NOTE(review): these are placeholder sentences, not the HateXplain posts held
# in X_train, so the embeddings below describe the placeholders — confirm
# whether `texts` should be X_train here.
texts = ["This is a sample sentence.", "Each text is one item in the dataset.", "Handle larger datasets with DataLoader."] * 1000  # Example large dataset

# Create dataset and dataloader
# NOTE(review): rebinding `dataset` replaces the object returned by
# load_dataset("hatexplain") earlier in the notebook; re-running the
# preprocessing cell after this one will fail until the data is reloaded.
dataset = TextDataset(texts, tokenizer)
batch_size = 256
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers = 8)

# Store embeddings (one CPU tensor per batch, concatenated at the end)
all_embeddings = []

# Process data in batches; eval() + no_grad() make this inference only
model.eval()
with torch.no_grad():
    # Batch-specific names: reusing `input_ids` / `attention_mask` here would
    # overwrite, and the `del` below would then remove, the notebook-level
    # tensors created by the earlier tokenization cell.
    for batch_input_ids, batch_attention_mask in data_loader:
        # Move input tensors to the same device as the model
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)

        # Forward pass
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)

        # Get the embeddings for [CLS] token (first token), moved back to the CPU
        embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu()
        all_embeddings.append(embeddings)

        # Free up memory before the next batch
        del batch_input_ids, batch_attention_mask, outputs, embeddings
        torch.cuda.empty_cache()
        gc.collect()

# Concatenate all embeddings along the batch dimension
all_embeddings = torch.cat(all_embeddings, dim=0)



torch.Size([3000, 768])