In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.model_selection import train_test_split
import re
from pathlib import Path
from collections import Counter

# Load data

In [3]:
# Read the pre-split training and validation sets (paths are relative to the working directory).
# Later cells read the `comment_text` column and the six label columns from these frames.
train_data = pd.read_csv(Path("data", "processed", "train.csv"))
val_data = pd.read_csv(Path("data", "processed", "val.csv"))

# Parameters

In [48]:
MIN_FREQ = 1 # min token count to enter the vocabulary (passed to build_vocab); author's alternative value: 20
MAX_LEN = 20 # tokens kept per comment; the tokenizer truncates/pads to this length
EMBED_DIM = 50 # embedding width passed to ToxicityClassifier
NUM_CLASSES = 6 # toxic, severe_toxic, obscene, threat, insult, identity_hate
BATCH_SIZE = 2 # DataLoader batch size used for both training and evaluation
EPOCHS = 1 # number of passes over the training data in train_model
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available

# Tokenizer

In [None]:
# Prepare Tokenizer and util functions
def clean_text(text: str) -> str:
    """Lower-case ``text`` and delete every character that is not an ASCII
    letter, a digit or whitespace.

    Characters are removed, not replaced by a space, so tokens joined only by
    punctuation merge into one (e.g. "BA/BS" becomes "babs").
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)


def build_vocab(texts: list[str], min_freq: int=1) -> dict:
    """Build a token -> index vocabulary from a corpus.

    Every text is normalised with ``clean_text`` and split on whitespace.
    Tokens occurring at least ``min_freq`` times get consecutive indices
    starting at 2, in order of first appearance. Index 0 is reserved for
    ``<PAD>`` and index 1 for ``<UNK>``.

    Args:
        texts: raw input strings.
        min_freq: minimum number of occurrences for a token to be kept.

    Returns:
        Dict mapping token -> index, where every index is in
        ``range(len(vocab))``.
    """
    token_counts = Counter()
    for text in texts:
        token_counts.update(clean_text(text).split())

    # Filter first, then number the survivors. Numbering before filtering
    # leaves gaps when min_freq > 1, producing indices >= len(vocab) that
    # overflow an embedding table sized with len(vocab).
    kept_words = [word for word, count in token_counts.items() if count >= min_freq]
    vocab = {word: idx for idx, word in enumerate(kept_words, start=2)}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab


def tokenizer(text: str, vocab: dict, max_len: int) -> dict:
    """Encode ``text`` as a fixed-length tensor of vocabulary indices.

    The text is cleaned with ``clean_text``, split on whitespace and cut to
    the first ``max_len`` words. Words missing from ``vocab`` map to 1
    (``<UNK>``); the sequence is right-padded with 0 (``<PAD>``).

    Returns:
        ``{'input_ids': tensor}`` holding ``max_len`` indices.
    """
    words = clean_text(text).split()[:max_len]
    input_ids = [vocab.get(word, 1) for word in words]
    input_ids.extend([0] * (max_len - len(input_ids)))

    # Sanity check: report any index that does not fit in the vocabulary.
    vocab_size = len(vocab)
    for token in input_ids:
        if token >= vocab_size:
            print(f"Warning: Token index {token} out of range!")

    return {'input_ids': torch.tensor(input_ids)}

In [31]:
# Inspect one raw training comment before any cleaning.
print(train_data.comment_text[10])

Your recent edits, something to read, and a point of view 

Hi, please take the time to read Wikipedia:Guidance for younger editors when you have a moment. Please also be aware that it not only applies to things you post on Wikipedia, but also to things you ask others on Wikipedia.

Secondly, there is no minimum age to edit Wikipedia, and it certainly doesn't just happen to coincide conveniently with however old you happen to be today. Some 15 year olds are administrators, some people have been administrators and bureaucrats while aged 12, some 16 year olds and 64 year olds are banned from Wikipedia by the community. Actions, not numbers, are an indication of maturity.  (talk)


In [32]:
# Same comment after clean_text: lower-cased, with punctuation deleted.
print(clean_text(train_data.comment_text[10]))

your recent edits something to read and a point of view 

hi please take the time to read wikipediaguidance for younger editors when you have a moment please also be aware that it not only applies to things you post on wikipedia but also to things you ask others on wikipedia

secondly there is no minimum age to edit wikipedia and it certainly doesnt just happen to coincide conveniently with however old you happen to be today some 15 year olds are administrators some people have been administrators and bureaucrats while aged 12 some 16 year olds and 64 year olds are banned from wikipedia by the community actions not numbers are an indication of maturity  talk


In [51]:
# Scratch vocabulary used only by the tokenizer checks in the next cells.
vocab_tmp = build_vocab(train_data.comment_text.to_list(), MIN_FREQ)

In [52]:
# Peek at the first 11 vocabulary entries (the loop stops once c exceeds 10).
c = 0
for k,v in vocab_tmp.items():
    print(f"{k} - {v}")
    c += 1
    if c > 10:
        break

and - 2
that - 3
would - 4
verify - 5
john - 6
was - 7
a - 8
pratt - 9
grad - 10
w - 11
babs - 12


In [10]:
# Raw text of the first training comment, for comparison with its token ids.
train_data.comment_text[0]

'.  And that would verify that John was a Pratt grad w/ BA/BS in Graphic Art'

In [53]:
# Encode the first comment; the result is right-padded with 0 up to max_len.
tokenizer(train_data.comment_text[0], vocab_tmp, max_len=50)

{'input_ids': tensor([ 2,  3,  4,  5,  3,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}

In [36]:
# Encode an already-cleaned variant of the same sentence; words missing from the vocabulary map to 1 (<UNK>).
tokenizer("and that would verify that john was a pratt grad w babs in graphic art'", vocab_tmp, max_len=50)

{'input_ids': tensor([ 2,  3,  4,  5,  3,  6,  7,  8,  1, 10, 11,  1, 13, 14, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}

# Model

In [None]:
# Custom Dataset Class
class ToxicCommentsDataset(Dataset):
    """Map-style dataset pairing raw comment texts with multi-label targets.

    Args:
        texts: indexable sequence of raw strings.
        labels: indexable sequence of per-sample label vectors, aligned with
            ``texts``.
        tokenizer: callable taking a single text and returning a dict with an
            ``'input_ids'`` tensor.
        max_len: kept for reference only; truncation and padding are done by
            ``tokenizer``.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``{'input_ids': tensor, 'labels': float32 tensor}`` for one sample."""
        label = torch.tensor(self.labels[index], dtype=torch.float32)
        input_ids = self.tokenizer(self.texts[index])['input_ids']
        # Drop a leading batch axis only when the tokenizer returned one
        # (e.g. shape (1, seq_len)). An unconditional squeeze(0) would turn a
        # 1-D sequence of length 1 into a 0-dim scalar.
        if input_ids.dim() > 1:
            input_ids = input_ids.squeeze(0)
        return {
            'input_ids': input_ids,
            'labels': label
        }


# Model
class ToxicityClassifier(nn.Module):
    """Bag-of-embeddings multi-label classifier.

    Token embeddings are averaged over the sequence axis and passed through a
    two-layer MLP that emits one raw logit per class (no sigmoid is applied).

    Args:
        vocab_size: number of rows in the embedding table; every token id must
            be smaller than this.
        embed_dim: embedding width.
        num_classes: number of output logits.
        padding_idx: optional id of the padding token. When given, padded
            positions are excluded from the average so the pooled vector does
            not depend on how much padding a sequence carries. The default
            ``None`` keeps the plain mean over all positions.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids):
        """Return logits for ``input_ids`` (expected layout: batch x sequence)."""
        embedded = self.embedding(input_ids)
        if self.padding_idx is None:
            pooled = embedded.mean(dim=1)
        else:
            # Masked mean: sum the non-padding embeddings and divide by their
            # count. The clamp keeps an all-padding row from dividing by zero.
            keep = (input_ids != self.padding_idx).unsqueeze(-1)
            token_count = keep.sum(dim=1).clamp(min=1)
            pooled = (embedded * keep).sum(dim=1) / token_count
        return self.fc(pooled)


# Training function
def train_model(model, dataloader, criterion, optimizer, epochs, device):
    """Run a plain supervised training loop.

    Puts ``model`` in training mode, then for each of ``epochs`` passes
    iterates over ``dataloader``. Each batch is a mapping with ``'input_ids'``
    and ``'labels'`` tensors, which are moved to ``device`` before the
    forward/backward/step sequence. After every epoch the mean of the
    per-batch losses is printed. Returns nothing.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        for batch in dataloader:
            features = batch['input_ids'].to(device)
            targets = batch['labels'].to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
        mean_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch_number}, Loss: {mean_loss}")

# Training

In [55]:
# Raw comment strings, plus one six-element label row per comment (column order matches NUM_CLASSES' comment).
train_input = train_data.comment_text.to_list()
train_labels = train_data.loc[:, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values.tolist()

In [15]:
# Preview the first five raw comments.
train_input[:5]

['.  And that would verify that John was a Pratt grad w/ BA/BS in Graphic Art',
 'hi how are you  are you Mr bill \n\naoa \n       hi i am waseem 4rm pakistan n whats a maining of the The International Awareness\nPromotion Department Of\nE.A.A.S Lottery Headquarters\nEuro-Afro Asia Sweepstake lottery he says congratulations you have won US$250,000.00 ( (Two hundred and Fifty Thousand United States Dollars) in Cheque. pl z i have no idea tell me by this number 00923236916674 00923147007006  pless  pless pless  i shell b thank full to you',
 'Abi 17:45, 9 February 2014',
 'We can agree on one thing: the numbers do speak for themselves. The fact that they seem to be saying something else to you than to me, and some sources, is irrelevant.',
 'I noticed that on the media page there are only FM radio stations.  Can someone add some AM stations?']

In [16]:
# Preview the matching label rows.
train_labels[:5]

[[0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

In [56]:
# Vocabulary built from the training comments only; it is reused for validation and prediction.
vocab = build_vocab(train_input, MIN_FREQ)

In [63]:
# Prepare dataset
# The lambda binds `vocab` and MAX_LEN so the dataset can call the tokenizer with the text alone.
dataset = ToxicCommentsDataset(train_input, train_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [64]:
# The embedding table gets one row per vocabulary entry, so every token id must be < len(vocab).
model = ToxicityClassifier(vocab_size=len(vocab), embed_dim=EMBED_DIM, num_classes=NUM_CLASSES)
model.to(DEVICE)

# Loss: the model returns raw logits, and this criterion applies the sigmoid internally
criterion = nn.BCEWithLogitsLoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

Training one epoch takes ~30 minutes on the CPU (with BATCH_SIZE = 2).

In [65]:
# Train for EPOCHS passes; train_model prints the mean batch loss after each epoch.
train_model(model, dataloader, criterion, optimizer, EPOCHS, DEVICE)

Epoch 1, Loss: 0.08283394763837117


In [66]:
def predict(model, text, vocab, tokenizer, max_len, device, threshold=0.5):
    """Score a single text with ``model``.

    ``tokenizer(text, vocab, max_len)`` must return a dict whose
    ``'input_ids'`` tensor encodes the text; a batch axis is added before the
    forward pass. The model is switched to evaluation mode and gradients are
    disabled.

    Returns:
        Tuple ``(probabilities, predictions)`` of NumPy arrays: the sigmoid of
        the logits, and an integer 0/1 array marking probabilities that are
        ``>= threshold``.
    """
    model.eval()

    # Encode the text and add the batch dimension the model expects.
    batch_ids = tokenizer(text, vocab, max_len)['input_ids'].unsqueeze(0).to(device)

    with torch.no_grad():
        probabilities = torch.sigmoid(model(batch_ids))

    # Binarise with the decision threshold.
    predictions = (probabilities >= threshold).int()

    return probabilities.cpu().numpy(), predictions.cpu().numpy()

# Example usage:
# The module-level `tokenizer` function is passed in; predict calls it as tokenizer(text, vocab, max_len).
test_text = "This is a rude and toxic comment"
probs, preds = predict(model, test_text, vocab, tokenizer, MAX_LEN, DEVICE)

print(f"Probabilities: {probs}")
print(f"Predictions: {preds}")


Probabilities: [[0.4004047  0.00357935 0.13037662 0.00194104 0.11506969 0.01509424]]
Predictions: [[0 0 0 0 0 0]]


In [68]:
# Second spot check, this time with an overtly abusive input.
probs, preds = predict(model, "fuck you", vocab, tokenizer, MAX_LEN, DEVICE)

print(f"Probabilities: {probs}")
print(f"Predictions: {preds}")

Probabilities: [[0.9971667  0.07377116 0.96073985 0.00332703 0.7870886  0.03911187]]
Predictions: [[1 0 1 0 1 0]]


In [74]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, dataloader, criterion, device, threshold=0.5):
    """Evaluate ``model`` on every batch of ``dataloader`` and report metrics.

    The model is put in evaluation mode and run without gradients. Sigmoid
    probabilities are thresholded at ``threshold`` to obtain 0/1 predictions.
    The loss is the mean of the per-batch losses; precision, recall, F1 and
    ROC-AUC are macro-averaged over classes. Accuracy comes from
    ``accuracy_score`` on the full label matrix (for multilabel input
    scikit-learn documents this as exact-match accuracy).

    Prints each metric with four decimals and returns them in a dict with
    keys ``loss``, ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc_roc``.
    """
    model.eval()
    running_loss = 0
    label_chunks, prob_chunks, pred_chunks = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            features = batch['input_ids'].to(device)
            targets = batch['labels'].to(device)

            # Forward pass and loss on raw logits
            logits = model(features)
            running_loss += criterion(logits, targets).item()

            # Logits -> probabilities -> thresholded predictions
            batch_probs = torch.sigmoid(logits)

            label_chunks.append(targets.cpu())
            prob_chunks.append(batch_probs.cpu())
            pred_chunks.append((batch_probs >= threshold).int().cpu())

    # Stack the per-batch results into one array per quantity
    y_true = torch.cat(label_chunks).numpy()
    y_prob = torch.cat(prob_chunks).numpy()
    y_pred = torch.cat(pred_chunks).numpy()

    mean_loss = running_loss / len(dataloader)

    # Threshold-based metrics, macro-averaged over the label columns
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # Threshold-free metric computed from the probabilities
    auc = roc_auc_score(y_true, y_prob, average='macro')

    print(f"Loss: {mean_loss:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1_macro:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

    return {
        'loss': mean_loss,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1_macro,
        'auc_roc': auc
    }


In [76]:
# Loaders used only for evaluation. Batch order does not affect the aggregated
# metrics, so shuffling is turned off to keep evaluation runs deterministic.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Prepare training dataset
train_input = train_data.comment_text.to_list()
train_labels = train_data.loc[:, LABEL_COLUMNS].values.tolist()
train_dataset = ToxicCommentsDataset(train_input, train_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Prepare validation dataset (encoded with the vocabulary built from the training set)
val_input = val_data.comment_text.to_list()
val_labels = val_data.loc[:, LABEL_COLUMNS].values.tolist()
val_dataset = ToxicCommentsDataset(val_input, val_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [77]:
# Metrics on the training set, for comparison with the validation metrics.
train_metrics = evaluate_model(model, train_dataloader, criterion, DEVICE)

Loss: 0.0651
Accuracy: 0.9138
Precision: 0.8207
Recall: 0.3179
F1 Score: 0.4006
AUC-ROC: 0.9563


In [75]:
# Metrics on the held-out validation set.
val_metrics = evaluate_model(model, val_dataloader, criterion, DEVICE)

Loss: 0.0718
Accuracy: 0.9096
Precision: 0.6427
Recall: 0.3000
F1 Score: 0.3787
AUC-ROC: 0.9471


In [99]:
# Manual inference walk-through on one training comment: encode it, then add a batch dimension.
text = train_data.comment_text[27]
print(text)
encoded_input = tokenizer(text, vocab, MAX_LEN)
input_ids = encoded_input['input_ids'].unsqueeze(0).to(DEVICE)

screw you
why dont you stick it up your fucking ass than lick it out, block it i dont give a shit you fucking bastard, suck my fucking BALLLLLSSSSSSS!!!!!!!!!!!!!!!


In [100]:
# Inference only: switch to eval mode and skip autograd bookkeeping so the
# forward pass does not build a gradient graph.
model.eval()
with torch.no_grad():
    logits = model(input_ids)
print(logits)

tensor([[ 9.8165, -0.3227,  4.9869, -3.0962,  2.8289, -1.5816]],
       grad_fn=<AddmmBackward0>)


In [101]:
# Logits -> per-class probabilities; squeeze(0) drops the batch dimension added above.
probabilities = torch.sigmoid(logits).squeeze(0)
print(probabilities)

tensor([0.9999, 0.4200, 0.9932, 0.0433, 0.9442, 0.1706],
       grad_fn=<SqueezeBackward1>)


In [102]:
# Binarise at 0.5, the same default threshold used by predict and evaluate_model.
predictions = (probabilities >= 0.5).int()
print(predictions)

tensor([1, 0, 1, 0, 1, 0], dtype=torch.int32)


In [103]:
# Scratch check: torch.cat joins the two 1-D prediction tensors end to end.
torch.cat((predictions, predictions))

tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=torch.int32)

# Completed up to here (FIN QUI)

# Model with masked output

In [104]:
# Custom Loss with Masking
class HierarchicalBCELoss(nn.Module):
    """BCE-with-logits loss that gates the sub-category terms on the toxic label.

    Column 0 of ``outputs``/``labels`` is the top-level "toxic" flag and the
    remaining columns are its sub-categories. The toxic column always
    contributes its mean BCE. Sub-category terms are multiplied by 0 for rows
    whose toxic label is 0 and then averaged over all sub-category elements,
    masked ones included.
    """

    def forward(self, outputs, labels):
        bce = nn.functional.binary_cross_entropy_with_logits

        # Top-level term: every sample counts.
        toxic_term = bce(outputs[:, 0], labels[:, 0])

        # Sub-category terms count only for toxic rows (toxic label > 0);
        # non-toxic rows are zeroed before averaging.
        is_toxic = labels[:, 0] > 0
        per_element = bce(outputs[:, 1:], labels[:, 1:], reduction='none')
        gated_term = (per_element * is_toxic.unsqueeze(1)).mean()

        return toxic_term + gated_term

In [105]:
# Prepare dataset
# Same dataset/loader construction as for the first model, rebuilt here for the masked-loss run.
dataset = ToxicCommentsDataset(train_input, train_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [106]:
# Fresh classifier trained with the masked (hierarchical) loss.
model_mo = ToxicityClassifier(vocab_size=len(vocab), embed_dim=EMBED_DIM, num_classes=NUM_CLASSES)
model_mo.to(DEVICE)

# Loss (custom)
criterion_mo = HierarchicalBCELoss()

# Optimizer: it must be built from model_mo's parameters. Passing the first
# model's parameters would leave model_mo's weights untouched during training.
optimizer_mo = optim.Adam(model_mo.parameters(), lr=0.001)

train_model(model_mo, dataloader, criterion_mo, optimizer_mo, EPOCHS, DEVICE)

KeyboardInterrupt: 

In [None]:
# Post-processing predictions
def postprocess_predictions(outputs, threshold=0.5):
    """Turn logits into probabilities and zero sub-categories of non-toxic rows.

    Args:
        outputs: 2-D tensor of raw logits; column 0 is the top-level "toxic"
            score and the remaining columns are its sub-categories.
        threshold: probability at or above which a row counts as toxic.
            Defaults to 0.5, matching ``predict`` and ``evaluate_model``.

    Returns:
        A new tensor with the same shape as ``outputs`` holding sigmoid
        probabilities, with columns 1+ set to 0 wherever the toxic
        probability is below ``threshold``. ``outputs`` is not modified.
    """
    probabilities = torch.sigmoid(outputs)
    toxic_prob = probabilities[:, :1]
    toxic_pred = toxic_prob >= threshold

    # Gate the sub-categories out of place, so no tensor is mutated and the
    # function stays usable if gradients are being tracked.
    gated_sub = probabilities[:, 1:] * toxic_pred
    return torch.cat([toxic_prob, gated_sub], dim=1)