# Load libraries and data

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.model_selection import train_test_split
import re
from pathlib import Path
from collections import Counter
import wandb
import random

In [27]:
# Authenticate with Weights & Biases; later cells call wandb.init / wandb.log.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdaniele-didino[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Read the training and validation splits from ../data/processed.
train_data = pd.read_csv(Path("..") / "data" / "processed" / "train.csv")
val_data = pd.read_csv(Path("..") / "data" / "processed" / "val.csv")

# Parameters & wandb

In [28]:
# Notebook-wide settings. The random-search cell samples its own embedding
# size, learning rate, batch size and epoch count from random_hyperparams().
MIN_FREQ = 1 # min_freq passed to build_vocab (author's alternative value: 20)
MAX_LEN = 20 # max_len passed to tokenizer: tokens kept per comment
EMBED_DIM = 50 # logged to W&B as "embed_dim" for the baseline run
NUM_CLASSES = 6 # toxic, severe_toxic, obscene, threat, insult, identity_hate
BATCH_SIZE = 32 # batch size of the evaluation DataLoaders
EPOCHS = 1 # logged to W&B as "epochs" for the baseline run
LEARNING_RATE = 0.001 # logged to W&B as "learning_rate" for the baseline run
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' # prefer the GPU when one is visible

print(f"Using {DEVICE}")

Using cuda


In [56]:
def random_hyperparams():
    """Draw one random configuration for the hyperparameter search.

    Returns:
        dict with the keys "embedding_dim", "hidden_units", "num_layers",
        "dropout", "learning_rate" (log-uniform between 1e-4 and 1e-2),
        "batch_size", and the fixed entries "num_classes" (6) and "epochs" (5).
    """
    # Values are drawn in key order, so seeding `random` beforehand makes the
    # sampled configuration reproducible.
    params = {}
    params["embedding_dim"] = random.choice([50, 100, 150])
    params["hidden_units"] = random.choice([64, 128, 256])
    params["num_layers"] = random.randint(1, 3)
    params["dropout"] = random.uniform(0.2, 0.5)
    log10_lr = random.uniform(-4, -2)
    params["learning_rate"] = 10 ** log10_lr
    params["batch_size"] = random.choice([16, 32, 64])
    params["num_classes"] = 6  # toxic, severe_toxic, obscene, threat, insult, identity_hate
    params["epochs"] = 5  # random.randint(2, 5)
    return params

In [29]:
# Start a W&B run that records the baseline constants from the parameters cell.
run = wandb.init(
    # Set the project where this run will be logged
    project="toxic_comment_clf",
    # Track hyperparameters
    # NOTE(review): this config uses the key "embed_dim" while
    # random_hyperparams() uses "embedding_dim" — confirm whether the two runs
    # are meant to share a key name.
    config={
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "embed_dim": EMBED_DIM
    },
)

# Tokenizer

In [4]:
# Prepare Tokenizer and util functions
def clean_text(text: str) -> str:
    """Lower-case `text` and delete every character that is not an ASCII
    letter, a digit or whitespace.

    Characters are removed, not replaced by a space, so words joined only by
    punctuation merge into one token (e.g. "BA/BS" becomes "babs").
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)


def build_vocab(texts: list[str], min_freq: int=1) -> dict:
    """Build a word -> id vocabulary from raw texts.

    Id 0 is reserved for '<PAD>' and id 1 for '<UNK>'. Kept words receive the
    contiguous ids 2, 3, ... in order of first appearance, so every id lies in
    ``range(len(vocab))`` and ``len(vocab)`` is a valid embedding-table size.

    Args:
        texts: Raw documents; each is normalised with `clean_text` and split
            on whitespace.
        min_freq: Minimum number of occurrences (over all texts) for a word to
            be kept.

    Returns:
        dict mapping each kept word, plus '<PAD>' and '<UNK>', to its id.
    """
    token_counts = Counter()
    for text in texts:
        token_counts.update(clean_text(text).split())

    # Filter first, then number. Numbering before filtering leaves gaps
    # whenever min_freq > 1, so the largest id can exceed len(vocab) - 1 and
    # index past the end of an embedding table sized with len(vocab).
    kept_words = [word for word, count in token_counts.items() if count >= min_freq]
    vocab = {word: idx for idx, word in enumerate(kept_words, start=2)}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab


def tokenizer(text: str, vocab: dict, max_len: int) -> dict:
    """Encode `text` as a fixed-length tensor of vocabulary ids.

    The text is cleaned with `clean_text`, split on whitespace and truncated to
    `max_len` tokens. Each word is looked up in `vocab` (words not present map
    to 1, the '<UNK>' id) and the sequence is right-padded with 0 (the '<PAD>'
    id) up to `max_len`.

    Args:
        text: Raw input string.
        vocab: Word -> id mapping, as produced by `build_vocab`.
        max_len: Length of the returned id sequence.

    Returns:
        ``{'input_ids': tensor}`` where the tensor is 1-D, int64, of length
        `max_len`.
    """
    pad_id, unk_id = 0, 1

    words = clean_text(text).split()[:max_len]
    input_ids = [vocab.get(word, unk_id) for word in words]
    input_ids += [pad_id] * (max_len - len(input_ids))

    # An id >= len(vocab) would index past the end of an embedding table sized
    # with len(vocab). Keep the warning, but fall back to <UNK> instead of
    # passing the invalid id on to the model.
    vocab_size = len(vocab)
    for position, token in enumerate(input_ids):
        if token >= vocab_size:
            print(f"Warning: Token index {token} out of range!")
            input_ids[position] = unk_id

    # Explicit dtype so an empty sequence (max_len == 0) is still an integer tensor.
    return {'input_ids': torch.tensor(input_ids, dtype=torch.long)}

In [5]:
# Raw text of training comment 10 (print keeps the line breaks readable).
print(train_data.comment_text[10])

Your recent edits, something to read, and a point of view 

Hi, please take the time to read Wikipedia:Guidance for younger editors when you have a moment. Please also be aware that it not only applies to things you post on Wikipedia, but also to things you ask others on Wikipedia.

Secondly, there is no minimum age to edit Wikipedia, and it certainly doesn't just happen to coincide conveniently with however old you happen to be today. Some 15 year olds are administrators, some people have been administrators and bureaucrats while aged 12, some 16 year olds and 64 year olds are banned from Wikipedia by the community. Actions, not numbers, are an indication of maturity.  (talk)


In [6]:
# The same comment after clean_text: lower-cased, punctuation removed.
print(clean_text(train_data.comment_text[10]))

your recent edits something to read and a point of view 

hi please take the time to read wikipediaguidance for younger editors when you have a moment please also be aware that it not only applies to things you post on wikipedia but also to things you ask others on wikipedia

secondly there is no minimum age to edit wikipedia and it certainly doesnt just happen to coincide conveniently with however old you happen to be today some 15 year olds are administrators some people have been administrators and bureaucrats while aged 12 some 16 year olds and 64 year olds are banned from wikipedia by the community actions not numbers are an indication of maturity  talk


In [7]:
# Throwaway vocabulary used only by the sanity-check cells that follow.
vocab_tmp = build_vocab(train_data.comment_text.to_list(), MIN_FREQ)

In [8]:
# Print the first 11 (token - id) pairs of the vocabulary, in insertion order.
c = 0
for k,v in vocab_tmp.items():
    print(f"{k} - {v}")
    c += 1
    # The check runs after the print, so the 11th entry is shown before stopping.
    if c > 10:
        break

and - 2
that - 3
would - 4
verify - 5
john - 6
was - 7
a - 8
pratt - 9
grad - 10
w - 11
babs - 12


In [9]:
# Raw text of the first training comment, used as the tokenizer example below.
train_data.comment_text[0]

'.  And that would verify that John was a Pratt grad w/ BA/BS in Graphic Art'

In [10]:
# Encode the raw comment, padded with 0 up to 50 ids.
tokenizer(train_data.comment_text[0], vocab_tmp, max_len=50)

{'input_ids': tensor([ 2,  3,  4,  5,  3,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}

In [11]:
# Encoding an already-cleaned version of the same comment should give the same ids.
tokenizer("and that would verify that john was a pratt grad w babs in graphic art'", vocab_tmp, max_len=50)

{'input_ids': tensor([ 2,  3,  4,  5,  3,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}

# Dummy approach

In [None]:
def evaluate_dummy(df: pd.DataFrame) -> dict:
    """Score an all-zeros baseline (every label predicted as 0) against `df`.

    The six label columns are flattened into one long vector of 0/1 entries
    before scoring, so every metric — including AUC-ROC — is computed over the
    pooled (comment, label) pairs rather than separately per class.

    Args:
        df: Frame containing the columns "toxic", "severe_toxic", "obscene",
            "threat", "insult" and "identity_hate".

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'auc_roc'. The same values are also printed with four decimals.
    """
    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    y_true = df[label_columns].to_numpy().ravel()
    # Constant prediction: a zero for every (comment, label) pair.
    y_pred = pd.DataFrame(0, index=df.index, columns=label_columns).to_numpy().ravel()

    macro = {"average": 'macro', "zero_division": 0}
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    f1 = f1_score(y_true, y_pred, **macro)

    # AUC-ROC of the constant prediction over the flattened labels.
    auc_roc = roc_auc_score(y_true, y_pred, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1, auc_roc=auc_roc)

In [13]:
# All-zeros baseline on the training split.
train_metrics_dummy = evaluate_dummy(train_data)

Accuracy: 0.9636
Precision: 0.4818
Recall: 0.5000
F1 Score: 0.4907
AUC-ROC: 0.5000


In [14]:
# All-zeros baseline on the validation split.
val_metrics_dummy = evaluate_dummy(val_data)

Accuracy: 0.9626
Precision: 0.4813
Recall: 0.5000
F1 Score: 0.4905
AUC-ROC: 0.5000


While **accuracy** is very high (~0.96) because almost all labels are 0,
an **AUC-ROC** of 0.5 shows that this all-zeros baseline separates toxic from non-toxic comments no better than random guessing.

# Model

In [None]:
# Dataset Class
class ToxicCommentsDataset(Dataset):
    """Map-style dataset pairing comment texts with their label rows.

    Args:
        texts: Indexable collection of comment strings.
        labels: Indexable collection of label rows, one per text; each row is
            converted to a float32 tensor on access.
        tokenizer: Callable taking a single text and returning a dict that
            contains an 'input_ids' tensor.
        max_len: Stored on the instance; this class does not read it itself.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``{'input_ids': ..., 'labels': ...}`` for sample `index`."""
        raw_text = self.texts[index]
        target = torch.tensor(self.labels[index], dtype=torch.float32)
        input_ids = self.tokenizer(raw_text)['input_ids']
        # squeeze(0) drops a leading dimension of size 1 if the tokenizer
        # returned one; other shapes pass through unchanged.
        return {'input_ids': input_ids.squeeze(0), 'labels': target}


# Model
class ToxicityClassifier(nn.Module):
    """Bag-of-embeddings multi-label classifier.

    Token embeddings are averaged over the sequence dimension and passed
    through `num_layers` blocks of Linear -> ReLU -> Dropout, followed by a
    final Linear layer producing one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_units: Width of every hidden Linear layer.
        num_layers: Number of hidden blocks; 0 gives a linear head directly on
            the averaged embedding.
        dropout: Dropout probability used after every hidden ReLU.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_units, num_layers, dropout, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        layers = []
        input_size = embed_dim
        for _ in range(num_layers):
            layers.append(nn.Linear(input_size, hidden_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_size = hidden_units
        # Size the head from the running `input_size`, not `hidden_units`:
        # with num_layers == 0 it is fed the embed_dim-wide mean directly.
        layers.append(nn.Linear(input_size, num_classes))
        self.fc = nn.Sequential(*layers)

    def forward(self, input_ids):
        """Map integer ids of shape (batch, seq_len) to logits of shape (batch, num_classes)."""
        # NOTE(review): the mean runs over all positions, so any padding ids
        # in `input_ids` contribute to the average — confirm this is intended.
        embedded = self.embedding(input_ids).mean(dim=1)
        return self.fc(embedded)


# Compute Loss and Metrics
def model_eval(model, dataloader, criterion, device, threshold=0.5):
    """Compute the mean batch loss and macro AUC-ROC of `model` over `dataloader`.

    Args:
        model: Module mapping a batch of `input_ids` to one logit per class.
        dataloader: Iterable of dict batches with 'input_ids' and 'labels'
            tensors.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to before the forward pass.
        threshold: Unused. Kept so existing calls that pass it keep working;
            neither returned value depends on a decision threshold.

    Returns:
        ``(avg_loss, auc_roc)``: the unweighted mean of the per-batch losses,
        and ``roc_auc_score(labels, probabilities, average='macro')``.

    Raises:
        ValueError: if `dataloader` yields no batches.
    """
    model.eval()  # disable dropout for evaluation
    total_loss = 0
    num_batches = 0
    all_labels = []
    all_probs = []

    with torch.no_grad():  # no gradients needed during evaluation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids)
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            # AUC-ROC is computed from probabilities, so only those are kept;
            # thresholded predictions were never used and are not computed.
            all_labels.append(labels.cpu())
            all_probs.append(torch.sigmoid(outputs).cpu())

    if num_batches == 0:
        raise ValueError("model_eval received an empty dataloader")

    all_labels = torch.cat(all_labels).numpy()
    all_probs = torch.cat(all_probs).numpy()

    # Mean of the per-batch losses (a smaller final batch weighs the same as a full one).
    avg_loss = total_loss / num_batches

    # Macro average of the AUC-ROC scores.
    auc_roc = roc_auc_score(all_labels, all_probs, average='macro')

    return avg_loss, auc_roc


# Training function
def model_train(model, train_loader, val_loader, criterion, optimizer, epochs, device):
    """Train `model` for `epochs` epochs, logging metrics to W&B after each one.

    After every epoch the model is evaluated with `model_eval` on both loaders;
    losses and AUC-ROC values are printed and sent to `wandb.log`.

    Args:
        model: Module to train; moved to `device` before training starts.
        train_loader: Iterable of training batches with 'input_ids' and 'labels'.
        val_loader: Iterable of validation batches with the same keys.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over `train_loader`; must be at least 1.
        device: Device used for the model and the batches.

    Returns:
        Validation AUC-ROC of the LAST epoch (not the best one seen).

    Raises:
        ValueError: if `epochs` is smaller than 1; without this check the
            function would fail at `return` with an unbound variable.
    """
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    model.to(device)

    for epoch in range(epochs):
        model.train()  # re-enable dropout; model_eval switches it off
        total_train_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # Training loss is the mean of the batch losses seen during the epoch
        # (train mode); the AUC-ROC values come from separate evaluation passes.
        train_loss = total_train_loss / len(train_loader)
        _, train_auc_roc = model_eval(model, train_loader, criterion, device)
        val_loss, val_auc_roc = model_eval(model, val_loader, criterion, device)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | AUC_ROC: {train_auc_roc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | AUC_ROC: {val_auc_roc:.4f}")

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_auc_roc": train_auc_roc,
            "val_loss": val_loss,
            "val_auc_roc": val_auc_roc,
        })

    return val_auc_roc


In [59]:
# Raw comment strings and their six-column label rows, as plain Python lists.
train_input = train_data["comment_text"].tolist()
train_labels = train_data[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].to_numpy().tolist()

val_input = val_data["comment_text"].tolist()
val_labels = val_data[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].to_numpy().tolist()

# The vocabulary is built from the training texts only.
vocab = build_vocab(train_input, min_freq=MIN_FREQ)

# Both datasets tokenize lazily with the same vocabulary and MAX_LEN.
train_dataset = ToxicCommentsDataset(
    texts=train_input,
    labels=train_labels,
    tokenizer=lambda text: tokenizer(text, vocab, MAX_LEN),
    max_len=MAX_LEN,
)

val_dataset = ToxicCommentsDataset(
    texts=val_input,
    labels=val_labels,
    tokenizer=lambda text: tokenizer(text, vocab, MAX_LEN),
    max_len=MAX_LEN,
)

In [63]:
# First five raw training comments.
train_input[:5]

['.  And that would verify that John was a Pratt grad w/ BA/BS in Graphic Art',
 'hi how are you  are you Mr bill \n\naoa \n       hi i am waseem 4rm pakistan n whats a maining of the The International Awareness\nPromotion Department Of\nE.A.A.S Lottery Headquarters\nEuro-Afro Asia Sweepstake lottery he says congratulations you have won US$250,000.00 ( (Two hundred and Fifty Thousand United States Dollars) in Cheque. pl z i have no idea tell me by this number 00923236916674 00923147007006  pless  pless pless  i shell b thank full to you',
 'Abi 17:45, 9 February 2014',
 'We can agree on one thing: the numbers do speak for themselves. The fact that they seem to be saying something else to you than to me, and some sources, is irrelevant.',
 'I noticed that on the media page there are only FM radio stations.  Can someone add some AM stations?']

In [64]:
# Label rows of the same five comments (one 0/1 entry per class).
train_labels[:5]

[[0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

Time required to train 1 epoch:
- CPU: ~ 30 minutes
- GPU: ~ 3 minutes

In [62]:
# Random search over hyperparameters: one W&B run per trial.
N_TRIALS = 1  # number of random configurations to try

best_score = 0
best_params = {}

for trial in range(N_TRIALS):
    wandb.init(project="toxic_comment_clf", reinit=True)
    try:
        # Sample hyperparameters randomly
        config = random_hyperparams()
        wandb.config.update(config)

        # Initialize DataLoaders. Only the training data is shuffled: the
        # validation loss is a mean of batch means, so a fixed batch order
        # keeps it reproducible.
        train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

        # Initialize model
        model = ToxicityClassifier(
            vocab_size=len(vocab),
            embed_dim=config["embedding_dim"],
            hidden_units=config["hidden_units"],
            num_layers=config["num_layers"],
            dropout=config["dropout"],
            num_classes=config["num_classes"])
        model.to(DEVICE)

        # Loss: sigmoid + binary cross-entropy per label, applied to raw logits
        criterion = nn.BCEWithLogitsLoss()

        # Optimizer
        optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])

        # Training loop; returns the validation AUC-ROC of the last epoch
        val_auc_roc = model_train(model, train_dataloader, val_dataloader, criterion, optimizer, config["epochs"], DEVICE)
    finally:
        # Close this trial's run even if training fails, so each trial is
        # recorded as its own finished run.
        wandb.finish()

    # Track the best configuration. Only the hyperparameters are kept: after
    # the loop, `model` is the model of the LAST trial, not of the best one.
    if val_auc_roc > best_score:
        best_score = val_auc_roc
        best_params = config

print(f"Best Model: {best_params} with AUC ROC: {best_score:.4f}")

Epoch 1/5
Train Loss: 0.0832 | AUC_ROC: 0.9765
Val Loss: 0.0682 | AUC_ROC: 0.9530
Epoch 2/5
Train Loss: 0.0560 | AUC_ROC: 0.9885
Val Loss: 0.0672 | AUC_ROC: 0.9541
Epoch 3/5
Train Loss: 0.0447 | AUC_ROC: 0.9936
Val Loss: 0.0754 | AUC_ROC: 0.9511
Epoch 4/5
Train Loss: 0.0366 | AUC_ROC: 0.9965
Val Loss: 0.0866 | AUC_ROC: 0.9448
Epoch 5/5
Train Loss: 0.0297 | AUC_ROC: 0.9978
Val Loss: 0.1037 | AUC_ROC: 0.9398


0,1
epoch,▁▃▅▆█
train_auc_roc,▁▅▇██
train_loss,█▄▃▂▁
val_auc_roc,▇█▇▃▁
val_loss,▁▁▃▅█

0,1
epoch,5.0
train_auc_roc,0.99779
train_loss,0.02968
val_auc_roc,0.93978
val_loss,0.10366


Best Model: {'embedding_dim': 100, 'hidden_units': 256, 'num_layers': 1, 'dropout': 0.3234595026691648, 'learning_rate': 0.005527264579039489, 'batch_size': 64, 'num_classes': 6, 'epochs': 5} with Accuracy: 0.9398


# Predict

In [26]:
def predict(model, text, vocab, tokenizer, max_len, device, threshold=0.5):
    """Classify a single text.

    Args:
        model: Module mapping a batch of token ids to logits.
        text: Raw input string.
        vocab: Vocabulary handed to `tokenizer`.
        tokenizer: Callable ``tokenizer(text, vocab, max_len)`` returning a
            dict with an 'input_ids' tensor.
        max_len: Sequence length handed to `tokenizer`.
        device: Device on which the forward pass runs.
        threshold: Probabilities greater than or equal to this value are
            predicted as 1.

    Returns:
        ``(probabilities, predictions)`` as NumPy arrays, each with a leading
        batch dimension of size 1; predictions are integer 0/1 values.
    """
    model.eval()  # inference mode: no dropout

    # Tokenize and add the batch dimension expected by the model.
    batch = tokenizer(text, vocab, max_len)['input_ids'].unsqueeze(0).to(device)

    with torch.no_grad():  # no gradients needed for inference
        probabilities = torch.sigmoid(model(batch))

    predictions = probabilities.ge(threshold).int()

    return probabilities.cpu().numpy(), predictions.cpu().numpy()

In [25]:
# Smoke test on a hand-written sentence; `model` is the one left by the search cell.
test_text = "You suck and I hate you"
probs, preds = predict(model, test_text, vocab, tokenizer, MAX_LEN, DEVICE)

# predict returns arrays with a batch dimension of 1; [0] selects the only row.
print(f"Probabilities: {probs[0]}")
print(f"Predictions: {preds[0]}")

Probabilities: [0.98331773 0.09205111 0.83193743 0.05810835 0.7263957  0.07695656]
Predictions: [1 0 1 0 1 0]


In [27]:
# Second smoke test; here the full (1, num_classes) arrays are printed.
probs, preds = predict(model, "fuck you", vocab, tokenizer, MAX_LEN, DEVICE)

print(f"Probabilities: {probs}")
print(f"Predictions: {preds}")

Probabilities: [[0.9722405  0.07568419 0.91668546 0.01601476 0.65795    0.04427878]]
Predictions: [[1 0 1 0 1 0]]


# Evaluate model

In [65]:
# Prepare train dataset
# NOTE(review): the inputs, labels and datasets repeat the earlier preparation
# cell; only the DataLoaders (fixed BATCH_SIZE) are new here.
train_input = train_data.comment_text.to_list()
train_labels = train_data.loc[:,  ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values.tolist()
train_dataset = ToxicCommentsDataset(train_input, train_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Prepare validation dataset
val_input = val_data.comment_text.to_list()
val_labels = val_data.loc[:,  ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values.tolist()
val_dataset = ToxicCommentsDataset(val_input, val_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [69]:
# Loss and AUC-ROC on the training split. `model` and `criterion` are the
# globals left behind by the random-search cell.
avg_loss, auc_roc = model_eval(model, train_dataloader, criterion, DEVICE, threshold=0.5)
print(f"Loss: {avg_loss:.4f}")
print(f"AUC-ROC: {auc_roc:.4f}")

Loss: 0.0200
AUC-ROC: 0.9978


In [71]:
# Loss and AUC-ROC on the validation split, with the same model and criterion.
avg_loss, auc_roc = model_eval(model, val_dataloader, criterion, DEVICE, threshold=0.5)
print(f"Loss: {avg_loss:.4f}")
print(f"AUC-ROC: {auc_roc:.4f}")

Loss: 0.1037
AUC-ROC: 0.9398


In [72]:
# Step through predict() by hand for training comment 27.
text = train_data.comment_text[27]
print(text)
encoded_input = tokenizer(text, vocab, MAX_LEN)
# Add the batch dimension and move the ids to the model's device.
input_ids = encoded_input['input_ids'].unsqueeze(0).to(DEVICE)

screw you
why dont you stick it up your fucking ass than lick it out, block it i dont give a shit you fucking bastard, suck my fucking BALLLLLSSSSSSS!!!!!!!!!!!!!!!


In [73]:
# Raw per-class scores. The call is not wrapped in torch.no_grad(), so the
# result still carries a grad_fn.
logits = model(input_ids)
print(logits)

tensor([[13.4365, -1.3871,  6.6582, -1.2496,  2.6401, -2.8238]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


In [101]:
# Sigmoid turns each logit into an independent per-label probability;
# squeeze(0) drops the batch dimension.
probabilities = torch.sigmoid(logits).squeeze(0)
print(probabilities)

tensor([0.9999, 0.4200, 0.9932, 0.0433, 0.9442, 0.1706],
       grad_fn=<SqueezeBackward1>)


In [102]:
# Apply the 0.5 decision threshold to get 0/1 predictions per label.
predictions = (probabilities >= 0.5).int()
print(predictions)

tensor([1, 0, 1, 0, 1, 0], dtype=torch.int32)


In [103]:
# Scratch check: torch.cat joins the two 1-D prediction tensors end to end.
torch.cat((predictions, predictions))

tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=torch.int32)