# Load libraries and data

In [1]:
from pathlib import Path
import re
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import wandb

In [2]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdaniele-didino[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
train_data = pd.read_csv(Path("..", "data", "processed", "train.csv"))
val_data = pd.read_csv(Path("..", "data", "processed", "val.csv"))

# Parameters & wandb

In [4]:
MIN_FREQ = 1 # 20
MAX_LEN = 20
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using {DEVICE}")

Using cuda


# Tokenizer

In [5]:
# Prepare Tokenizer and util functions
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Remove special characters
    return text


def build_vocab(texts: list[str], min_freq: int=1) -> dict:
    token_counts = Counter()
    for text in texts:
        cleaned_text = clean_text(text)
        token_counts.update(cleaned_text.split())
    vocab = {word: idx + 2 for idx, (word, count) in enumerate(token_counts.items()) if count >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab


def tokenizer(text: str, vocab: dict, max_len: int) -> dict:
    cleaned_text = clean_text(text)
    tokens = [vocab.get(word, 1) for word in cleaned_text.split()[:max_len]]
    input_ids = tokens + [0] * (max_len - len(tokens))

    # Check if token exceeds the len of the voceb
    for token in input_ids:
        if token >= len(vocab):
            print(f"Warning: Token index {token} out of range!")
    
    return {'input_ids': torch.tensor(input_ids)}

In [6]:
print(train_data.comment_text[10])

Your recent edits, something to read, and a point of view 

Hi, please take the time to read Wikipedia:Guidance for younger editors when you have a moment. Please also be aware that it not only applies to things you post on Wikipedia, but also to things you ask others on Wikipedia.

Secondly, there is no minimum age to edit Wikipedia, and it certainly doesn't just happen to coincide conveniently with however old you happen to be today. Some 15 year olds are administrators, some people have been administrators and bureaucrats while aged 12, some 16 year olds and 64 year olds are banned from Wikipedia by the community. Actions, not numbers, are an indication of maturity.  (talk)


In [7]:
print(clean_text(train_data.comment_text[10]))

your recent edits something to read and a point of view 

hi please take the time to read wikipediaguidance for younger editors when you have a moment please also be aware that it not only applies to things you post on wikipedia but also to things you ask others on wikipedia

secondly there is no minimum age to edit wikipedia and it certainly doesnt just happen to coincide conveniently with however old you happen to be today some 15 year olds are administrators some people have been administrators and bureaucrats while aged 12 some 16 year olds and 64 year olds are banned from wikipedia by the community actions not numbers are an indication of maturity  talk


In [8]:
vocab_tmp = build_vocab(train_data.comment_text.to_list(), MIN_FREQ)

In [9]:
c = 0
for k,v in vocab_tmp.items():
    print(f"{k} - {v}")
    c += 1
    if c > 10:
        break

and - 2
that - 3
would - 4
verify - 5
john - 6
was - 7
a - 8
pratt - 9
grad - 10
w - 11
babs - 12


In [10]:
train_data.comment_text[0]

'.  And that would verify that John was a Pratt grad w/ BA/BS in Graphic Art'

In [11]:
tokenizer(train_data.comment_text[0], vocab_tmp, max_len=50)

{'input_ids': tensor([ 2,  3,  4,  5,  3,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}

In [12]:
tokenizer("and that would verify that john was a pratt grad w babs in graphic art'", vocab_tmp, max_len=50)

{'input_ids': tensor([ 2,  3,  4,  5,  3,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])}

# Dummy approach

In [13]:
def evaluate_dummy(df: pd.DataFrame) -> dict:
    dummy_pred = pd.DataFrame(
        0,
        index=df.index,
        columns=df.loc[:,  ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].columns
    )

    df_labels = df.loc[:,  ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values.flatten()
    dummy_pred = dummy_pred.values.flatten()

    accuracy = accuracy_score(df_labels, dummy_pred)

    # AUC-ROC (for multi-label, compute per class and take average)
    auc_roc = roc_auc_score(df_labels, dummy_pred, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")

    return {
        'accuracy': accuracy,
        'auc_roc': auc_roc
    }

In [14]:
train_metrics_dummy = evaluate_dummy(train_data)

Accuracy: 0.9636
AUC-ROC: 0.5000


In [15]:
val_metrics_dummy = evaluate_dummy(val_data)

Accuracy: 0.9626
AUC-ROC: 0.5000


While **accuracy** is very high (~0.96),
an **AUC-ROC** around 0.5 indicates that this approach is equivalent to random guessing."

# Train Models

In [16]:
# Dataset Class
class ToxicCommentsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = torch.tensor(self.labels[index], dtype=torch.float32)
        encoded = self.tokenizer(text)
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'labels': label
        }


# Dense Model
class DenseModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_units, num_layers, dropout, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        layers = []
        input_size = embed_dim
        for _ in range(num_layers):
            layers.append(nn.Linear(input_size, hidden_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_size = hidden_units
        layers.append(nn.Linear(hidden_units, num_classes))
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        embedded = self.embedding(x).mean(dim=1)
        return self.fc(embedded)


# GRU Model
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, gru_units, dense_units, num_layers, dropout, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, gru_units, batch_first=True)
        layers = []
        input_size = gru_units
        for _ in range(num_layers):
            layers.append(nn.Linear(input_size, dense_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_size = dense_units
        layers.append(nn.Linear(dense_units, num_classes))
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        x = self.embedding(x).mean(dim=1) # (batch, seq_len, embedding_dim)
        x, _ = self.gru(x) # (batch, seq_len, hidden_units) if num_layers > 1
        if len(x.shape) == 2:
             # Ensure it is (batch, seq_len, hidden_units)
            x = x.unsqueeze(1)
        x = x[:, -1, :]  # Take the last time step
        return self.fc(x)


# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, dropout, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x).permute(1, 0, 2)
        x = self.transformer(x)
        x = x.mean(dim=0)  # Global average pooling
        return self.fc(x)


# Initialize the model
def build_model(config, vocab_size, num_classes):
    model_type = config.model_type
    if model_type == "Dense":
        return DenseModel(
            vocab_size=vocab_size,
            embed_dim=config.embed_dim,
            hidden_units=config.hidden_units,
            num_layers=config.num_layers,
            dropout=config.dropout,
            num_classes=num_classes)
    elif model_type == "GRU":
        return GRUModel(
            vocab_size=vocab_size,
            embed_dim=config.embed_dim,
            gru_units=config.gru_units,
            dense_units=config.dense_units,
            num_layers=config.num_layers,
            dropout=config.dropout,
            num_classes=num_classes)
    elif model_type == "Transformer":
        return TransformerModel(
            vocab_size=vocab_size,
            embed_dim=config.embed_dim,
            num_heads=config.num_heads,
            num_layers=config.num_layers,
            dropout=config.dropout,
            num_classes=num_classes)
    else:
        raise ValueError(f"Unknown model type: {model_type}")


# Compute Loss and Metrics
def model_eval(model, dataloader, criterion, device, threshold=0.5):
    model.eval()  # set model to evaluation mode
    total_loss = 0
    all_labels = []
    all_preds = []
    all_probs = []

    with torch.no_grad():  # No gradients during evaluation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Apply sigmoid to convert logits to probabilities
            probs = torch.sigmoid(outputs)

            # Save predictions
            all_labels.append(labels.cpu())
            all_probs.append(probs.cpu())
            all_preds.append((probs >= threshold).int().cpu())

    # Concatenate results
    all_labels = torch.cat(all_labels).numpy()
    all_probs = torch.cat(all_probs).numpy()
    all_preds = torch.cat(all_preds).numpy()

    # Calculate average loss
    avg_loss = total_loss / len(dataloader)
    
    # AUC-ROC (for multi-label, compute per class and take average)
    auc_roc = roc_auc_score(all_labels, all_probs, average='macro')

    return avg_loss, auc_roc


# Training function
def model_train(model, train_loader, val_loader, criterion, optimizer, epochs, device):
    model.to(device)

    for epoch in range(epochs):
        model.train()  # set model to training mode
        total_train_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # Evaluate on validation set
        train_loss = total_train_loss / len(train_loader)
        _, train_auc_roc = model_eval(model, train_loader, criterion, device)
        val_loss, val_auc_roc = model_eval(model, val_loader, criterion, device)
        
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | AUC_ROC: {train_auc_roc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | AUC_ROC: {val_auc_roc:.4f}")

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_auc_roc": train_auc_roc,
            "val_loss": val_loss,
            "val_auc_roc": val_auc_roc,
        })

    return val_auc_roc


In [17]:
train_input = train_data.comment_text.to_list()
train_labels = train_data.loc[:, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values.tolist()

val_input = val_data.comment_text.to_list()
val_labels = val_data.loc[:,  ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values.tolist()

vocab = build_vocab(train_input, MIN_FREQ)

# Prepare train dataset
train_dataset = ToxicCommentsDataset(train_input, train_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)

# Prepare validation dataset
val_dataset = ToxicCommentsDataset(val_input, val_labels, lambda text: tokenizer(text, vocab, MAX_LEN), MAX_LEN)

In [18]:
train_input[:5]

['.  And that would verify that John was a Pratt grad w/ BA/BS in Graphic Art',
 'hi how are you  are you Mr bill \n\naoa \n       hi i am waseem 4rm pakistan n whats a maining of the The International Awareness\nPromotion Department Of\nE.A.A.S Lottery Headquarters\nEuro-Afro Asia Sweepstake lottery he says congratulations you have won US$250,000.00 ( (Two hundred and Fifty Thousand United States Dollars) in Cheque. pl z i have no idea tell me by this number 00923236916674 00923147007006  pless  pless pless  i shell b thank full to you',
 'Abi 17:45, 9 February 2014',
 'We can agree on one thing: the numbers do speak for themselves. The fact that they seem to be saying something else to you than to me, and some sources, is irrelevant.',
 'I noticed that on the media page there are only FM radio stations.  Can someone add some AM stations?']

In [19]:
train_labels[:5]

[[0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

# W&B Sweeps

In [20]:
sweep_config = {
    "method": "bayes", # "random" or "grid" or "bayes"
    "metric": {"name": "val_auc_roc", "goal": "maximize"},
    "parameters": {
        "model_type": {"values": ["Dense", "GRU", "Transformer"]},

        # Shared parameters
        "embed_dim": {"values": [50, 100, 200]},
        "dropout": {"min": 0.2, "max": 0.5},
        "learning_rate": {"min": 1e-4, "max": 1e-2, "distribution": "log_uniform_values"},
        "batch_size": {"values": [16, 32, 64]},
        "num_layers": {"min": 1, "max": 3},
        "epochs": {"min": 1, "max": 5},
        
        # Dense        
        "hidden_units": {"values": [64, 128, 256]},
        
        # GRU
        "gru_units": {"min": 64, "max": 512},
        "dense_units": {"min": 32, "max": 256},
        
        # Transformer
        "num_heads": {"values": [2, 4, 8]},
        "num_layers": {"values": [2, 4, 6]}
    }
}

# Create the sweep
sweep_id = wandb.sweep(sweep_config, project="toxic_comment_clf")

# Define the training function
def train_sweep():

    num_classes = 6  # toxic, severe_toxic, obscene, threat, insult, identity_hate

    with wandb.init() as run:
        config = wandb.config # sample hyperparameters
        
        # Initialize DataLoaders
        train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
        val_dataloader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True)
    
        # Initialize model
        model = build_model(config, vocab_size=len(vocab), num_classes=num_classes)
        model.to(DEVICE)
    
        # Loss
        criterion = nn.BCEWithLogitsLoss()
    
        # Optimizer
        optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    
        # Training loop
        _ = model_train(model, train_dataloader, val_dataloader, criterion, optimizer, config["epochs"], DEVICE)

Create sweep with ID: rimnpuii
Sweep URL: https://wandb.ai/daniele-didino/toxic_comment_clf/sweeps/rimnpuii


In [21]:
# Launch the sweep
wandb.agent(sweep_id, function=train_sweep, count=20)

[34m[1mwandb[0m: Agent Starting Run: pu1t2df5 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dense_units: 238
[34m[1mwandb[0m: 	dropout: 0.4645723405914543
[34m[1mwandb[0m: 	embed_dim: 200
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	gru_units: 104
[34m[1mwandb[0m: 	hidden_units: 128
[34m[1mwandb[0m: 	learning_rate: 0.004106593833741538
[34m[1mwandb[0m: 	model_type: Transformer
[34m[1mwandb[0m: 	num_heads: 4
[34m[1mwandb[0m: 	num_layers: 2


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
