# Baseline model (RNN)

For the baseline model, we use an RNN model. 

We do not apply any data augmentation for this experiment.

## Import

In [None]:
# Install packages
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter

## Preparations in Colab

In [None]:
# Install package for kaggle API
!pip install -q kaggle

In [None]:
# Upload file
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"hansgntermayer","key":"70056ed66e5e2cb6698849d2d10a8db3"}'}

In [None]:
# Create directory
!mkdir -p ~/.kaggle
# Upload Kaggle credentials
!cp kaggle.json ~/.kaggle/
# Set permissions for file
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download kaggle dataset
!kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification

Downloading jigsaw-multilingual-toxic-comment-classification.zip to /content
100% 1.08G/1.08G [00:16<00:00, 76.1MB/s]
100% 1.08G/1.08G [00:16<00:00, 69.6MB/s]


In [None]:
# Unzip dataset
!unzip jigsaw-multilingual-toxic-comment-classification.zip

Archive:  jigsaw-multilingual-toxic-comment-classification.zip
  inflating: jigsaw-toxic-comment-train-processed-seqlen128.csv  
  inflating: jigsaw-toxic-comment-train.csv  
  inflating: jigsaw-unintended-bias-train-processed-seqlen128.csv  
  inflating: jigsaw-unintended-bias-train.csv  
  inflating: sample_submission.csv   
  inflating: test-processed-seqlen128.csv  
  inflating: test.csv                
  inflating: test_labels.csv         
  inflating: validation-processed-seqlen128.csv  
  inflating: validation.csv          


## Data Preprocessing

Due to long computation times, we only used 10,000 samples from the training set for this experiment.

In [None]:
# Import first 10,000 rows of training set with relevant columns
df = pd.read_csv("jigsaw-toxic-comment-train.csv", usecols=["comment_text", "toxic"])[:10000]
# Extract comment texts as Numpy array
texts = df["comment_text"].values
# Extract labels as Numpy array
labels = df["toxic"].values

# Create train / test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

len(df)

10000

# Vocabulary and Tokenize

In [None]:
# Function to create vocabulary
def build_vocab(texts, max_vocab_size=10000):
    # Tokenize and count words
    word_counts = Counter()
    for text in texts:
        tokens = text.lower().split()
        word_counts.update(tokens)

    # Keep most frequent words
    vocab = {word: idx+2 for idx, (word, _) in enumerate(word_counts.most_common(max_vocab_size))}
    vocab["<PAD>"] = 0  # Padding token
    vocab["<UNK>"] = 1  # Unknown token
    return vocab

# Create vocabulary for training data
vocab = build_vocab(train_texts)

# Tokenizing Function
def text_to_indices(text, vocab, max_length=100):
    tokens = text.lower().split()
    indices = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    # Truncate or pad sequences
    if len(indices) > max_length:
        return indices[:max_length]
    else:
        return indices + [vocab["<PAD>"]] * (max_length - len(indices))

# Tokenize training and test data
max_length = 100
train_sequences = [text_to_indices(text, vocab, max_length) for text in train_texts]
test_sequences = [text_to_indices(text, vocab, max_length) for text in test_texts]

# Convert to PyTorch tensors
train_data = torch.tensor(train_sequences, dtype=torch.long)
test_data = torch.tensor(test_sequences, dtype=torch.long)
train_labels = torch.tensor(train_labels, dtype=torch.float)
test_labels = torch.tensor(test_labels, dtype=torch.float)

## Dataset

In [None]:
# Define Dataset class for handling the datasets
class ToxicCommentDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

batch_size = 64

# Built datasets
train_dataset = ToxicCommentDataset(train_data, train_labels)
test_dataset = ToxicCommentDataset(test_data, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Baseline Model

In [None]:
# Define the RNN model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True, num_layers=n_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # x shape: [batch_size, seq_length]
        embedded = self.embedding(x)  # [batch_size, seq_length, embedding_dim]
        output, hidden = self.rnn(embedded)
        # Take the last hidden state
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 1

# Create model
model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

## Training

In [None]:
# Import package for ROC AUC score
from sklearn.metrics import roc_auc_score

# Use GPU, if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Put model on GPU, if available
model = model.to(device)

# Define loss function
criterion = nn.BCEWithLogitsLoss()
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Define number of epochs
num_epochs = 5

# Perform training
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        sequences, labels = batch
        sequences, labels = sequences.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(sequences).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 0.2921
Epoch 2, Loss: 0.2337
Epoch 3, Loss: 0.5011
Epoch 4, Loss: 0.2174
Epoch 5, Loss: 0.2462


## Evaluate

In [None]:
# Put model in evaluation mode
model.eval()
# Create lists for probabilities and labels
all_probs = []
all_labels = []

# Evaluate on test set
with torch.no_grad():
    for batch in test_loader:
        sequences, labels = batch
        sequences, labels = sequences.to(device), labels.to(device)
        predictions = model(sequences).squeeze(1)
        probs = torch.sigmoid(predictions)  # Convert logits to probabilities
        all_probs.append(probs.cpu())
        all_labels.append(labels.cpu())

# Calculate ROC-AUC
all_probs = torch.cat(all_probs).numpy()
all_labels = torch.cat(all_labels).numpy()
auc = roc_auc_score(all_labels, all_probs)

print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}, Test AUC: {auc:.4f}")

Epoch 5, Loss: 0.2462, Test AUC: 0.5275


The ROC AUC score is bad. A value of 50% equals random guessing. The result of 52.75% is only slightly better. 
Note that we used a partition of the training data as test data for this experiment. That means, it is much more similar to the training data we used, than the actual test data of the Kaggle competition, in which the comments are in different languages.