# Load libraries and data

In [None]:
from pathlib import Path
import re
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score

In [2]:
# All three pre-processed splits live in the same directory.
processed_dir = Path("..", "data", "processed")
train_data = pd.read_csv(processed_dir / "train.csv")
val_data = pd.read_csv(processed_dir / "val.csv")
test_data = pd.read_csv(processed_dir / "test.csv")

# Functions & Classes

In [3]:
def load_glove_embeddings(filepath, vocab, embedding_dim=100):
    """Build an embedding matrix for ``vocab`` from a GloVe-style text file.

    Each non-empty line of the file is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated floats. Rows for vocabulary words
    that never appear in the file keep a random initialisation drawn
    uniformly from [-0.25, 0.25).

    Args:
        filepath: Path to the embeddings text file (read as UTF-8).
        vocab: Mapping from token to its row index in the matrix.
        embedding_dim: Number of components per vector.

    Returns:
        A ``torch.float32`` tensor of shape ``(len(vocab), embedding_dim)``.

    Raises:
        ValueError: If the vector of a word contained in ``vocab`` does not
            have exactly ``embedding_dim`` components.
    """
    embeddings = np.random.uniform(-0.25, 0.25, (len(vocab), embedding_dim))
    with open(filepath, 'r', encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            word = values[0]
            if word not in vocab:
                # Skip before parsing the floats; most file entries are
                # typically not part of the task vocabulary.
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"{filepath}, line {line_number}: expected {embedding_dim} "
                    f"vector components for {word!r}, found {len(values) - 1}"
                )
            embeddings[vocab[word]] = np.asarray(values[1:], dtype="float32")
    return torch.tensor(embeddings, dtype=torch.float32)


def tokenizer(text, vocab, max_len=150):
    """Encode a single text as a ``(1, max_len)`` tensor of vocabulary ids.

    The text is lower-cased, every character other than ASCII letters,
    digits and whitespace is removed, and the result is split on whitespace.
    Tokens beyond ``max_len`` are dropped; shorter sequences are right-padded
    with the ``<PAD>`` id. Words missing from ``vocab`` map to ``<UNK>``.

    Args:
        text: Raw input string.
        vocab: Mapping from token to integer id; must contain ``"<UNK>"``
            and ``"<PAD>"``.
        max_len: Fixed length of the encoded sequence.

    Returns:
        A ``torch.long`` tensor of shape ``(1, max_len)``.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    tokens = text.split()
    encoded = [vocab.get(word, vocab["<UNK>"]) for word in tokens[:max_len]]
    # Pad in pure Python and fix the dtype explicitly: padding an empty list
    # with NumPy yields a float array, which is not a valid index tensor.
    encoded_padded = encoded + [vocab["<PAD>"]] * (max_len - len(encoded))
    return torch.tensor(encoded_padded, dtype=torch.long).unsqueeze(0)


class ToxicClassifier(nn.Module):
    """Text classifier: embedding -> Conv1d -> adaptive max-pool -> BiGRU -> linear.

    Args:
        embedding_matrix: Float tensor used to initialise the embedding
            layer; the layer stays trainable (``freeze=False``).
        embedding_dim: Channel count fed to the convolution.
        hidden_dim: Hidden size of each GRU direction.
        num_filters: Number of convolution output channels.
        kernel_size: Convolution kernel width.
        dropout: Dropout probability applied before the output layer.
        num_classes: Number of output logits.
    """

    def __init__(self, embedding_matrix, embedding_dim, hidden_dim, num_filters, kernel_size, dropout, num_classes):
        super().__init__()
        # Token lookup initialised from the supplied matrix.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        # Convolution along the token axis, followed by pooling to 50 steps.
        self.conv = nn.Conv1d(embedding_dim, num_filters, kernel_size, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(50)
        # Recurrent encoder over the pooled feature sequence.
        self.gru = nn.GRU(num_filters, hidden_dim, batch_first=True, bidirectional=True)
        # Classification head; the GRU output concatenates both directions.
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2 * hidden_dim, num_classes)

    def forward(self, x):
        """Return raw logits of shape ``(batch_size, num_classes)`` for token ids ``x``."""
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        # Conv1d expects channels before the sequence axis.
        features = torch.relu(self.conv(embedded.transpose(1, 2)))
        pooled = self.pool(features)  # (batch_size, num_filters, 50)
        # Back to (batch_size, steps, channels) for the batch-first GRU.
        gru_out, _ = self.gru(pooled.transpose(1, 2))
        # Only the final time step is used.
        # NOTE(review): for the backward direction this step has processed a
        # single element only -- confirm that this is intended.
        last_step = gru_out[:, -1, :]
        return self.fc(self.dropout(last_step))


# Load model

In [None]:
# Load config (context manager so the file handle is closed right away)
with open("../model/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Load vocab
with open("../model/vocab.json", encoding="utf-8") as vocab_file:
    vocab = json.load(vocab_file)

# Load embedding matrix
embedding_matrix = load_glove_embeddings("../embedding/glove.6B.100d.txt", vocab, config["embedding_dim"])

# Recreate the model with saved hyperparameters
model = ToxicClassifier(
    embedding_matrix,
    embedding_dim=config["embedding_dim"],
    hidden_dim=config["hidden_dim"],
    num_filters=config["num_filters"],
    kernel_size=config["kernel_size"],
    dropout=config["dropout"],
    num_classes=config["num_classes"]
)

# Using CPU
model.to("cpu")

# Load weights; map_location keeps this working on a CPU-only machine even
# if the checkpoint was saved from a GPU.
model.load_state_dict(torch.load("../model/model.pth", map_location="cpu"))
model.eval()  # Set to evaluation mode

ToxicClassifier(
  (embedding): Embedding(184223, 100)
  (conv): Conv1d(100, 64, kernel_size=(2,), stride=(1,), padding=(1,))
  (pool): AdaptiveMaxPool1d(output_size=50)
  (gru): GRU(64, 64, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.44, inplace=False)
  (fc): Linear(in_features=128, out_features=6, bias=True)
)

## Evaluate model

In [5]:
# Encode text into numerical sequences
def encode_text(text, vocab, max_len=150):
    """Encode ``text`` as a fixed-length integer array of vocabulary ids.

    The text is lower-cased, every character other than ASCII letters,
    digits and whitespace is removed, and the result is split on whitespace.
    At most ``max_len`` tokens are kept; shorter sequences are right-padded
    with the ``<PAD>`` id. Words missing from ``vocab`` map to ``<UNK>``.

    Args:
        text: Raw input string.
        vocab: Mapping from token to integer id; must contain ``"<UNK>"``
            and ``"<PAD>"``.
        max_len: Length of the returned array.

    Returns:
        A one-dimensional ``np.int64`` array of length ``max_len``.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  # Remove special characters
    unk_id = vocab["<UNK>"]
    token_ids = [vocab.get(word, unk_id) for word in cleaned.split()[:max_len]]
    # Start from an all-<PAD> integer buffer so the dtype stays integral even
    # when the text contains no tokens at all.
    padded = np.full(max_len, vocab["<PAD>"], dtype=np.int64)
    padded[:len(token_ids)] = token_ids
    return padded


# Dataset Class
class ToxicDataset(Dataset):
    """Map-style dataset pairing encoded comments with their label vectors.

    All texts are encoded once, in the constructor, via ``encode_text``.

    Args:
        texts: Iterable of raw comment strings.
        labels: Per-sample label values, converted to one float32 tensor.
        vocab: Token-to-id mapping forwarded to ``encode_text``.
        max_len: Encoded sequence length forwarded to ``encode_text``.
    """

    def __init__(self, texts, labels, vocab, max_len=150):
        self.texts = [
            torch.tensor(encode_text(text, vocab, max_len), dtype=torch.long)
            for text in texts
        ]
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``labels`` pair stored at ``idx``."""
        return {"input_ids": self.texts[idx], "labels": self.labels[idx]}


# Target columns, in the order the classifier outputs them.
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comments and label rows for each split.
train_input = train_data["comment_text"].to_list()
train_labels = train_data[label_names].values.tolist()
val_input = val_data["comment_text"].to_list()
val_labels = val_data[label_names].values.tolist()
test_input = test_data["comment_text"].to_list()
test_labels = test_data[label_names].values.tolist()

# Encode every split with the sequence length stored in the config.
train_dataset = ToxicDataset(train_input, train_labels, vocab, config["max_len"])
val_dataset = ToxicDataset(val_input, val_labels, vocab, config["max_len"])
test_dataset = ToxicDataset(test_input, test_labels, vocab, config["max_len"])

In [6]:
# One shuffled loader per split, all with the same batch size.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=64, shuffle=True)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

In [63]:
def evaluate_model(dataloader, model):
    """Return the mean per-label ROC AUC of ``model`` over ``dataloader``.

    The model is switched to evaluation mode, its logits are passed through
    a sigmoid, and one ROC AUC is computed per label column; the unweighted
    mean of those scores is returned.

    Args:
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``
            and ``'labels'`` tensors.
        model: Module that maps ``input_ids`` to logits.

    Returns:
        The mean ROC AUC across label columns.

    Raises:
        ValueError: Propagated from ``np.concatenate`` when the dataloader
            yields no batches; presumably also from ``roc_auc_score`` when a
            label column holds a single class -- verify against the
            scikit-learn documentation.
    """
    # Run inference on whichever device the model lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_preds = []
    all_labels = []

    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            outputs = torch.sigmoid(model(input_ids))
            # Bring results back to host memory before converting to NumPy.
            all_preds.append(outputs.cpu().numpy())
            all_labels.append(batch['labels'].cpu().numpy())

    # Transpose to (num_labels, num_samples) so each row is one label column.
    all_preds = np.concatenate(all_preds, axis=0).T
    all_labels = np.concatenate(all_labels, axis=0).T

    roc_auc = [roc_auc_score(y_true, y_pred) for y_true, y_pred in zip(all_labels, all_preds)]

    return np.mean(roc_auc)

In [64]:
# Mean per-label ROC AUC on the training split.
train_score = evaluate_model(train_dataloader, model)

In [65]:
# Mean per-label ROC AUC on the validation split.
val_score = evaluate_model(val_dataloader, model)

In [66]:
# Mean per-label ROC AUC on the test split.
test_score = evaluate_model(test_dataloader, model)

In [67]:
# Report the three scores, one per line.
print(
    f"Train set - ROC AUC score: {train_score}",
    f"Val set --- ROC AUC score: {val_score}",
    f"Test set -- ROC AUC score: {test_score}",
    sep="\n",
)

Train set - ROC AUC score: 0.9817554410249442
Val set --- ROC AUC score: 0.9767250982210864
Test set -- ROC AUC score: 0.9746421629415304


# Kaggle test

In [94]:
kaggle_test_text = pd.read_csv(Path("..") / "data" / "kaggle_test" / "test.csv")

# Placeholder label columns (constant 0.5) so the frame has the layout that
# the "ToxicDataset" class, the "DataLoader" and "evaluate_model()" expect.
kaggle_label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
labels_columns = pd.DataFrame(0.5, index=kaggle_test_text.index, columns=kaggle_label_names)
kaggle_test_data = pd.concat([kaggle_test_text, labels_columns], axis=1)

kaggle_test_input = kaggle_test_data["comment_text"].to_list()
kaggle_test_labels = kaggle_test_data[kaggle_label_names].values.tolist()

# Keep the file order (no shuffling) so predictions line up with the ids.
kaggle_test_dataset = ToxicDataset(kaggle_test_input, kaggle_test_labels, vocab, config["max_len"])
kaggle_test_dataloader = DataLoader(kaggle_test_dataset, batch_size=64, shuffle=False)

In [96]:
model.eval()  # Set model to evaluation mode

# Collect the sigmoid probabilities batch by batch, without tracking gradients.
batch_probabilities = []
with torch.no_grad():
    for batch in kaggle_test_dataloader:
        logits = model(batch['input_ids'])
        batch_probabilities.append(torch.sigmoid(logits).numpy())

# Stack the batches and transpose so that each row holds one label.
all_preds = np.concatenate(batch_probabilities, axis=0).T

In [None]:
# Export the sigmoid probabilities as they are. Thresholded 0/1 alternative:
# kaggle_pred_labels = pd.DataFrame((all_preds.T > 0.5).astype(int))
kaggle_pred_labels = pd.DataFrame(all_preds.T)

# Put the prediction columns next to the ids; the comment text is left out.
kaggle_test_output = pd.concat(
    [kaggle_test_text.drop(columns=["comment_text"]), kaggle_pred_labels],
    axis=1,
)
kaggle_test_output.columns = ["id", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

submission_path = Path("..") / "data" / "kaggle_test" / "submission.csv"
kaggle_test_output.to_csv(submission_path, index=False)

# Predict

In [69]:
def predict(text, vocab, model):
    """Print the per-label probabilities that ``model`` assigns to ``text``.

    The text is encoded with ``tokenizer`` using the module-level
    ``config["max_len"]``; the model's logits are passed through a sigmoid.
    The label names are printed first, then the probabilities.

    Args:
        text: Raw comment string to score.
        vocab: Token-to-id mapping forwarded to ``tokenizer``.
        model: Module mapping token ids to one logit per label.

    Returns:
        None. The result is only printed.
    """
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    input_ids = tokenizer(text, vocab, max_len=config["max_len"])
    # Inference only: disable dropout and skip building an autograd graph.
    model.eval()
    with torch.no_grad():
        prediction = torch.sigmoid(model(input_ids)).numpy()
    print(label_names)
    print(prediction[0])

In [32]:
# Example 1: a heavily abusive comment.
predict(
    'screw you\nwhy dont you stick it up your fucking ass than lick it out, block it i dont give a shit you fucking bastard, suck my fucking BALLLLLSSSSSSS!!!!!!!!!!!!!!!',
    vocab,
    model
)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
[0.9847456  0.2695209  0.95576817 0.03618464 0.88734657 0.19170015]


In [33]:
# Example 2: a short hostile sentence.
predict(
    "I hate you",
    vocab,
    model
)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
[0.82117516 0.0164214  0.28932157 0.01511631 0.36922097 0.04858212]


In [34]:
# Example 3: a short friendly sentence.
predict(
    "I love you",
    vocab,
    model
)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
[0.21111995 0.00212456 0.02372743 0.00355181 0.04209996 0.0078772 ]
