In [None]:
import pandas as pd
import numpy as np
import spacy
import random
from spacy.training.example import Example
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the CSV into a DataFrame; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv('Jigsaw_toxic_comment_classification.csv')

In [None]:
# Collapse the six label columns into one boolean target:
# rows without comment text are dropped, then a row is flagged toxic
# when the sum of its label columns is positive.
df = (
    df.dropna(subset=['comment_text'])
      .assign(
          is_toxic=lambda frame: frame[
              ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
          ].sum(axis=1).gt(0)
      )
)


In [None]:
# Split into training and validation with sklearn.
# test_size=0.2 holds out 20% of the rows for validation; random_state=42
# seeds the shuffle so the same partition is produced on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["comment_text"], df["is_toxic"], test_size=0.2, random_state=42
)

In [None]:
# Build (text, annotations) pairs; each annotation dict carries a "cats"
# mapping with complementary TOXIC / NOT_TOXIC flags.
train_data = []
for text, label in zip(train_texts, train_labels):
    cats = {"TOXIC": label, "NOT_TOXIC": not label}
    train_data.append((text, {"cats": cats}))
len(train_data)

127656

In [None]:
# Validation set as a list of (text, label) pairs built from plain Python lists
val_data = [*zip(val_texts.tolist(), val_labels.tolist())]

In [None]:
# Evaluation helper
def evaluate_model(nlp, val_data, threshold=0.5):
    """Score a text-classification pipeline on labelled validation pairs.

    Args:
        nlp: Pipeline whose processed docs expose a ``"TOXIC"`` score in
            ``doc.cats``.
        val_data: Iterable of ``(text, true_label)`` pairs.
        threshold: Minimum ``"TOXIC"`` score that counts as a positive
            prediction. Defaults to 0.5.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed with the
        scikit-learn metric functions.
    """
    preds = []
    truths = []
    # nlp.pipe processes the texts in batches instead of one call per text;
    # as_tuples=True passes each label through alongside its processed doc.
    for doc, true_label in nlp.pipe(val_data, as_tuples=True):
        preds.append(doc.cats["TOXIC"] >= threshold)
        truths.append(true_label)
    acc = accuracy_score(truths, preds)
    # zero_division=0 keeps the 0.0 result when a denominator is empty
    # (e.g. no positive predictions) without emitting a warning.
    prec = precision_score(truths, preds, zero_division=0)
    rec = recall_score(truths, preds, zero_division=0)
    f1 = f1_score(truths, preds, zero_division=0)
    return acc, prec, rec, f1

In [None]:
# Create and configure the spaCy pipeline
nlp = spacy.blank("en")  # start from a blank English pipeline
textcat = nlp.add_pipe("textcat", last = True)  # add the text categorizer as the final component
# Register the two category names used as keys in the training annotations.
# The last call's return value is what the cell displays as its output.
textcat.add_label("TOXIC")
textcat.add_label("NOT_TOXIC")

1

In [None]:
# Initialize the pipeline and obtain the optimizer that is passed to nlp.update.
# `nlp.initialize()` is the spaCy v3 name for this step; `nlp.begin_training()`
# is a deprecated alias that forwards to it.
optimizer = nlp.initialize()

In [None]:
for epoch in range(5):
    # Each pass starts with a fresh ordering, a fresh loss accumulator,
    # and a batch-size schedule that compounds from 4 up to 32.
    random.shuffle(train_data)
    losses = {}
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(train_data, size=batch_sizes):
        # Turn the raw (text, annotations) pairs into spaCy Example objects
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in batch
        ]
        nlp.update(examples, losses=losses, drop=0.2, sgd=optimizer)
    print(f"Epoch {epoch} Losses: {losses}")

    # Score the current weights on the held-out validation pairs
    acc, prec, rec, f1 = evaluate_model(nlp, val_data)
    print(f"Epoch {epoch} Validation → Acc: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")

Epoch 0 Losses: {'textcat': 240.44998453153778}
Epoch 0 Validation → Acc: 0.9578 | Precision: 0.9267 | Recall: 0.6353 | F1: 0.7538
Epoch 1 Losses: {'textcat': 170.581704998401}
Epoch 1 Validation → Acc: 0.9620 | Precision: 0.8776 | Recall: 0.7275 | F1: 0.7956
Epoch 2 Losses: {'textcat': 152.66268593579485}
Epoch 2 Validation → Acc: 0.9621 | Precision: 0.8241 | Recall: 0.7975 | F1: 0.8106
Epoch 3 Losses: {'textcat': 143.6819344264939}
Epoch 3 Validation → Acc: 0.9625 | Precision: 0.8360 | Recall: 0.7855 | F1: 0.8099
Epoch 4 Losses: {'textcat': 137.14343163725422}
Epoch 4 Validation → Acc: 0.9635 | Precision: 0.8491 | Recall: 0.7790 | F1: 0.8125


In [None]:
# Save the trained pipeline to the "toxic_chat_model_full" directory
# (relative to the kernel's working directory).
nlp.to_disk("toxic_chat_model_full")

In [None]:
# Code for extracting data from Reddit. NOT USED ANYMORE.
# SECURITY: the Reddit API credentials (client id/secret, username, password)
# that used to be hardcoded in this cell have been removed. Treat the old
# values as leaked and rotate them. The snippet stays disabled inside a string
# literal; if it is ever revived it reads the credentials from environment
# variables instead of the notebook source.
'''
!pip install praw

import os
import praw

# Authenticate
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=os.environ["REDDIT_USERNAME"],
    password=os.environ["REDDIT_PASSWORD"],
    user_agent="toxic-chat-scraper"
)
subreddit = reddit.subreddit('VALORANT')
'''