In [None]:
import pandas as pd

# Load the dataset
# NOTE(review): absolute, environment-specific path (the "(1)" suffix looks like a
# re-download artifact — confirm); running anywhere else requires editing it.
df = pd.read_csv('/content/train (1).csv')

In [None]:
# Show shape and first few rows
print("Shape:", df.shape)
df.head()

In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount= True)

In [None]:
df.head(2)

In [None]:
df.shape

In [None]:
# Check for missing values
print(df.isnull().sum())

In [None]:
# Check label distribution
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(df[label_cols].sum())

 ## Takeaways:

Highly imbalanced data — majority of comments are non-toxic (approx 90%)

The ‘threat’ and ‘identity_hate’ categories are especially rare

`toxic` is the most common label, which makes it a good starting point for a first binary classifier (toxic vs. not toxic)

In [None]:
# Example of a toxic comment
print("\nSample toxic comment:")
print(df[df['toxic'] == 1]['comment_text'].values[0])

In [None]:
import nltk
import re
import string
from nltk.corpus import stopwords
nltk.download('stopwords')


In [None]:
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw comment into a lowercase, space-separated string of word tokens.

    Steps, in order: lowercase; turn line breaks and tabs into spaces; drop URLs;
    drop @mentions and '#' characters; drop every character that is not an ASCII
    letter or whitespace; drop tokens found in the module-level ``STOPWORDS`` set.

    Args:
        text: The raw comment. ``None`` and NaN are treated as an empty comment;
            any other non-string value is converted with ``str()`` first.

    Returns:
        str: The surviving tokens joined by single spaces ("" if nothing is left).
    """
    # Missing values (None, or the float NaN pandas uses for empty fields) would
    # otherwise crash on ``.lower()``. NaN is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()
    # Remove line breaks and tabs
    text = re.sub(r'\n|\r|\t', ' ', text)
    # Remove URLs ("http\S+" already matches "https...", so no separate branch is needed)
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove user mentions and hashtags
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords
    # NOTE(review): tokens contain only letters at this point, so any STOPWORDS entry
    # that includes an apostrophe can never match here — confirm that is acceptable.
    tokens = [word for word in text.split() if word not in STOPWORDS]
    return " ".join(tokens)


In [None]:
# Store a cleaned copy of every comment next to the raw text.
df['cleaned_comment'] = df['comment_text'].apply(clean_text)

# Show before & after for the rows labelled 0 and 1
raw_comments = df['comment_text']
cleaned_comments = df['cleaned_comment']
for i in (0, 1):
    print("\nOriginal:", raw_comments[i])
    print("Cleaned :", cleaned_comments[i])


In [None]:
df.head()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Parameters
MAX_NUM_WORDS = 20000  # Only top 20,000 words will be kept
MAX_SEQUENCE_LENGTH = 100  # Pad all sequences to 100 words

In [None]:
# Initialize tokenizer
# The vocabulary is capped at MAX_NUM_WORDS and is fitted on the cleaned comments of
# the whole dataframe, i.e. before the train/validation split is made further down.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['cleaned_comment'])

In [None]:
# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['cleaned_comment'])

In [None]:
# Pad sequences
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print("Tokenized and padded shape:", X.shape)

In [None]:
df.head(2)

## Define the Target Variable
Start with binary classification (toxic vs. non-toxic), then extend to multi-label later.

In [None]:
y = df['toxic'].values  # Binary target


##  Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation. stratify=y is passed so both parts are
# split with respect to the toxic label, and random_state=42 fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Validation shape:", X_val.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Derive the network dimensions from the preprocessing constants instead of
# repeating the literals, so the model cannot drift from the tokenizer/padding setup.
VOCAB_SIZE = MAX_NUM_WORDS  # Embedding rows match the tokenizer's vocabulary cap
EMBEDDING_DIM = 128
SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH  # Must equal the padded width of X

# Embedding -> single LSTM -> small dense head with dropout -> one sigmoid unit.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=SEQUENCE_LENGTH),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
model.summary()


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute 'balanced' per-class weights from the training labels; the resulting
# dict is handed to model.fit(class_weight=...) in the training cell.
present_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=y_train,
)
class_weight_dict = {label: class_weights[label] for label in (0, 1)}

print("Class weights:", class_weight_dict)


In [None]:
# Watch val_loss, stop after patience=3 epochs, and restore the best weights.
# NOTE(review): with epochs=5 and patience=3 the callback has little room to
# trigger — confirm these values are intentional.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# class_weight applies the per-class weights computed in the previous cell.
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_val, y_val),
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)


In [None]:
loss, acc = model.evaluate(X_val, y_val)
print(f"\nValidation Accuracy: {acc:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict probabilities
y_pred_probs = model.predict(X_val)

# Convert to binary labels
y_pred = (y_pred_probs > 0.5).astype(int)


In [None]:
print("Classification Report:\n")
print(classification_report(y_val, y_pred, target_names=["Non-Toxic", "Toxic"]))


## Key Insights:
✅ High recall for toxic (0.87): The model catches most toxic comments

⚠️ Moderate precision for toxic (0.60): It sometimes wrongly flags clean comments

✅ Excellent precision for non-toxic (0.98): When it predicts a comment is clean it is almost always right, so few toxic comments slip through as non-toxic

📈 Weighted F1-score of 0.94 shows strong overall performance, though this average is dominated by the much larger non-toxic class

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_val, y_pred)

# Draw the matrix on an explicit Axes: rows are the actual class, columns the
# predicted class, and each cell shows its integer count.
class_names = ["Non-Toxic", "Toxic"]
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
df.to_csv("cleaned_data.csv", index=False)


In [None]:
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


In [None]:
# Persist the binary-task split so a later session can reload it from disk.
for file_name, array in (
    ('X_train.npy', X_train),
    ('X_val.npy', X_val),
    ('y_train.npy', y_train),
    ('y_val.npy', y_val),
):
    np.save(file_name, array)


In [None]:
model.save('toxic_comment_lstm_model.h5')  # Saves model architecture + weights


In [None]:
model.save("toxic_comment_lstm_model.keras")  # Native Keras format


In [None]:
# Load libraries
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
import pickle

# Load data
# NOTE(review): this cell resumes from the files written by the cells above and
# overwrites df, the split arrays, tokenizer and model in the running kernel.
df = pd.read_csv("cleaned_data.csv")
X_train = np.load("X_train.npy")
X_val = np.load("X_val.npy")
y_train = np.load("y_train.npy")
y_val = np.load("y_val.npy")

# Load tokenizer
# pickle.load can execute arbitrary code from the file — only open pickles you created.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Load model
model = load_model("toxic_comment_lstm_model.keras")


In [None]:
# Raw comment text is the model input; the six 0/1 label columns together form
# the multilabel target matrix (one column per label, in this order).
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = df['comment_text'].to_numpy()
y = df[target_columns].to_numpy()


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

VOCAB_SIZE = 20000
MAX_LEN = 150

# A new tokenizer replaces the one loaded above: it is fitted on the raw
# comment_text held in X (not the cleaned column) and uses "<OOV>" as its
# out-of-vocabulary token. Sequences are padded and truncated with 'post' to MAX_LEN.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN, padding='post', truncating='post')


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_pad, y, test_size=0.3, random_state=42)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Same layer stack as the binary LSTM earlier in the notebook, except that the
# final layer has six sigmoid units — one output per label.
# NOTE(review): unlike the binary run, the fit call below uses neither class
# weights nor early stopping — confirm that is intended for the rare labels.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_LEN),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(6, activation='sigmoid')  # 6 outputs
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [None]:
model.fit(X_train, y_train, epochs=5, batch_size=128, validation_data=(X_val, y_val))


In [None]:
from sklearn.metrics import classification_report

y_pred_prob = model.predict(X_val)
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_val, y_pred, target_names=[
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
]))


In [None]:
# Save tokenizer
with open("tokenizer_multilabel.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Save model
model.save("model_multilabel_lstm.keras")


In [None]:
# Load tokenizer
with open('tokenizer_multilabel.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Load model
model = load_model("model_multilabel_lstm.keras")

In [None]:
np.save("y_train_multilabel.npy", y_train)
np.save("y_val_multilabel.npy", y_val)


### Load Properly Later

In [None]:
y_val = np.load("y_val_multilabel.npy")


In [None]:
from sklearn.metrics import roc_auc_score

# y_val: true multilabel values
# y_pred_prob: model probabilities (not thresholded)

# Score each label column separately; a column for which roc_auc_score raises
# ValueError gets a message instead of a number.
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for i, label in enumerate(label_names):
    try:
        auc = roc_auc_score(y_val[:, i], y_pred_prob[:, i])
    except ValueError:
        print(f"{label}: Not enough positive samples to compute AUC.")
    else:
        print(f"{label}: ROC-AUC = {auc:.4f}")


| Label              | ROC-AUC | Interpretation                            |
| ------------------ | ------- | ----------------------------------------- |
| **toxic**          | 0.97    | 🟢 Excellent discrimination               |
| **severe\_toxic**  | 0.99    | 🟢 Almost perfect (even with low recall!) |
| **obscene**        | 0.99    | 🟢 Excellent                              |
| **threat**         | 0.95    | 🟢 Strong separation (rare class!)        |
| **insult**         | 0.98    | 🟢 Excellent                              |
| **identity\_hate** | 0.96    | 🟢 Very good (despite F1 = 0)             |



# CNN Architecture (Text Classification Style)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# These repeat the values used when X_pad was tokenized/padded above; they must
# stay in sync with that cell because X_train/X_val are reused unchanged.
VOCAB_SIZE = 20000
MAX_LEN = 150
EMBEDDING_DIM = 128

# Embedding -> one Conv1D (128 filters, window of 5 tokens) -> global max pool
# -> dense head with dropout -> six sigmoid outputs (one per label).
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(6, activation='sigmoid')  # Multilabel output
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [None]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_val, y_val)
)


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred_probs = model.predict(X_val)
y_pred = (y_pred_probs > 0.5).astype(int)

In [None]:
# Classification report
print(classification_report(
    y_val,
    y_pred,
    target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
))

In [None]:
# ROC-AUC
# Score each label column on its own. Only ValueError is handled, matching the
# LSTM evaluation loop earlier in the notebook; a bare `except:` would also hide
# unrelated failures (wrong variable, shape mismatch, KeyboardInterrupt).
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    try:
        auc = roc_auc_score(y_val[:, i], y_pred_probs[:, i])
    except ValueError:
        print(f"{label}: Not enough positive samples")
    else:
        print(f"{label}: ROC-AUC = {auc:.4f}")

| Label          | F1-Score (CNN) | F1-Score (LSTM) | ROC-AUC (CNN) | ROC-AUC (LSTM) |
| -------------- | -------------- | --------------- | ------------- | -------------- |
| toxic          | 0.77           | **0.78**        | 0.9592        | 0.9697         |
| severe\_toxic  | **0.37**       | 0.09            | 0.9865        | 0.9884         |
| obscene        | **0.78**       | 0.77            | 0.9852        | 0.9866         |
| threat         | 0.00           | 0.00            | 0.9578        | 0.9484         |
| insult         | **0.70**       | 0.67            | 0.9769        | 0.9800         |
| identity\_hate | 0.00           | 0.00            | 0.9468        | 0.9583         |


🔹 CNN outperforms LSTM on F1-score in:
severe_toxic (much better F1!)

obscene and insult (small gain)

🔹 Both models struggle with:
threat

identity_hate

(but ROC-AUC shows the model can rank those correctly — just not thresholded well)

## We experimented with LSTM and CNN. CNN showed better recall and F1 in more toxic categories

In [None]:
df.head(2)

##  Multilabel BERT-based toxicity detector using 🤗 Hugging Face + PyTorch

In [None]:
import pandas as pd

# Re-read the original training CSV; this replaces the df loaded from cleaned_data.csv.
df = pd.read_csv("/content/train (1).csv")
# Cast every comment to str (presumably so the tokenizer never receives a non-string value).
df['comment_text'] = df['comment_text'].astype(str)

In [None]:
df.head()

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer

# Prepare dataset
# Collapse the six label columns into one list-valued 'labels' column per row,
# then keep only the text and that column in the Hugging Face dataset.
df['labels'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()
hf_dataset = Dataset.from_pandas(df[['comment_text', 'labels']])

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    """Tokenize the ``comment_text`` field of ``example`` with the global ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=128``. Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        example['comment_text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Tokenize and store in tokenized_dataset
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)


In [None]:
split_data = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_hf = split_data['train']
val_hf = split_data['test']


In [None]:
import torch
from torch.utils.data import Dataset

class ToxicCommentsDataset(Dataset):
    """Map-style dataset that serves pre-tokenized comments as tensors.

    The three columns ``input_ids``, ``attention_mask`` and ``labels`` are read
    from ``hf_dataset`` once, at construction time, and converted to tensors;
    labels are stored as float32.
    """

    def __init__(self, hf_dataset):
        token_ids = hf_dataset['input_ids']
        mask = hf_dataset['attention_mask']
        targets = hf_dataset['labels']

        self.input_ids = torch.tensor(token_ids)
        self.attention_mask = torch.tensor(mask)
        self.labels = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        # One row of input_ids per example.
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        # Build the sample in a fixed key order: input_ids, attention_mask, labels.
        sample = {}
        for field in ('input_ids', 'attention_mask', 'labels'):
            sample[field] = getattr(self, field)[idx]
        return sample


In [None]:
from torch.utils.data import DataLoader

# Wrap the Hugging Face splits so they yield tensors.
train_ds = ToxicCommentsDataset(train_hf)
val_ds = ToxicCommentsDataset(val_hf)

# Batches of 16 for both splits; only the training loader is shuffled.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)


## First Epoch

In [None]:
# One full pass over the training data (the first fine-tuning epoch).
# NOTE(review): `device`, `model`, `optimizer` and `criterion` are not defined in any
# earlier cell of this notebook — their definitions only appear in later cells, and
# the most recent `model` assigned above is the Keras CNN. As ordered, this cell
# cannot run on a fresh Restart & Run All; confirm the intended cell order.
# NOTE(review): model.train() is not called here — verify the model was in training mode.
for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Standard step: clear old gradients, forward, loss on raw logits, backward, update.
    optimizer.zero_grad()
    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()


In [None]:
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Switch to evaluation mode and collect per-batch probabilities and labels.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model returns raw logits; sigmoid maps each one to a probability.
        probs = torch.sigmoid(logits).cpu().numpy()
        all_preds.append(probs)
        all_labels.append(labels.cpu().numpy())

# Stack the per-batch arrays row-wise into full validation matrices
y_true = np.concatenate(all_labels, axis=0)
y_pred = np.concatenate(all_preds, axis=0)
y_pred_bin = (y_pred > 0.5).astype(int)

# Classification report (thresholded at 0.5)
print("\n📋 Classification Report:\n")
print(classification_report(y_true, y_pred_bin, target_names=label_names))

# ROC-AUC for each label (uses the unthresholded probabilities)
print("\n📈 ROC-AUC Scores:")
for i, label in enumerate(label_names):
    score = roc_auc_score(y_true[:, i], y_pred[:, i])
    print(f"{label}: {score:.4f}")


In [None]:
# Save model
# Only the state_dict (parameters) is written; the model class must be
# re-instantiated before these weights can be loaded again.
torch.save(model.state_dict(), "bert_multilabel_epoch1.pth")

# Save tokenizer
tokenizer.save_pretrained("bert_tokenizer/")


In [None]:
import os

# Make the folder
os.makedirs("bert_tokenizer", exist_ok=True)

# Move uploaded files into it
import shutil
for tokenizer_file in ("vocab.txt", "tokenizer_config.json", "special_tokens_map.json"):
    # Move only files that are actually in the working directory, so the cell can
    # be re-run (or run after save_pretrained already populated the folder)
    # without raising FileNotFoundError.
    if os.path.exists(tokenizer_file):
        shutil.move(tokenizer_file, os.path.join("bert_tokenizer", tokenizer_file))
    else:
        print(f"Skipping {tokenizer_file}: not found in the working directory")


In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert_tokenizer/")


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertMultilabelClassifier(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder with a dropout + linear head.

    ``forward`` returns raw logits of width ``num_labels``; no sigmoid is applied
    inside the module.
    """

    def __init__(self, num_labels=6):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        # The head's input width is read from the encoder's own config.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and classify its pooled output; returns the logits."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoder_out.pooler_output)
        logits = self.classifier(pooled)
        return logits


In [None]:
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture, then restore the epoch-1 weights; map_location places
# the loaded tensors on the selected device.
model = BertMultilabelClassifier(num_labels=6)
model.load_state_dict(torch.load("bert_multilabel_epoch1.pth", map_location=device))
model.to(device)
# Evaluation mode for the inference cells that follow.
model.eval()


In [None]:
def predict_comment(text):
    """Score one comment with the global BERT model.

    The text is tokenized with the global ``tokenizer`` (truncated/padded to 128
    tokens), run through the global ``model`` on ``device``, and the logits are
    turned into probabilities with a sigmoid.

    The model is switched to evaluation mode for the forward pass and its previous
    train/eval mode is restored afterwards, so the result does not depend on
    whether a training cell was run last.

    Args:
        text: The comment to score.

    Returns:
        dict: Label name -> probability rounded to 3 decimals (NumPy scalars), for
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )

    # Remove token_type_ids if exists (the model's forward() does not accept it)
    encoded.pop("token_type_ids", None)

    # Move tensors to device
    inputs = {k: v.to(device) for k, v in encoded.items()}

    # Force eval mode for the forward pass, then put the model back as it was found.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs)
            probs = torch.sigmoid(logits).cpu().numpy()[0]
    finally:
        model.train(was_training)

    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    return dict(zip(labels, probs.round(3)))


In [None]:
predict_comment("You are a disgusting liar and a disgrace!")


In [None]:
# Rebuild the model from the epoch-1 checkpoint and put it in training mode.
# NOTE(review): the next cell builds and loads the model again, so this cell's
# result is replaced immediately — it looks redundant; confirm before removing.
model = BertMultilabelClassifier(num_labels=6)
model.load_state_dict(torch.load("bert_multilabel_epoch1.pth", map_location=device))
model.to(device)
model.train()


In [None]:
# Start from the epoch-1 weights.
model = BertMultilabelClassifier(num_labels=6)
model.load_state_dict(torch.load("bert_multilabel_epoch1.pth", map_location=device))
model.to(device)

# A brand-new AdamW is created here; only model weights were checkpointed, so no
# optimizer state from the first epoch is carried over.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# The loss takes raw logits, which is what the model's forward() returns.
import torch.nn as nn
criterion = nn.BCEWithLogitsLoss()


In [None]:
num_epochs = 3  # total epochs
start_epoch = 1  # since we've already done epoch 1

# range(1, 3) yields indices 1 and 2, reported (and saved) as epochs 2 and 3.
for epoch in range(start_epoch, num_epochs):
    print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0

    # Training mode is set at the start of every epoch.
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # .item() extracts a Python number, so the running total holds no tensors.
        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"📉 Training Loss (epoch {epoch + 1}): {avg_loss:.4f}")

    # ✅ Save model after each epoch
    model_filename = f"bert_multilabel_epoch{epoch + 1}.pth"
    torch.save(model.state_dict(), model_filename)
    print(f"💾 Saved model: {model_filename}")


In [None]:
# Load model from epoch 2
# map_location=device lets a checkpoint written on a GPU load on a CPU-only
# runtime, matching the other torch.load calls in this notebook.
model.load_state_dict(torch.load("bert_multilabel_epoch2.pth", map_location=device))
model.eval()


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Make predictions
# Evaluates whichever checkpoint was loaded into `model` by the preceding cell.
all_preds = []
all_labels = []

model.eval()
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Labels are only needed for scoring, so they are converted to NumPy directly.
    labels = batch['labels'].cpu().numpy()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model returns raw logits; sigmoid maps each one to a probability.
        probs = torch.sigmoid(logits).cpu().numpy()

    all_preds.append(probs)
    all_labels.append(labels)

# Stack all predictions
import numpy as np
y_true = np.vstack(all_labels)
y_pred = np.vstack(all_preds)

# Classification report
# Probabilities are thresholded at 0.5 for the report only.
y_pred_bin = (y_pred > 0.5).astype(int)
print(classification_report(y_true, y_pred_bin, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# ROC-AUC scores
# Computed per label column from the unthresholded probabilities.
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    score = roc_auc_score(y_true[:, i], y_pred[:, i])
    print(f"{label}: ROC-AUC = {score:.4f}")


In [None]:
# Load model from epoch 3
# map_location=device lets a checkpoint written on a GPU load on a CPU-only
# runtime, matching the other torch.load calls in this notebook.
model.load_state_dict(torch.load("bert_multilabel_epoch3.pth", map_location=device))
model.eval()


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Make predictions
# Same evaluation procedure as the epoch-2 cell above, run on the checkpoint
# loaded by the preceding cell.
all_preds = []
all_labels = []

model.eval()
for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # Labels are only needed for scoring, so they are converted to NumPy directly.
    labels = batch['labels'].cpu().numpy()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        # The model returns raw logits; sigmoid maps each one to a probability.
        probs = torch.sigmoid(logits).cpu().numpy()

    all_preds.append(probs)
    all_labels.append(labels)

# Stack all predictions
import numpy as np
y_true = np.vstack(all_labels)
y_pred = np.vstack(all_preds)

# Classification report
# Probabilities are thresholded at 0.5 for the report only.
y_pred_bin = (y_pred > 0.5).astype(int)
print(classification_report(y_true, y_pred_bin, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# ROC-AUC scores
# Computed per label column from the unthresholded probabilities.
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    score = roc_auc_score(y_true[:, i], y_pred[:, i])
    print(f"{label}: ROC-AUC = {score:.4f}")


In [None]:
# Rename epoch 3 model for final use
# NOTE(review): `mv` removes bert_multilabel_epoch3.pth, so the "Load model from
# epoch 3" cell above cannot be re-run afterwards; the next cell also writes
# bert_multilabel_best.pth again from the in-memory model.
!mv bert_multilabel_epoch3.pth bert_multilabel_best.pth


In [None]:
torch.save(model.state_dict(), "bert_multilabel_best.pth")


# Log Final Report

The final model was selected from epoch 3, as it showed improved recall and F1-score on minority classes like severe_toxic and threat, while maintaining strong overall performance. The macro F1-score increased to 0.67 and ROC-AUC scores remained consistently above 0.98 for all labels.

In [None]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [None]:
import json

with open("label_list.json", "w") as f:
    json.dump(labels, f)


In [None]:
# Record the settings the BERT run actually used, so the saved config can be
# trusted when reproducing it.
training_config = {
    "model_name": "bert-base-uncased",
    "max_len": 128,
    # Both DataLoaders in this notebook are built with batch_size=16.
    "batch_size": 16,
    "learning_rate": 2e-5,
    "epochs": 3,
}
with open("training_config.json", "w") as f:
    json.dump(training_config, f)
