In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import  word_tokenize
import re
import torch
from torch.nn.utils.rnn import pad_sequence
from torch import nn,optim
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

In [3]:
# importing the data
train_path = r"C:\Ashvin\AI ML\Project\Comment Toxicity\train.csv"
test_path = r"C:\Ashvin\AI ML\Project\Comment Toxicity\test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
#check for null values
train_df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
#check for null values
test_df.isna().sum()

id              0
comment_text    0
dtype: int64

In [6]:
train_df.shape

(159571, 8)

In [7]:
test_df.shape

(153164, 2)

In [None]:
#dropping the ID column
train_df.drop(['id'],axis=1,inplace=True)
test_df.drop(['id'],axis=1,inplace=True)

In [9]:
train_df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [10]:
# Text Pre processing 
# Part 1. Stopwords removal, cleaning text, tokenization
sw = stopwords.words("english")

lemmatizer = WordNetLemmatizer()

def get_clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if (t.isalpha()) and (t not in sw)]
    tokens = [lemmatizer.lemmatize(t) for t in tokens if len(t)>=4]
    return tokens

#applying the clean text on it
train_df['cleaned'] = train_df['comment_text'].apply(get_clean_text)
test_df['cleaned'] = test_df['comment_text'].apply(get_clean_text)

In [11]:
# Part 2. Word2idx , vector , padding
# Create word to index --> convert word into numeric IDs so it can be used as input to newural network

X_train = train_df['cleaned']
X_test = test_df['cleaned']

word2idx = {"<PAD>":0,"<UNK>":1}
idx = 2
for tokens in X_train:
    for word in tokens:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

#Vectorization -> Turning numeric ids to vectors
train_vector = []

for sent in X_train.values:
    inner_vector = []
    for word in sent:
        inner_vector.append(word2idx.get(word,word2idx["<UNK>"]))
    train_vector.append(torch.tensor(inner_vector))

test_vector = []

for sent in X_test.values:
    inner_vector = []
    for word in sent:
        inner_vector.append(word2idx.get(word,word2idx["<UNK>"]))
    test_vector.append(torch.tensor(inner_vector))

#Padding --> ensures all the tensors have equal lengths 
train_padded = pad_sequence(train_vector,batch_first=True,padding_value=0)
test_padded = pad_sequence(test_vector,batch_first=True,padding_value=0)

# convert the remaining columns to torch and defining the target that these are the output columns
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_labels = torch.tensor(train_df[label_cols].values, dtype=torch.float32)

In [13]:
print(train_labels.shape)
print(train_padded.shape)


torch.Size([159571, 6])
torch.Size([159571, 1000])


In [None]:
# Creating aarchitecture 

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from torch.utils.data import DataLoader, TensorDataset


class BiLSTM(nn.Module):
    def __init__(self,vocab_size,emb_dim,hidden_dim,output_dim):
        super(BiLSTM,self).__init__()
        self.embedding = nn.Embedding(vocab_size,emb_dim,padding_idx=0)
        self.lstm = nn.LSTM(emb_dim,hidden_dim,bidirectional=True,batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim*2,hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim,output_dim),
            nn.Sigmoid()
        )
    def forward(self,X):
        X = self.embedding(X)
        output,(hidden,cell_state) = self.lstm(X)
        hidden_forward = hidden[-2, :, :]
        hidden_backward = hidden[-1, :, :]
        concatenated = torch.cat((hidden_forward,hidden_backward),dim=1)
        output = self.classifier(concatenated)
        return output 

class TextCNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, output_dim, kernel_sizes=[3,4,5], num_filters=100):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (k, emb_dim)) for k in kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)
        x = x.unsqueeze(1)     # (batch_size, 1, seq_len, emb_dim)
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max(pool, dim=2)[0] for pool in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.sigmoid(self.fc(x))

vocab_size = len(word2idx)
emb_dim = 64
hidden_dim = 32
output_dim = len(label_cols)

models = [BiLSTM(vocab_size,emb_dim,hidden_dim,output_dim),
          TextCNN(vocab_size, emb_dim, output_dim)]

input_data = train_padded
labels = train_labels

batch_size = 64

# Create dataset and loader
dataset = TensorDataset(input_data, labels)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

epochs = 1

for model in models:
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print(f"\nTraining model: {type(model).__name__}")
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            preds = model(batch_x)
            loss = criterion(preds, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(train_loader):.4f}")


    model.eval()
    with torch.no_grad():
        prediction = model(input_data)
        prediction = (prediction>0.5).float()
        print(f"Accuracy: {accuracy_score(labels.numpy(),prediction.numpy())}")
        print(f"Precision: {precision_score(labels.numpy(),prediction.numpy(),average='macro', zero_division=0)}")
        print(f"Recall : {recall_score(labels.numpy(),prediction.numpy(),average='macro', zero_division=0)}")
        print(f"F1 Score: {f1_score(labels.numpy(),prediction.numpy(),average='macro', zero_division=0)}")
    model_path = f"{type(model).__name__}_model.pth"
    torch.save(model.state_dict(),model_path)


In [None]:
# Final Output - combining all the above codes

import pandas as pd
import nltk
import re
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import  word_tokenize
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset,DataLoader
from torch import nn,optim
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# importing the data
train_path = r"C:\Ashvin\AI ML\Project\Comment Toxicity\train.csv"
test_path = r"C:\Ashvin\AI ML\Project\Comment Toxicity\test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df.drop(['id'],axis=1,inplace=True)
test_df.drop(['id'],axis=1,inplace=True)

sw = stopwords.words("english")

lemmatizer = WordNetLemmatizer()

def get_clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if (t.isalpha()) and (t not in sw)]
    tokens = [lemmatizer.lemmatize(t) for t in tokens if len(t)>=4]
    return tokens

# applying the clean text on it
train_df['cleaned'] = train_df['comment_text'].apply(get_clean_text)
test_df['cleaned'] = test_df['comment_text'].apply(get_clean_text)


# Part 2. Word2idx , vector , padding
# Create word to index --> convert word into numeric IDs so it can be used as input to newural network

X_train = train_df['cleaned']
X_test = test_df['cleaned']

word2idx = {"<PAD>":0,"<UNK>":1}
idx = 2
for tokens in X_train:
    for word in tokens:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1


# Vectorization -> Turning numeric ids to vectors
train_vector = []

for sent in X_train.values:
    inner_vector = []
    for word in sent:
        inner_vector.append(word2idx.get(word,word2idx["<UNK>"]))
    train_vector.append(torch.tensor(inner_vector))

test_vector = []

for sent in X_test.values:
    inner_vector = []
    for word in sent:
        inner_vector.append(word2idx.get(word,word2idx["<UNK>"]))
    test_vector.append(torch.tensor(inner_vector))

# Padding --> ensures all the tensors have equal lengths 
train_padded = pad_sequence(train_vector,batch_first=True,padding_value=0)

# convert the remaining columns to torch and defining the target that these are the output columns
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_labels = torch.tensor(train_df[label_cols].values, dtype=torch.float32)

# Pos weight for imbalance
# Compute pos_weight = (num_neg / num_pos) per class
num_pos = train_df[label_cols].sum(axis=0).values
num_neg = len(train_df) - num_pos
pos_weight = torch.tensor(num_neg / (num_pos+1e-5), dtype=torch.float32).to(device)

# Create dataset and loader
batch_size = 32
dataset = TensorDataset(train_padded, train_labels)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Architecture
class BiLSTM(nn.Module):
    def __init__(self,vocab_size,emb_dim,hidden_dim,output_dim):
        super(BiLSTM,self).__init__()
        self.embedding = nn.Embedding(vocab_size,emb_dim,padding_idx=0)
        self.lstm = nn.LSTM(emb_dim,hidden_dim,bidirectional=True,batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim*2,hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim,output_dim),
        )
    def forward(self,X):
        X = self.embedding(X)
        output,(hidden,cell_state) = self.lstm(X)
        hidden_forward = hidden[-2, :, :]
        hidden_backward = hidden[-1, :, :]
        concatenated = torch.cat((hidden_forward,hidden_backward),dim=1)
        output = self.classifier(concatenated)
        return output 

class TextCNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, output_dim, kernel_sizes=[3,4,5], num_filters=100):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (k, emb_dim)) for k in kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, output_dim)

    def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)
        x = x.unsqueeze(1)     # (batch_size, 1, seq_len, emb_dim)
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max(pool, dim=2)[0] for pool in x]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

# Train & evaluate
vocab_size = len(word2idx)
emb_dim =  192
hidden_dim = 96
output_dim = len(label_cols)

models = [BiLSTM(vocab_size,emb_dim,hidden_dim,output_dim).to(device),
          TextCNN(vocab_size, emb_dim, output_dim).to(device)]

input_train = train_padded.to(device)
labels = train_labels.to(device)

epochs = 15

for model in models:
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=0.0003)

    print(f"\nTraining model: {type(model).__name__}")
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(train_loader):.4f}")

# Evaluate in batches to avoid GPU OOM

    model.eval()
    train_eval_loader = DataLoader(TensorDataset(input_train, labels), batch_size=64, shuffle=False)
    all_preds = []
    with torch.no_grad():
        for batch_x, _ in train_eval_loader:
            batch_x = batch_x.to(device)
            logits = model(batch_x)
            probs = torch.sigmoid(logits)
            preds = (probs>0.5).float()
            all_preds.append(preds.cpu())

    pred_train = torch.cat(all_preds, dim=0).numpy()
    labels_cpu = labels.cpu().numpy()

    print(f"Accuracy: {accuracy_score(labels_cpu,pred_train)}")
    print(f"Precision: {precision_score(labels_cpu,pred_train,average='macro', zero_division=0)}")
    print(f"Recall : {recall_score(labels_cpu,pred_train,average='macro', zero_division=0)}")
    print(f"F1 Score: {f1_score(labels_cpu,pred_train,average='macro', zero_division=0)}")

    model_path = f"{type(model).__name__}_model.pth"
    torch.save(model.state_dict(),model_path)

    metrics = {
        "Accuracy": float(accuracy_score(labels_cpu, pred_train)),
        "Precision": float(precision_score(labels_cpu, pred_train, average='macro', zero_division=0)),
        "Recall": float(recall_score(labels_cpu, pred_train, average='macro', zero_division=0)),
        "F1 Score": float(f1_score(labels_cpu, pred_train, average='macro', zero_division=0))
    }

    metrics_filename = f"{type(model).__name__}_metrics.json"
    with open(metrics_filename, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"Metrics saved to {metrics_filename}")

In [None]:
# Using device: cuda

# Training model: BiLSTM
# Epoch [1/15], Loss: 0.7409
# Epoch [2/15], Loss: 0.5036
# Epoch [3/15], Loss: 0.3704
# Epoch [4/15], Loss: 0.2945
# Epoch [5/15], Loss: 0.2440
# Epoch [6/15], Loss: 0.2030
# Epoch [7/15], Loss: 0.1729
# Epoch [8/15], Loss: 0.1515
# Epoch [9/15], Loss: 0.1327
# Epoch [10/15], Loss: 0.1374
# Epoch [11/15], Loss: 0.1433
# Epoch [12/15], Loss: 0.1318
# Epoch [13/15], Loss: 0.0987
# Epoch [14/15], Loss: 0.0898
# Epoch [15/15], Loss: 0.0797
# Accuracy: 0.9322746614359753
# Precision: 0.5482897137211845
# Recall : 0.9987014954472268
# F1 Score: 0.6904724204027296
# Metrics saved to BiLSTM_metrics.json

# Training model: TextCNN
# Epoch [1/15], Loss: 0.7480
# Epoch [2/15], Loss: 0.4590
# Epoch [3/15], Loss: 0.3713
# Epoch [4/15], Loss: 0.3197
# Epoch [5/15], Loss: 0.2940
# Epoch [6/15], Loss: 0.2674
# Epoch [7/15], Loss: 0.2510
# Epoch [8/15], Loss: 0.2374
# Epoch [9/15], Loss: 0.2236
# Epoch [10/15], Loss: 0.2117
# Epoch [11/15], Loss: 0.2014
# Epoch [12/15], Loss: 0.1880
# Epoch [13/15], Loss: 0.1838
# Epoch [14/15], Loss: 0.1744
# Epoch [15/15], Loss: 0.1708
# Accuracy: 0.8949495835709496
# Precision: 0.4435334576555454
# Recall : 0.993140785699921
# F1 Score: 0.5797171397327897
# Metrics saved to TextCNN_metrics.json

In [None]:
import pandas as pd
import nltk
import re
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torch import nn, optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

# -------------------------------
# 1. Setup
# -------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Paths
train_path = r"C:\Ashvin\AI ML\Project\Comment Toxicity\train.csv"
test_path  = r"C:\Ashvin\AI ML\Project\Comment Toxicity\test.csv"

# Load data
train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)

# Drop 'id' column
train_df.drop(['id'], axis=1, inplace=True)
test_df.drop(['id'], axis=1, inplace=True)

# Use the below download if you do not have it
# NLP tools
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

sw = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# -------------------------------
# 2. Text cleaning
# -------------------------------
def get_clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t.isalpha() and t not in sw and len(t) >= 4]
    return tokens

train_df['cleaned'] = train_df['comment_text'].apply(get_clean_text)
test_df['cleaned']  = test_df['comment_text'].apply(get_clean_text)

# -------------------------------
# 3. Vocabulary and vectorization
# -------------------------------
word2idx = {"<PAD>":0, "<UNK>":1}
idx = 2
for tokens in train_df['cleaned']:
    for word in tokens:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

# Vectorize
# Vectorize & pad
train_vector = []
for sent in train_df['cleaned']:
    inner_vector = [word2idx.get(word, word2idx["<UNK>"]) for word in sent]
    train_vector.append(torch.tensor(inner_vector))

# Padding --> ensures all the tensors have equal lengths 
train_padded = pad_sequence(train_vector, batch_first=True, padding_value=0)

# -------------------------------
# 4. Prepare labels & DataLoader
# -------------------------------
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_labels = torch.tensor(train_df[label_cols].values, dtype=torch.float32)

# Pos weight for imbalance
num_pos = train_df[label_cols].sum(axis=0).values
num_neg = len(train_df) - num_pos
pos_weight = torch.tensor(num_neg / (num_pos+1e-5), dtype=torch.float32).to(device)

batch_size = 32
dataset = TensorDataset(train_padded, train_labels)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# -------------------------------
# 5. Model definitions
# -------------------------------
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, output_dim):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        concat = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.classifier(concat)

class TextCNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, output_dim, kernel_sizes=[3,4,5], num_filters=100):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (k, emb_dim)) for k in kernel_sizes])
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(len(kernel_sizes)*num_filters, output_dim)
    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max(pool, dim=2)[0] for pool in x]
        x = torch.cat(x, dim=1)
        return self.fc(self.dropout(x))

# -------------------------------
# 6️⃣ Train & evaluate
# -------------------------------
vocab_size = len(word2idx)
emb_dim, hidden_dim, output_dim = 192, 96, len(label_cols)
epochs = 15
models = [
    BiLSTM(vocab_size, emb_dim, hidden_dim, output_dim).to(device),
    TextCNN(vocab_size, emb_dim, output_dim).to(device)
]

input_train = train_padded.to(device)
labels = train_labels.to(device)

for model in models:
    print(f"\nTraining model: {type(model).__name__}")
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=0.0003)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    preds_all = []
    eval_loader = DataLoader(TensorDataset(input_train, labels), batch_size=64)
    with torch.no_grad():
        for batch_x, _ in eval_loader:
            batch_x = batch_x.to(device)
            logits = model(batch_x)
            preds = (torch.sigmoid(logits) > 0.5).float()
            preds_all.append(preds.cpu())
    preds = torch.cat(preds_all).numpy()
    labels_np = labels.cpu().numpy()

    acc = accuracy_score(labels_np, preds)
    prec = precision_score(labels_np, preds, average='macro', zero_division=0)
    rec = recall_score(labels_np, preds, average='macro', zero_division=0)
    f1 = f1_score(labels_np, preds, average='macro', zero_division=0)

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall : {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Save
    torch.save(model.state_dict(), f"{type(model).__name__}_model.pth")
    metrics = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1 Score": f1}
    with open(f"{type(model).__name__}_metrics.json", 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"✅ Metrics saved to {type(model).__name__}_metrics.json")



Using device: cuda

Training model: BiLSTM
Epoch [1/15], Loss: 0.7172
Epoch [2/15], Loss: 0.4442
Epoch [3/15], Loss: 0.3784
Epoch [4/15], Loss: 0.3075
Epoch [5/15], Loss: 0.2808
Epoch [6/15], Loss: 0.2446
Epoch [7/15], Loss: 0.2076
Epoch [8/15], Loss: 0.1731
Epoch [9/15], Loss: 0.1531
Epoch [10/15], Loss: 0.1321
Epoch [11/15], Loss: 0.1159
Epoch [12/15], Loss: 0.1089
Epoch [13/15], Loss: 0.0962
Epoch [14/15], Loss: 0.0873
Epoch [15/15], Loss: 0.0859
Accuracy: 0.9231
Precision: 0.5578
Recall : 0.9975
F1 Score: 0.7027
✅ Metrics saved to BiLSTM_metrics.json

Training model: TextCNN
Epoch [1/15], Loss: 0.7521
Epoch [2/15], Loss: 0.4571
Epoch [3/15], Loss: 0.3693
Epoch [4/15], Loss: 0.3204
Epoch [5/15], Loss: 0.2891
Epoch [6/15], Loss: 0.2683
Epoch [7/15], Loss: 0.2547
Epoch [8/15], Loss: 0.2423
Epoch [9/15], Loss: 0.2205
Epoch [10/15], Loss: 0.2124
Epoch [11/15], Loss: 0.1973
Epoch [12/15], Loss: 0.1953
Epoch [13/15], Loss: 0.1879
Epoch [14/15], Loss: 0.1783
Epoch [15/15], Loss: 0.1674
Acc

In [None]:
# Saving the word2idx
import json

with open("word2idx.json", "w") as f:
    json.dump(word2idx, f)


In [None]:
# Loading the saved word2idx file for predicting the test data
import json
with open('word2idx.json') as f:
    word2idx = json.load(f)

In [None]:
#Predicting the test data using the trained model

import pandas as pd
import torch
from torch.utils.data import DataLoader

# Make sure your model classes are already defined:
# TextCNN and BiLSTM (with same hidden_dim etc.)
# And also that you have test_padded, test_df, word2idx, etc.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

test_vector = []
for sent in test_df['cleaned']:
    inner_vector = [word2idx.get(word, word2idx["<UNK>"]) for word in sent]
    test_vector.append(torch.tensor(inner_vector))

test_padded = pad_sequence(test_vector, batch_first=True, padding_value=0)

# Model hyperparameters must MATCH training
emb_dim = 192
hidden_dim = 96
output_dim = 6  # since label_cols length is 6
vocab_size = len(word2idx)  # should be loaded from the Source code 
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Prediction + save function
def predict_and_save(model_class, model_name, model_file, test_padded, test_df, vocab_size, emb_dim, output_dim, label_cols):
    # Step 1: DataLoader
    test_loader = DataLoader(test_padded, batch_size=64, shuffle=False)
    
    # Step 2: Load model
    model = model_class(vocab_size, emb_dim, output_dim).to(device)
    model.load_state_dict(torch.load(model_file, map_location=device))
    model.eval()

    # Step 3: Predict
    all_preds = []
    with torch.no_grad():
        for batch_x in test_loader:
            batch_x = batch_x.to(device)
            logits = model(batch_x)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()  # convert to 0/1
            all_preds.append(preds.cpu())

    # Step 4: Combine predictions
    pred_test = torch.cat(all_preds, dim=0).numpy()

    # Step 5: DataFrame + save
    pred_df = pd.DataFrame(pred_test, columns=label_cols).astype(int)
    output_df = pd.concat([test_df['comment_text'].reset_index(drop=True), pred_df], axis=1)

    filename = f"{model_name}_test_predictions.csv"
    output_df.to_csv(filename, index=False)
    print(f"✅ Saved predictions to {filename}")

# --- Run for TextCNN ---
predict_and_save(
    model_class=TextCNN,
    model_name="TextCNN",
    model_file="TextCNN_model.pth",
    test_padded=test_padded,
    test_df=test_df,
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    output_dim=output_dim,
    label_cols=label_cols
)

# --- Run for BiLSTM ---
predict_and_save(
    model_class=lambda vocab_size, emb_dim, output_dim: BiLSTM(vocab_size, emb_dim, hidden_dim=hidden_dim, output_dim=output_dim),
    model_name="BiLSTM",
    model_file="BiLSTM_model.pth",
    test_padded=test_padded,
    test_df=test_df,
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    output_dim=output_dim,
    label_cols=label_cols
)


Using device: cuda
✅ Saved predictions to TextCNN_test_predictions.csv
✅ Saved predictions to BiLSTM_test_predictions.csv
