# 0. Install Dependencies and Bring in Data

In [2]:
# !pip install torch

In [3]:
import os
import pandas as pd
import numpy as np
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split



In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# Folder that holds train.csv. Set the TOXIC_DATA_DIR environment variable to
# run the notebook on another machine; the fallback is the original location.
# Note that the fallback folder is itself named "train.csv", so the file that
# gets read by default is ...\train.csv\train.csv.
DATA_DIR = os.environ.get(
    'TOXIC_DATA_DIR',
    r'C:\Users\Epein\Desktop\CommentToxicity-main\jigsaw-toxic-comment-classification-challenge\train.csv',
)
df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# 1. Preprocess

In [6]:
# Model input: the raw comment strings.
X = df['comment_text'].values
# Targets: every column from position 2 onward (the columns that follow
# `id` and `comment_text` in the frame shown above).
y = df.iloc[:, 2:].values

In [7]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
MAX_FEATURES = 200_000

In [8]:
# Tokenizer used both to build the vocabulary and to encode comments;
# "basic_english" selects one of torchtext's built-in tokenizers.
tokenizer = get_tokenizer("basic_english")

In [9]:
def yield_tokens(data_iter):
    """Lazily tokenize each text in ``data_iter``.

    Args:
        data_iter: Iterable of raw text strings.

    Yields:
        The result of calling the module-level ``tokenizer`` on each text,
        in input order.
    """
    yield from (tokenizer(sample) for sample in data_iter)

In [10]:
# Creates a lazy token generator over all comments; nothing is tokenized until
# it is consumed. NOTE(review): the cells shown here never consume `iterator`
# (the vocabulary cell builds its own generator) — confirm whether it is needed.
iterator = yield_tokens(X)

In [11]:
from collections import Counter
from torchtext.vocab import Vocab
from tqdm import tqdm

In [12]:
# Build the token -> index mapping from the training comments. The cap comes
# from MAX_FEATURES (instead of a repeated literal) so the vocabulary size and
# the embedding table sized from the same constant cannot drift apart.
# "<unk>" is the only special token requested.
vocab = build_vocab_from_iterator(yield_tokens(X), specials=["<unk>"], max_tokens=MAX_FEATURES)
# Tokens missing from the vocabulary resolve to the "<unk>" index instead of raising.
vocab.set_default_index(vocab["<unk>"])

In [13]:
import torchtext

# Record the installed torchtext version alongside the results.
print(f"TorchText Version: {torchtext.__version__}")


TorchText Version: 0.18.0


In [14]:
# Fixed number of token ids per comment after truncation / padding.
MAX_SEQ_LENGTH = 1_800

In [15]:
def numericalize_and_pad(text):
    """Encode ``text`` as exactly ``MAX_SEQ_LENGTH`` vocabulary indices.

    The text is tokenized with the module-level ``tokenizer`` and each token is
    looked up in ``vocab``. Longer sequences are cut to ``MAX_SEQ_LENGTH``;
    shorter ones are right-padded with index 0.

    Args:
        text: Raw comment string.

    Returns:
        A list of ``MAX_SEQ_LENGTH`` integer indices.
    """
    ids = [vocab[piece] for piece in tokenizer(text)][:MAX_SEQ_LENGTH]
    # After truncation the shortfall is never negative, so this only appends.
    ids.extend([0] * (MAX_SEQ_LENGTH - len(ids)))
    return ids

In [16]:
# Encode every comment straight into a preallocated int64 tensor. Building a
# nested Python list of len(X) * MAX_SEQ_LENGTH integers first and converting
# it afterwards keeps both copies alive at the same time.
vectorized_text = torch.zeros((len(X), MAX_SEQ_LENGTH), dtype=torch.long)
for row_idx, text in enumerate(X):
    vectorized_text[row_idx] = torch.tensor(numericalize_and_pad(text), dtype=torch.long)

# as_tensor accepts both the original array and an already-converted tensor,
# so re-running this cell does not trigger the copy-construct warning that
# torch.tensor() emits when handed a tensor.
y = torch.as_tensor(y, dtype=torch.float32)

In [17]:
# Pair each encoded comment with its label row so both are indexed together.
dataset = TensorDataset(vectorized_text, y)

In [18]:
# 70 % train / 20 % validation / remainder test. The remainder absorbs the
# rounding from int(), so the three sizes always sum to len(dataset).
SPLIT_SEED = 42
train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size
# A dedicated seeded generator makes the split membership reproducible across
# kernel restarts without touching the global RNG state.
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [19]:
BATCH_SIZE = 16

# One loader per split, all with the same batch size; only the training
# loader reshuffles its samples.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=reshuffle)
    for split, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# 2. Create the Bidirectional LSTM Model

In [20]:
import torch.nn as nn
import torch.optim as optim

In [21]:
class ToxicCommentModel(nn.Module):
    """Bidirectional-LSTM classifier mapping token-id sequences to per-label probabilities.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each token embedding.
        lstm_size: Hidden size of each LSTM direction.
        output_size: Number of values predicted per sample.
    """

    def __init__(self, vocab_size, embed_size, lstm_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, lstm_size, batch_first=True, bidirectional=True)
        # The two directions are concatenated, hence lstm_size * 2 input features.
        self.fc1 = nn.Linear(lstm_size * 2, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, output_size)
        # Kept so the module's attributes are unchanged; forward() does not apply it.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return sigmoid probabilities for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, indexed as (batch, sequence).

        Returns:
            Float tensor of shape (batch, output_size) with values in [0, 1].
        """
        embedded = self.embedding(x)
        # Use the final hidden state of each direction. Slicing the output
        # sequence at the last time step would pair the forward direction's
        # final state with the reverse direction's state after a single step,
        # i.e. before it has read anything but the last token.
        _, (h_n, _) = self.lstm(embedded)
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)  # (batch, 2 * lstm_size)
        hidden = torch.relu(self.fc1(features))
        hidden = torch.relu(self.fc2(hidden))
        hidden = torch.relu(self.fc3(hidden))
        return torch.sigmoid(self.fc4(hidden))


In [22]:
# Hyper-parameters for the network.
MAX_FEATURES = 200_000
EMBED_SIZE = LSTM_SIZE = 32
OUTPUT_SIZE = y.shape[1]  # one output per label column

# Arguments follow the constructor order:
# vocab_size, embed_size, lstm_size, output_size.
model = ToxicCommentModel(MAX_FEATURES + 1, EMBED_SIZE, LSTM_SIZE, OUTPUT_SIZE)

In [23]:
# Loss and optimizer
# Adam over all model parameters, with binary cross-entropy applied to the
# probabilities the model outputs.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [26]:
def evaluate_model(model, val_loader):
    """
    Compute the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and is left in that mode when the
    function returns. The loss is the module-level ``criterion``.

    Args:
        model: The PyTorch model to be evaluated.
        val_loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    batch_losses = []
    # No gradients are needed for scoring.
    with torch.no_grad():
        for inputs, targets in val_loader:
            batch_losses.append(criterion(model(inputs), targets).item())
    return sum(batch_losses) / len(val_loader)


In [None]:
from tqdm import tqdm  # Install tqdm package if not installed

num_epochs = 50  # Upper bound on epochs; early stopping may end training sooner
patience = 5  # Number of epochs to wait for improvement

best_val_loss = float('inf')  # Initialize best validation loss to positive infinity
# Initialised up front: if the very first validation loss is not an
# improvement (e.g. it is NaN), the else-branch below must still find a counter.
wait_count = 0

for epoch in range(num_epochs):
    # evaluate_model() switches the model to eval mode, so training mode has
    # to be re-enabled at the start of every epoch, not just once before the loop.
    model.train()
    running_loss = 0.0

    loop = tqdm(train_loader, leave=True)  # Initialize tqdm progress bar
    for batch_X, batch_y in loop:
        # Training steps
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Validation step
    val_loss = evaluate_model(model, val_loader)

    # Print the epoch summary before the early-stopping check so the losses of
    # the epoch that triggers the stop are reported too.
    avg_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Reset wait counter if validation loss improves
        wait_count = 0
    else:
        wait_count += 1
        if wait_count >= patience:
            print(f'Early stopping at epoch {epoch + 1} due to no improvement in validation loss for {patience} epochs.')
            break


In [None]:
# Persist only the learned parameters (the state dict), written to
# 'model.pth' relative to the current working directory.
torch.save(model.state_dict(), 'model.pth')

# 3. Make Predictions

In [None]:
# Function to make predictions
def predict(text):
    """Classify a single comment with the module-level ``model``.

    Args:
        text: Raw comment string.

    Returns:
        An int32 tensor of shape (1, number of model outputs) holding 1 where
        the predicted probability is above 0.5 and 0 otherwise.
    """
    model.eval()
    # Wrapping the id list in another list yields a batch of one sequence.
    batch = torch.tensor([numericalize_and_pad(text)])
    with torch.no_grad():
        probabilities = model(batch)
    return (probabilities > 0.5).to(torch.int32)

# Example prediction
# Smoke-test the trained model on one hand-written comment; the printed
# tensor has one 0/1 flag per model output.
input_text = 'You freaking suck! I am going to hit you.'
prediction = predict(input_text)
print(prediction)


tensor([[0, 0, 0, 0, 0, 0]], dtype=torch.int32)


# 4. Evaluate Model

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [None]:
model.eval()
y_true = []
y_pred = []

# Collect thresholded predictions and ground-truth rows for the whole test split.
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        preds = (outputs > 0.5).int()
        y_true.extend(batch_y.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

# Flatten to one 0/1 decision per (sample, label) cell. The targets are cast
# to int so both arrays share an integer dtype.
y_true = np.array(y_true).astype(int).flatten()
y_pred = np.array(y_pred).flatten()

# On flattened 0/1 arrays, average='micro' pools both classes, which makes
# precision and recall collapse to plain accuracy. average='binary' scores the
# positive class only. zero_division=0 returns 0 (without a warning) when
# nothing is predicted positive.
precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
# Per-cell accuracy: the fraction of individual label decisions that are correct.
accuracy = accuracy_score(y_true, y_pred)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}')

NameError: name 'np' is not defined