In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import BertTokenizer, BertModel
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

PREPROCESSING STEPS


In [43]:
# Load the raw CSV, discard the columns that are not needed, and rename the
# text/label columns to the names used by the rest of the notebook.
dropped = ['processed', 'offensiveness_score']
rename = {'txt': 'comment', 'isOffensive': 'label'}
dataset = (
    pd.read_csv('traindata.csv')
    .drop(columns=dropped)
    .rename(columns=rename)
)
dataset.head()

Unnamed: 0,comment,label
0,> The difference in average earnings between m...,0
1,"The myth is that the ""gap"" is entirely based o...",0
2,The assertion is that women get paid less for ...,0
3,You said in the OP that's not what they're mea...,0
4,>Men and women are not payed less for the same...,0


In [44]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split
# identical across kernel restarts.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [45]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint. It is kept as a
# module-level global because the preprocessing and embedding functions read it directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

PREPROCESSING: TOKENIZATION AND TENSOR CONVERSION

In [46]:
def preprocess_comments(comment, max_length=128, bert_tokenizer=None):
    """Convert one comment into a fixed-length list of BERT token ids.

    The comment is tokenized, wrapped in ``[CLS]`` ... ``[SEP]``, converted to ids and
    right-padded to ``max_length``. Over-long comments are truncated *before* the special
    tokens are added, so the sequence always ends with ``[SEP]`` (slicing after adding
    them would cut the ``[SEP]`` off).

    Args:
        comment: Text of the comment.
        max_length: Length of the returned id list, including the two special tokens.
        bert_tokenizer: Optional tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_ids``. Defaults to the notebook-level ``tokenizer``.

    Returns:
        A list of exactly ``max_length`` integer token ids.

    Raises:
        ValueError: If ``max_length`` is smaller than 2, which leaves no room for the
            ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold [CLS] and [SEP]")

    tok = tokenizer if bert_tokenizer is None else bert_tokenizer

    # Reserve two positions for the special tokens so truncation never drops [SEP].
    tokens = tok.tokenize(comment)[:max_length - 2]
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    # Convert tokens to token IDs
    token_ids = tok.convert_tokens_to_ids(tokens)

    # Pad with the tokenizer's own padding id; fall back to 0 when it does not define one.
    pad_id = getattr(tok, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    return token_ids + [pad_id] * (max_length - len(token_ids))

# Tokenize every comment into fixed-length BERT input ids and normalise the labels.
# `.assign` builds new frames instead of writing into the slices returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
#
# The label column (renamed from `isOffensive`) is already numeric 0/1, so it is only
# cast to int. Comparing it against the string 'toxic' would map every row to 0 and
# leave the model with a single class.
train_data = train_data.assign(
    input_ids=train_data['comment'].apply(preprocess_comments),
    label=train_data['label'].astype(int),
)
test_data = test_data.assign(
    input_ids=test_data['comment'].apply(preprocess_comments),
    label=test_data['label'].astype(int),
)

# Convert the preprocessed data into torch tensors
train_input_ids = torch.tensor(train_data['input_ids'].tolist())
train_labels = torch.tensor(train_data['label'].tolist())

test_input_ids = torch.tensor(test_data['input_ids'].tolist())
test_labels = torch.tensor(test_data['label'].tolist())

# Pair each row of input ids with its label
train_dataset = TensorDataset(train_input_ids, train_labels)
test_dataset = TensorDataset(test_input_ids, test_labels)

# Define batch size and create data loaders; only the training loader is shuffled
batch_size = 32
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

BERT EMBEDDINGS


In [47]:
# Cache of loaded BERT encoders keyed by checkpoint name, so the weights are loaded
# once per kernel instead of once per comment.
_BERT_MODELS = {}


def _get_bert_model(name='bert-base-uncased'):
    """Return the cached BertModel for `name`, loading it on first use."""
    if name not in _BERT_MODELS:
        bert = BertModel.from_pretrained(name)
        bert.eval()  # inference only: make sure dropout is disabled
        _BERT_MODELS[name] = bert
    return _BERT_MODELS[name]


def generate_embeddings(comment, max_length=512):
    """Return the contextual BERT embeddings of a single comment.

    Args:
        comment: Text of the comment.
        max_length: Maximum number of token ids (special tokens included) fed to BERT.
            Longer comments are truncated; 'bert-base-uncased' cannot process more
            than 512 positions.

    Returns:
        The model's ``last_hidden_state`` with the batch dimension of size one removed,
        i.e. one embedding vector per input token.
    """
    tokens = tokenizer.encode(
        comment,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    tokens_tensor = torch.tensor([tokens])  # add the batch dimension
    model = _get_bert_model()
    with torch.no_grad():
        outputs = model(tokens_tensor)
    return outputs.last_hidden_state.squeeze(0)

USING ADVERSARIAL TRAINING - DISCRIMINATOR AND VAE MODEL


In [56]:
class VAE(nn.Module):
    """Variational autoencoder built from two small fully connected networks.

    The encoder maps a ``hidden_size``-wide vector to ``2 * latent_size`` values that
    are split into a mean and a log-variance. The decoder maps a sampled latent vector
    back to ``hidden_size`` values, squashed into (0, 1) by a sigmoid.
    """

    def __init__(self, hidden_size, latent_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.latent_size = latent_size

        # First half of the encoder output is the mean, second half the log-variance.
        encoder_layers = [
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, 2 * latent_size),
        ]
        decoder_layers = [
            nn.Linear(latent_size, 256),
            nn.ReLU(),
            nn.Linear(256, hidden_size),
            nn.Sigmoid(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def reparameterize(self, mean, logvar):
        """Sample ``mean + noise * std`` with standard-normal noise, where ``std = exp(logvar / 2)``."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mean + noise * sigma

    def encode(self, x):
        """Flatten ``x`` to rows of ``hidden_size`` values and return ``(z, mean, logvar)``."""
        flat = x.view(-1, self.hidden_size)
        mean, logvar = self.encoder(flat).split(self.latent_size, dim=1)
        return self.reparameterize(mean, logvar), mean, logvar

    def decode(self, z):
        """Map latent vectors ``z`` back to reconstructions through the decoder."""
        return self.decoder(z)

    def forward(self, x):
        """Encode ``x``, sample a latent vector, and return ``(x_hat, mean, logvar)``."""
        z, mean, logvar = self.encode(x)
        return self.decode(z), mean, logvar

class Discriminator(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid.

    The final sigmoid keeps every output in (0, 1).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the four stages to ``x`` in order and return the sigmoid output."""
        activated = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(activated))

# Layer sizes: the discriminator takes `input_size` features and emits `output_size`
# values; `hidden_size` is both its hidden width and the VAE's input/output width.
input_size, hidden_size, output_size = 128, 128, 1

# Instantiate the discriminator first and the VAE second
discriminator = Discriminator(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
vae = VAE(hidden_size=hidden_size, latent_size=32)

# Binary cross-entropy loss for the discriminator's sigmoid output
adversarial_loss = nn.BCELoss()

# One Adam optimizer per network, both with the same learning rate
discriminator_optimizer = optim.Adam(params=discriminator.parameters(), lr=1e-3)
vae_optimizer = optim.Adam(params=vae.parameters(), lr=1e-3)


BUILDING MODEL