In [16]:
import gc
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, accuracy_score,f1_score
from transformers import AutoModel
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset , DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BERT_Arch_CNN(nn.Module):

    def __init__(self):
        super(BERT_Arch_CNN, self).__init__()
#         self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.conv = nn.Conv2d(in_channels=13, out_channels=13, kernel_size=(3, 768), padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=3, stride=1)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(624, 2) # before : 442 with max_length 36 # 806 with max_length 64
        self.flat = nn.Flatten()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, all_layers):
        x = torch.transpose(torch.cat(tuple([t.unsqueeze(0) for t in all_layers]), 0), 0, 1)
        torch.cuda.empty_cache()
        x = self.pool(self.dropout(self.relu(self.conv(self.dropout(x)))))
        x = self.fc(self.dropout(self.flat(self.dropout(x))))
        return x

In [17]:
import datasets

train_dataset = datasets.load_dataset('social_bias_frames',split="train[:12000]")
valid_dataset = datasets.load_dataset('social_bias_frames',split="validation[:7000]")
test_dataset = datasets.load_dataset('social_bias_frames',split="test[:7000]")


# train_df = train_dataset.to_pandas()
# train_df = train_df[train_df['offensiveYN'] != '']
# train_df.loc[train_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

# train_dataset = datasets.Dataset.from_pandas(train_df)

# val_df = valid_dataset.to_pandas()
# val_df = val_df[val_df['offensiveYN'] != '']
# val_df.loc[val_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

# val_dataset = datasets.Dataset.from_pandas(val_df)

# test_df = test_dataset.to_pandas()
# test_df = test_df[test_df['offensiveYN'] != '']
# test_df['offensiveYN'] = test_df['offensiveYN'].round(0)
# test_df.loc[test_df['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'

# test_dataset = datasets.Dataset.from_pandas(test_df)

In [18]:
from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()
# y_train=label_encoder.fit_transform(train_dataset['offensiveYN'])

# label_encoder = LabelEncoder()
# y_val=label_encoder.fit_transform(val_dataset['offensiveYN'])

# label_encoder = LabelEncoder()
# y_test=label_encoder.fit_transform(test_dataset['offensiveYN'])

In [19]:
class HateDataset(Dataset):
    def __init__(self, dataset, tokenizer, model):
        dataset = dataset.to_pandas()
        dataset = dataset[dataset['offensiveYN'] != '']
        dataset.loc[dataset['offensiveYN'] == '0.5', 'offensiveYN'] = '1.0'
        dataset = dataset.groupby(['post','offensiveYN']).size().reset_index(name='counts')
        dataset = dataset.sort_values('counts', ascending=False).drop_duplicates('post')

        dataset = datasets.Dataset.from_pandas(dataset)
        
        label_encoder = LabelEncoder()
        
        self.label = label_encoder.fit_transform(dataset['offensiveYN'])
        
        self.post = dataset['post']
        
        self.tokenizer = tokenizer
        self.model = model

    def __len__(self):
        return len(self.label)
        
    def __getitem__(self, idx):
        # Tokenize the text
        tokenized_post = self.tokenizer(self.post[idx], return_tensors='pt',max_length=50, padding='max_length', truncation=True)
        
#         # Forward pass through the model
#         with torch.no_grad():
#             model_output = self.model(**tokenized_post)
        with torch.no_grad():
            all_layers = self.model(input_ids=tokenized_post['input_ids'], attention_mask=tokenized_post['attention_mask'], output_hidden_states=True)
        # Return label and last hidden state
        return self.label[idx], all_layers.hidden_states

In [20]:
from tqdm import tqdm
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

training_data = HateDataset(train_dataset,tokenizer,bert_model)
train_dataloader = DataLoader(training_data , batch_size=1 , shuffle=True)

validation_data = HateDataset(valid_dataset,tokenizer,bert_model)
val_dataloader = DataLoader(validation_data , batch_size=1 , shuffle=False)

testing_data = HateDataset(test_dataset,tokenizer,bert_model)
test_dataloader = DataLoader(testing_data , batch_size=1 , shuffle=False)

model = BERT_Arch()
model = model.to(device)

# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion=nn.CrossEntropyLoss()

for epoch in range(10):
    # Training phase
    model.train()
    train_running_loss = 0.0
    train_all_predictions = []
    train_all_labels = []

    for labels, all_layers in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{10} - Training"):
        labels = torch.tensor(labels)
        for i in range(len(all_layers)):
            all_layers[i] = all_layers[i].squeeze().unsqueeze(0)
        all_layers = torch.stack(all_layers)
        all_layers = all_layers.to(device)
        labels = labels.to(device)
        model.zero_grad()
        optimizer.zero_grad()

        outputs = model(all_layers)
#         print(outputs)
        one_hot_targets = torch.zeros(1, 2).to(device)
        one_hot_targets.scatter_(1, labels.unsqueeze(1), 1)
        
#         print(outputs)
#         print(one_hot_targets)
        
        loss = criterion(outputs.squeeze(), one_hot_targets.squeeze())
        loss.backward()
        optimizer.step()

        train_running_loss += loss.item()

        predicted = torch.argmax(outputs, dim=1)
        train_all_predictions.extend(predicted.cpu().tolist())
        train_all_labels.extend(labels.cpu().tolist())

    train_epoch_loss = train_running_loss / len(train_dataloader)
#     print(train_all_labels)
#     print(train_all_predictions)
    train_epoch_accuracy = accuracy_score(train_all_labels, train_all_predictions)
    train_epoch_f1 = f1_score(train_all_labels, train_all_predictions, average='macro')

    print(f"Epoch {epoch+1}/{10} - Training, Loss: {train_epoch_loss:.4f}, Accuracy: {train_epoch_accuracy:.4f}, F1: {train_epoch_f1:.4f}")

    # Validation phase
    model.eval()
    val_running_loss = 0.0
    val_all_predictions = []
    val_all_labels = []

    with torch.no_grad():
        for labels, all_layers in tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{10} - Validation"):
            labels = torch.tensor(labels)
            for i in range(len(all_layers)):
                all_layers[i] = all_layers[i].squeeze().unsqueeze(0)
            all_layers = torch.stack(all_layers)
            all_layers = all_layers.to(device)
            labels=labels.to(device)
#             inputs = torch.tensor(inputs[0])
#             labels = torch.tensor([labels[0]])
#             inputs = inputs.to(device)
#             labels = labels.to(device)
            outputs = model(all_layers)
            one_hot_targets = torch.zeros(1, 2).to(device)
            one_hot_targets.scatter_(1, labels.unsqueeze(1), 1)
            loss = criterion(outputs.squeeze(), one_hot_targets.squeeze())

            val_running_loss += loss.item()

            predicted = torch.argmax(outputs, dim=1)
            val_all_predictions.extend(predicted.cpu().tolist())
            val_all_labels.extend(labels.cpu().tolist())

        val_epoch_loss = val_running_loss / len(val_dataloader)
        val_epoch_accuracy = accuracy_score(val_all_labels, val_all_predictions)
        val_epoch_f1 = f1_score(val_all_labels, val_all_predictions, average='macro')

        print(f"Epoch {epoch+1}/{10} - Validation, Loss: {val_epoch_loss:.4f}, Accuracy: {val_epoch_accuracy:.4f}, F1: {val_epoch_f1:.4f}")


    torch.save(model.state_dict(), f'Model_BERT_CNN_{epoch+1}.pth')

  labels = torch.tensor(labels)
Epoch 1/10 - Training: 100%|██████████| 4083/4083 [06:36<00:00, 10.31it/s]


Epoch 1/10 - Training, Loss: 0.5729, Accuracy: 0.7120, F1: 0.6385


  labels = torch.tensor(labels)
Epoch 1/10 - Validation: 100%|██████████| 2037/2037 [03:13<00:00, 10.51it/s]


Epoch 1/10 - Validation, Loss: 0.6000, Accuracy: 0.6672, F1: 0.6666


  labels = torch.tensor(labels)
Epoch 2/10 - Training: 100%|██████████| 4083/4083 [06:32<00:00, 10.40it/s]


Epoch 2/10 - Training, Loss: 0.4722, Accuracy: 0.7891, F1: 0.7458


  labels = torch.tensor(labels)
Epoch 2/10 - Validation: 100%|██████████| 2037/2037 [03:13<00:00, 10.55it/s]


Epoch 2/10 - Validation, Loss: 0.5946, Accuracy: 0.6706, F1: 0.6706


  labels = torch.tensor(labels)
Epoch 3/10 - Training: 100%|██████████| 4083/4083 [06:35<00:00, 10.34it/s]


Epoch 3/10 - Training, Loss: 0.3950, Accuracy: 0.8246, F1: 0.7910


  labels = torch.tensor(labels)
Epoch 3/10 - Validation: 100%|██████████| 2037/2037 [03:14<00:00, 10.49it/s]


Epoch 3/10 - Validation, Loss: 0.6241, Accuracy: 0.6519, F1: 0.6515


  labels = torch.tensor(labels)
Epoch 4/10 - Training: 100%|██████████| 4083/4083 [06:32<00:00, 10.39it/s]


Epoch 4/10 - Training, Loss: 0.3243, Accuracy: 0.8651, F1: 0.8416


  labels = torch.tensor(labels)
Epoch 4/10 - Validation: 100%|██████████| 2037/2037 [03:13<00:00, 10.54it/s]


Epoch 4/10 - Validation, Loss: 0.6636, Accuracy: 0.6539, F1: 0.6504


  labels = torch.tensor(labels)
Epoch 5/10 - Training: 100%|██████████| 4083/4083 [06:34<00:00, 10.34it/s]


Epoch 5/10 - Training, Loss: 0.2520, Accuracy: 0.9033, F1: 0.8876


  labels = torch.tensor(labels)
Epoch 5/10 - Validation: 100%|██████████| 2037/2037 [03:13<00:00, 10.53it/s]


Epoch 5/10 - Validation, Loss: 0.6327, Accuracy: 0.6848, F1: 0.6848


  labels = torch.tensor(labels)
Epoch 6/10 - Training: 100%|██████████| 4083/4083 [06:32<00:00, 10.41it/s]


Epoch 6/10 - Training, Loss: 0.1953, Accuracy: 0.9329, F1: 0.9230


  labels = torch.tensor(labels)
Epoch 6/10 - Validation: 100%|██████████| 2037/2037 [03:12<00:00, 10.60it/s]


Epoch 6/10 - Validation, Loss: 0.7355, Accuracy: 0.6573, F1: 0.6538


  labels = torch.tensor(labels)
Epoch 7/10 - Training: 100%|██████████| 4083/4083 [06:32<00:00, 10.41it/s]


Epoch 7/10 - Training, Loss: 0.1503, Accuracy: 0.9513, F1: 0.9446


  labels = torch.tensor(labels)
Epoch 7/10 - Validation: 100%|██████████| 2037/2037 [03:12<00:00, 10.60it/s]


Epoch 7/10 - Validation, Loss: 0.8478, Accuracy: 0.6451, F1: 0.6367


  labels = torch.tensor(labels)
Epoch 8/10 - Training: 100%|██████████| 4083/4083 [06:35<00:00, 10.33it/s]


Epoch 8/10 - Training, Loss: 0.1134, Accuracy: 0.9709, F1: 0.9670


  labels = torch.tensor(labels)
Epoch 8/10 - Validation: 100%|██████████| 2037/2037 [03:12<00:00, 10.57it/s]


Epoch 8/10 - Validation, Loss: 0.8069, Accuracy: 0.6642, F1: 0.6641


  labels = torch.tensor(labels)
Epoch 9/10 - Training: 100%|██████████| 4083/4083 [06:35<00:00, 10.32it/s]


Epoch 9/10 - Training, Loss: 0.0906, Accuracy: 0.9758, F1: 0.9727


  labels = torch.tensor(labels)
Epoch 9/10 - Validation: 100%|██████████| 2037/2037 [03:16<00:00, 10.39it/s]


Epoch 9/10 - Validation, Loss: 0.7736, Accuracy: 0.6878, F1: 0.6876


  labels = torch.tensor(labels)
Epoch 10/10 - Training: 100%|██████████| 4083/4083 [06:36<00:00, 10.30it/s]


Epoch 10/10 - Training, Loss: 0.0746, Accuracy: 0.9799, F1: 0.9774


  labels = torch.tensor(labels)
Epoch 10/10 - Validation: 100%|██████████| 2037/2037 [03:12<00:00, 10.58it/s]


Epoch 10/10 - Validation, Loss: 0.8811, Accuracy: 0.6637, F1: 0.6636
