<a href="https://colab.research.google.com/github/ZeroxTM/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media/blob/main/BertCnnFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers==3.0.0 emoji

import gc
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertModel, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

# Device setup: prefer CUDA (e.g. Colab GPUs), then Apple-silicon MPS, then CPU.
# `torch.backends.mps` is looked up defensively because older PyTorch builds
# do not ship that backend module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Define the BERT-based model architecture
class BERT_Arch(nn.Module):
    """BERT encoder followed by a small CNN over the stack of hidden states.

    The hidden states of all BERT layers are stacked as the channels of a 2-D
    input of shape (batch, num_layers, seq_len, hidden_size). A convolution
    spanning the full hidden width, a max-pool along the sequence axis and a
    linear layer turn that stack into per-class log-probabilities.

    Args:
        bert: Module called as ``bert(sent_id, attention_mask=mask,
            output_hidden_states=True)``. Its output must expose the hidden
            states either as a ``hidden_states`` attribute or as the third
            element of a tuple.
        max_length: Sequence length of the inputs. The classifier width
            depends on it; the default of 36 yields the original 442 features.
        num_classes: Number of output classes.
        num_layers: Number of hidden-state tensors returned by ``bert``
            (13 for bert-base: embeddings plus 12 encoder layers).
        hidden_size: Width of each hidden state (768 for bert-base).
    """

    def __init__(self, bert, max_length=36, num_classes=3, num_layers=13, hidden_size=768):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # The kernel covers the whole hidden dimension, so the last axis
        # collapses to 1; padding (1, 0) keeps the sequence length unchanged.
        self.conv = nn.Conv2d(
            in_channels=num_layers,
            out_channels=num_layers,
            kernel_size=(3, hidden_size),
            padding=(1, 0),
        )
        self.relu = nn.ReLU()
        # Kernel 3 with stride 1 and no padding shortens the sequence axis by 2.
        self.pool = nn.MaxPool2d(kernel_size=(3, 1), stride=(1, 1))
        self.dropout = nn.Dropout(0.1)
        self.flat = nn.Flatten()
        # Flattened size: num_layers channels * (max_length - 2) positions * 1.
        self.fc = nn.Linear(num_layers * (max_length - 2), num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch, num_classes).

        Args:
            sent_id: Token id tensor passed to ``bert`` as first argument.
            mask: Attention mask passed to ``bert`` as ``attention_mask``.
        """
        # Get all hidden states
        outputs = self.bert(sent_id, attention_mask=mask, output_hidden_states=True)
        all_layers = getattr(outputs, "hidden_states", None)
        if all_layers is None:
            # Tuple-style output: (sequence_output, pooled_output, hidden_states).
            all_layers = outputs[2]

        # Stack the per-layer states as channels: (batch, layers, seq, hidden).
        x = torch.stack(tuple(all_layers), dim=1)
        del all_layers, outputs
        if x.device.type == "mps":
            # The MPS cache can only be flushed when that backend is in use;
            # calling it on other devices is an error, and the garbage
            # collection pass is wasted time there.
            gc.collect()
            torch.mps.empty_cache()

        # CNN and fully connected layers
        x = self.conv(self.dropout(x))
        x = self.pool(self.dropout(self.relu(x)))
        x = self.flat(self.dropout(x))
        x = self.fc(self.dropout(x))
        return self.softmax(x)

# Preprocessing functions
def read_dataset(path="labeled_data.csv"):
    """Load the labelled tweet CSV and return its texts and class labels.

    Only the ``tweet`` and ``class`` columns are used; any other columns in
    the file are ignored, so they do not have to be present.

    Args:
        path: Location of the CSV file. Defaults to ``labeled_data.csv`` in
            the working directory.

    Returns:
        A ``(tweets, labels)`` pair: ``tweets`` is a list with the values of
        the ``tweet`` column and ``labels`` is the ``class`` column as a
        pandas Series.

    Raises:
        KeyError: If the ``tweet`` or ``class`` column is missing.
    """
    data = pd.read_csv(path)[['tweet', 'class']]
    print(f"Dataset size: {len(data)}")
    return data['tweet'].tolist(), data['class']

# Any run of characters that are not lowercase ASCII letters. Compiled once at
# import time instead of on every loop iteration.
_NON_LETTER_RUN = re.compile(r"[^a-z]+")


def pre_process_dataset(values):
    """Normalize raw texts to lowercase ASCII words separated by single spaces.

    Each text is lowercased, every run of non-letter characters (digits,
    punctuation, whitespace, non-ASCII symbols) becomes one space, and
    leading/trailing spaces are removed. This single pass gives the same
    result as the earlier chain of substitutions, which replaced punctuation
    with spaces and then collapsed the spaces.

    Args:
        values: Iterable of texts. Entries that are not strings (for example
            NaN from an empty CSV cell) are mapped to an empty string so the
            output stays aligned with the input.

    Returns:
        A list of cleaned strings, one per input entry.
    """
    return [
        _NON_LETTER_RUN.sub(" ", value.lower()).strip() if isinstance(value, str) else ""
        for value in values
    ]

def data_process(data, labels, max_length=36, tokenizer=None):
    """Tokenize texts into fixed-length BERT inputs and tensorize the labels.

    All texts are encoded in one batched tokenizer call rather than one call
    per sentence; padding to ``max_length`` with truncation gives every row
    the same length.

    Args:
        data: Iterable of text strings.
        labels: Integer class labels, as a pandas Series or any sequence
            accepted by ``torch.tensor``.
        max_length: Number of tokens per encoded row. The model's classifier
            width depends on this value, so change both together.
        tokenizer: Optional callable with the Hugging Face tokenizer call
            interface. When omitted, the ``bert-base-uncased`` tokenizer is
            loaded.

    Returns:
        ``(input_ids, attention_masks, labels)``: the first two are the
        tokenizer's ``input_ids`` and ``attention_mask`` tensors, the third
        is a ``torch.long`` tensor of the labels.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    encoded = tokenizer(
        list(data),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    label_values = labels.values if isinstance(labels, pd.Series) else labels
    # Classification losses expect integer (long) class indices.
    labels = torch.tensor(label_values, dtype=torch.long)

    return input_ids, attention_masks, labels

def load_and_process():
    """Run the data pipeline end to end.

    Reads the dataset, cleans the tweet texts and converts texts and labels
    into tensors.

    Returns:
        Whatever ``data_process`` returns for the cleaned tweets and their
        labels: ``(input_ids, attention_masks, labels)``.
    """
    tweets, classes = read_dataset()
    cleaned_tweets = pre_process_dataset(tweets)
    tensors = data_process(cleaned_tweets, classes)
    return tensors

# Training and evaluation functions
def train():
    """Run one training pass over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``cross_entropy``,
    ``train_dataloader`` and ``device``.

    Returns:
        A pair ``(mean_loss, predictions)``: the summed batch losses divided
        by the number of batches, and the model outputs of all batches
        concatenated on the CPU.
    """
    model.train()
    running_loss = 0
    batch_outputs = []

    for batch in train_dataloader:
        # Move the three tensors of the batch to the training device.
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before updating the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_outputs.append(preds.detach().cpu())

    mean_loss = running_loss / len(train_dataloader)
    return mean_loss, torch.cat(batch_outputs)

def evaluate():
    """Score the model on ``val_dataloader`` without updating weights.

    Uses the module-level ``model``, ``cross_entropy``, ``val_dataloader``
    and ``device``. The model is switched to evaluation mode and stays in it.

    Returns:
        A pair ``(mean_loss, predictions)``: the summed batch losses divided
        by the number of batches, and the model outputs of all batches
        concatenated on the CPU, in dataloader order.
    """
    model.eval()
    running_loss = 0
    batch_outputs = []

    # Gradient tracking is not needed for scoring.
    with torch.no_grad():
        for batch in val_dataloader:
            sent_id, mask, labels = (tensor.to(device) for tensor in batch)

            preds = model(sent_id, mask)
            running_loss += cross_entropy(preds, labels).item()
            batch_outputs.append(preds.detach().cpu())

    mean_loss = running_loss / len(val_dataloader)
    return mean_loss, torch.cat(batch_outputs)

# Main script
input_ids, attention_masks, labels = load_and_process()

# Split the dataset 80/10/10 into train / validation / test.
# Inputs, masks and labels go through the same call so the rows stay aligned
# by construction. The test split is new: the final report below previously
# referenced test tensors that were never created.
(train_inputs, test_seq,
 train_masks, test_mask,
 train_labels, test_y) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.1, random_state=42)
# 1/9 of the remaining 90% is 10% of the full dataset.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    train_inputs, train_masks, train_labels, test_size=1 / 9, random_state=42)

# Create DataLoaders
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
test_dataset = TensorDataset(test_seq, test_mask, test_y)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)
test_dataloader = DataLoader(test_dataset, batch_size=32)

# Initialize the model
bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model = BERT_Arch(bert).to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The model already ends in LogSoftmax, so the matching loss is NLLLoss;
# CrossEntropyLoss would apply log-softmax a second time.
cross_entropy = nn.NLLLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    train_loss, _ = train()
    val_loss, _ = evaluate()
    print(f"Train Loss: {train_loss}, Validation Loss: {val_loss}")

# Evaluation on validation data
val_loss, val_preds = evaluate()
val_preds = torch.argmax(val_preds, dim=1).numpy()
val_true = val_labels.numpy()

print("\nValidation Performance:")
print(classification_report(val_true, val_preds))
print(f"Accuracy: {accuracy_score(val_true, val_preds):.4f}")
if device.type == 'mps':
    # The MPS cache can only be flushed when that backend is in use.
    torch.mps.empty_cache()

# Evaluation on the held-out test data, in batches so the whole test set
# never has to fit on the device in a single forward pass.
model.eval()
test_outputs = []
with torch.no_grad():
    for sent_id, mask, _ in test_dataloader:
        batch_preds = model(sent_id.to(device), mask.to(device))
        test_outputs.append(batch_preds.detach().cpu())
preds = torch.cat(test_outputs).numpy()
test_true = test_y.numpy()

print("Performance:")
# model's performance
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_true, preds))
print(f"Accuracy: {accuracy_score(test_true, preds):.4f}")

Collecting transformers==3.0.0
  Using cached transformers-3.0.0-py3-none-any.whl.metadata (44 kB)
Collecting emoji
  Using cached emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting tokenizers==0.8.0-rc4 (from transformers==3.0.0)
  Using cached tokenizers-0.8.0rc4.tar.gz (96 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting sentencepiece (from transformers==3.0.0)
  Using cached sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting sacremoses (from transformers==3.0.0)
  Using cached sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Using cached transformers-3.0.0-py3-none-any.whl (754 kB)
Using cached emoji-2.14.0-py3-none-any.whl (586 kB)
Using cached sacremoses-0.1.1-py3-none-any.whl (897 kB)
Using cached sentencepiece-0.2.0-cp312-cp312-macosx_11_0_arm64.whl (1.2 MB)
Building wheels for collected packages: tokenizer

In [None]:
import torch

# Check for an accelerator: CUDA first, then Apple-silicon MPS, else CPU.
# `torch.backends.mps` is looked up defensively because older PyTorch builds
# do not ship that backend module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")