In [2]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem so that the project files
# stored on Drive can be reached by path from the following cells.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import zipfile
import os

# Root of the project checkout on the mounted Google Drive.
project_path="/content/drive/MyDrive/Project/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media"
# Make the project directory the working directory so that relative paths
# (e.g. "labeled_data.csv", read later by read_dataset) resolve inside it.
os.chdir(project_path)
print("current dir",os.getcwd())

current dir /content/drive/MyDrive/Project/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media


In [4]:

# Sanity check: list what is in the current working directory so it is easy to
# confirm that the dataset and the project's .py files are present.
files = os.listdir('.')
print("Files in directory:", files)

Files in directory: ['LICENSE', 'BertCnnFinal.ipynb', 'README.md', 'BERT-CNN-Demo.mp4', 'labeled_data.csv', 'Images', '.ipynb_checkpoints', '.git', '__pycache__', 'Pre_Process.py', 'Model.py', 'BertCNN.py', 'Untitled3.ipynb']


In [5]:
import sys
# Make the project directory importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
_project_dir = '/content/drive/MyDrive/Project/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media'
if _project_dir not in sys.path:
    sys.path.append(_project_dir)


In [6]:
import importlib.util

# Absolute path of the project's Model.py on the mounted Drive.
model_path = "/content/drive/MyDrive/Project/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media/Model.py"

# Load Model.py directly from that path rather than through a regular
# `import` statement: build a module spec named "Model", create an empty
# module object from the spec, then execute the file's code inside it.
spec = importlib.util.spec_from_file_location("Model", model_path)
Model = importlib.util.module_from_spec(spec)
spec.loader.exec_module(Model)

# Expose the module's BERT_CNN attribute under a short top-level name.
# NOTE(review): BERT_CNN is not referenced again in the cells shown here;
# the training cell defines and uses its own BERT_Arch class instead.
BERT_CNN = Model.BERT_CNN


In [7]:
# Absolute path of the project's Pre_Process.py on the mounted Drive.
# Kept in its own variable so that `model_path` keeps pointing at Model.py.
pre_process_path = "/content/drive/MyDrive/Project/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media/Pre_Process.py"

# Load Pre_Process.py directly from its path. The spec is created under the
# module's own name ("Pre_Process") so that the module's __name__, its repr
# and any tracebacks identify the right file instead of claiming to be "Model".
spec1 = importlib.util.spec_from_file_location("Pre_Process", pre_process_path)
Pre_Process = importlib.util.module_from_spec(spec1)
spec1.loader.exec_module(Pre_Process)

# Expose the module's load_and_process attribute under a short top-level name.
# NOTE(review): the training cell later defines its own load_and_process(),
# which replaces this binding once that cell has run.
load_and_process = Pre_Process.load_and_process


In [8]:
# pip install emoji

In [17]:
import gc
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertModel, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader

# Device setup: use the GPU when PyTorch reports one as available, else the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")

# Define the BERT-large model architecture
class BERT_Arch(nn.Module):
    """BERT encoder followed by a small CNN head over all hidden-state layers.

    The hidden states of every encoder layer (plus the embedding output) are
    stacked and treated as the input channels of a 2-D convolution whose
    kernel spans the full hidden dimension. The result is max-pooled,
    flattened and fed to a linear classifier.

    Args:
        bert: Encoder module. It is called as
            ``bert(sent_id, attention_mask=mask, output_hidden_states=True)``
            and must return an object with a ``hidden_states`` tuple. If it
            has a ``config`` with ``hidden_size`` / ``num_hidden_layers``,
            those values size the CNN; otherwise the bert-large values
            (1024 hidden units, 24 layers) are assumed.
        seq_length: Token length of the inputs the model will receive.
            Must match the tokenizer's ``max_length``. Defaults to 36.
        num_classes: Number of output classes. Defaults to 3.

    Raises:
        ValueError: If ``seq_length`` is smaller than 3 (the pooling window).
    """

    def __init__(self, bert, seq_length=36, num_classes=3):
        super(BERT_Arch, self).__init__()
        if seq_length < 3:
            raise ValueError("seq_length must be at least 3 (pooling kernel height)")
        self.bert = bert

        # Size the head from the encoder's own config when it exposes one, so
        # the class also works with encoders other than bert-large.
        config = getattr(bert, "config", None)
        self._hidden_size = getattr(config, "hidden_size", 1024)
        # hidden_states holds the embedding output plus one entry per layer.
        self._num_channels = getattr(config, "num_hidden_layers", 24) + 1
        self._seq_length = seq_length
        self._num_classes = num_classes

        # CNN layers: the kernel covers 3 tokens and the whole hidden
        # dimension, so the width collapses to 1 after the convolution.
        self.conv = nn.Conv2d(
            in_channels=self._num_channels,
            out_channels=self._num_channels,
            kernel_size=(3, self._hidden_size),
            padding=(1, 0),
        )
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=(3, 1), stride=(1, 1))
        self.dropout = nn.Dropout(0.1)

        # Dynamically calculate the flattened size for the fc layer
        self._initialize_fc_layer()

    def _initialize_fc_layer(self):
        """Create the flatten / linear / log-softmax layers.

        The input width of the linear layer is measured by pushing a single
        zero-filled sample through the convolution and pooling layers.
        """
        # Shape probe only: no gradients are needed, and dropout is skipped
        # because it never changes the tensor shape.
        with torch.no_grad():
            dummy_input = torch.zeros(
                1, self._num_channels, self._seq_length, self._hidden_size
            )  # (batch_size, in_channels, seq_length, hidden_size)
            dummy_output = self.pool(self.relu(self.conv(dummy_input)))
        flattened_size = dummy_output.numel()  # batch size is 1, so this is per-sample
        self.flat = nn.Flatten()
        self.fc = nn.Linear(flattened_size, self._num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities of shape (batch_size, num_classes).

        Args:
            sent_id: Token id tensor passed to the encoder as its first argument.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        # Get all hidden states (embedding output + one per encoder layer)
        outputs = self.bert(sent_id, attention_mask=mask, output_hidden_states=True)
        all_layers = torch.stack(outputs.hidden_states, dim=0)  # (layers, batch_size, seq_length, hidden_size)

        # Permute to shape (batch_size, layers, seq_length, hidden_size)
        x = all_layers.permute(1, 0, 2, 3)

        # Apply CNN and fully connected layers
        x = self.pool(self.dropout(self.relu(self.conv(self.dropout(x)))))
        x = self.flat(x)
        x = self.fc(self.dropout(x))
        # Output is log-softmax. nn.CrossEntropyLoss applies log-softmax again,
        # which is a mathematical no-op on log-probabilities (redundant, not wrong).
        return self.softmax(x)

# Preprocessing functions
def read_dataset(path="labeled_data.csv"):
    """Load the labelled tweets from a CSV file.

    Only the ``tweet`` and ``class`` columns are used; any other columns in
    the file are ignored, so CSVs without the annotation-count columns load
    as well.

    Args:
        path: CSV file to read. Defaults to ``"labeled_data.csv"`` in the
            current working directory.

    Returns:
        A ``(tweets, classes)`` pair: ``tweets`` is a list with the values of
        the ``tweet`` column and ``classes`` is the ``class`` column as a
        pandas Series.

    Raises:
        KeyError: If the file has no ``tweet`` or ``class`` column.
    """
    data = pd.read_csv(path)
    print(f"Dataset size: {len(data)}")
    return data['tweet'].tolist(), data['class']

def pre_process_dataset(values):
    """Normalise each text in ``values`` and return the cleaned texts as a list.

    Per text, in order: lowercase it; replace every run of characters other
    than ASCII letters and ``? . ! , ¿`` with one space; replace each of those
    five marks with a space; delete any character found in
    ``string.punctuation``; collapse runs of spaces / double quotes into a
    single space; strip leading and trailing whitespace.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)

    def _clean(raw_text):
        # Same substitutions, same order, as described in the docstring.
        cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", raw_text.lower())
        cleaned = re.sub(r"([?.!,¿])", r" ", cleaned)
        cleaned = cleaned.translate(punctuation_remover)
        return re.sub(r'[" "]+', " ", cleaned).strip()

    return [_clean(value) for value in values]

def data_process(data, labels, max_length=36, model_name="bert-large-uncased"):
    """Tokenize the texts and convert the labels to a tensor.

    Args:
        data: Iterable of text strings to tokenize.
        labels: Labels aligned with ``data``; a pandas Series or anything
            ``torch.tensor`` accepts.
        max_length: Every sequence is padded or truncated to this many
            tokens. Defaults to 36.
        model_name: Name of the pretrained ``BertTokenizer`` vocabulary to
            load. Defaults to ``"bert-large-uncased"``.

    Returns:
        ``(input_ids, attention_masks, labels)``. The first two are the
        tokenizer's ``input_ids`` and ``attention_mask`` tensors with one row
        per text; ``labels`` is a tensor built from the given labels.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Tokenize the whole corpus in one batched call. With padding='max_length'
    # and truncation every row has the same width, so the tokenizer can hand
    # back ready-made 2-D tensors (no per-sentence tensors, no torch.stack).
    encoded = tokenizer(
        list(data),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    # A Series is unwrapped to its underlying array before conversion.
    labels = torch.tensor(labels.values if isinstance(labels, pd.Series) else labels)

    return input_ids, attention_masks, labels

def load_and_process():
    """Read the dataset, clean the tweets and return model-ready tensors.

    Chains ``read_dataset`` -> ``pre_process_dataset`` -> ``data_process`` and
    returns whatever ``data_process`` returns.
    """
    tweets, classes = read_dataset()
    cleaned_tweets = pre_process_dataset(tweets)
    return data_process(cleaned_tweets, classes)

# Training and evaluation functions
def train():
    """Run one training pass over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``cross_entropy``,
    ``device`` and ``train_dataloader``. Gradients are clipped to a norm of
    1.0 before each optimizer step.

    Returns:
        ``(mean_loss, outputs)``: the summed batch losses divided by the
        number of batches, and the model outputs of all batches concatenated
        (detached, on the CPU).
    """
    model.train()
    batch_losses = []
    batch_outputs = []

    for batch in train_dataloader:
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)
        model.zero_grad()

        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, labels)
        batch_losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_outputs.append(outputs.detach().cpu())

    return sum(batch_losses) / len(train_dataloader), torch.cat(batch_outputs)

def evaluate():
    """Score the model on ``val_dataloader`` without updating any weights.

    Uses the module-level ``model``, ``cross_entropy``, ``device`` and
    ``val_dataloader``. The model is switched to eval mode and the whole pass
    runs under ``torch.no_grad()``.

    Returns:
        ``(mean_loss, outputs)``: the summed batch losses divided by the
        number of batches, and the model outputs of all batches concatenated
        (detached, on the CPU).
    """
    model.eval()
    batch_losses = []
    batch_outputs = []

    with torch.no_grad():
        for batch in val_dataloader:
            sent_id, mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(sent_id, mask)
            batch_losses.append(cross_entropy(outputs, labels).item())
            batch_outputs.append(outputs.detach().cpu())

    return sum(batch_losses) / len(val_dataloader), torch.cat(batch_outputs)

# Main script
# Seed NumPy and PyTorch up front so that weight initialisation, dropout and
# batch shuffling are repeatable between runs (as far as the backend allows).
SEED = 42
BATCH_SIZE = 32
np.random.seed(SEED)
torch.manual_seed(SEED)

input_ids, attention_masks, labels = load_and_process()

# Split the dataset. All three tensors go through ONE call so that ids, masks
# and labels are guaranteed to be partitioned with the same permutation.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.1, random_state=SEED
)

# Create DataLoaders
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Initialize the model
bert = BertModel.from_pretrained('bert-large-uncased', output_hidden_states=True)
model = BERT_Arch(bert).to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
cross_entropy = nn.CrossEntropyLoss()

# Training loop
# NOTE(review): the weights from the final epoch are what gets evaluated
# below; no best-epoch checkpoint is kept.
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    train_loss, _ = train()
    val_loss, _ = evaluate()
    print(f"Train Loss: {train_loss}, Validation Loss: {val_loss}")

# Evaluation on validation data
val_loss, val_outputs = evaluate()
# Predicted class = index of the largest per-class score in each row.
val_preds = torch.argmax(val_outputs, dim=1)

print("\nValidation Performance:")
print(classification_report(val_labels, val_preds))
print(f"Accuracy: {accuracy_score(val_labels, val_preds):.4f}")
# Release cached GPU memory held by PyTorch's allocator now that the run is over.
torch.cuda.empty_cache()


Using device: cuda
Dataset size: 24783
Epoch 1 of 3
Train Loss: 0.30312098621242445, Validation Loss: 0.2718863500138888
Epoch 2 of 3
Train Loss: 0.21480255684089797, Validation Loss: 0.2624606180171936
Epoch 3 of 3
Train Loss: 0.15158758307787104, Validation Loss: 0.29335811753303576

Validation Performance:
              precision    recall  f1-score   support

           0       0.52      0.35      0.42       164
           1       0.92      0.96      0.94      1905
           2       0.91      0.84      0.88       410

    accuracy                           0.90      2479
   macro avg       0.78      0.72      0.75      2479
weighted avg       0.89      0.90      0.90      2479

Accuracy: 0.9028
