<a href="https://colab.research.google.com/github/DeoChuanito/My-Paper-on-detecting-cyberbullying-in-X/blob/main/hyperparameter_tuned_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install bert-sklearn from source: clone the master branch, then pip-install
# the checkout. The `cd` is chained onto the same `!` line as `pip install .`
# so that the install runs inside the cloned directory.
!git clone -b master https://github.com/charles9n/bert-sklearn
!cd bert-sklearn; pip install .
import os
# Switch the kernel's working directory to the checkout for the rest of the session.
os.chdir("bert-sklearn")

Cloning into 'bert-sklearn'...
remote: Enumerating objects: 259, done.[K
remote: Total 259 (delta 0), reused 0 (delta 0), pack-reused 259[K
Receiving objects: 100% (259/259), 516.15 KiB | 1.41 MiB/s, done.
Resolving deltas: 100% (131/131), done.
Processing /content/bert-sklearn
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3 (from bert-sklearn==0.3.1)
  Downloading boto3-1.34.121-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=0.4.1->bert-sklearn==0.3.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=0.4.1->bert-sklearn==0.3.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=0.4.1->bert-sklearn==0.3.1)
  Using cac

In [None]:
import itertools
import os
import random
from typing import List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import torch
import torch.optim as optim
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import load_model

import warnings
warnings.filterwarnings('ignore')


# Notebook-wide plotting defaults: figure size (inches) and base font size.
plt.rcParams.update({
    'figure.figsize': (8, 5),
    'font.size': 14,
})

## EDA

In [None]:
def load_data(dataFile: str) -> Tuple[List[str], List[str]]:
    """Read the tweet CSV and return its texts and labels as parallel lists.

    Args:
        dataFile: Path to a CSV file containing a `tweet_text` column and a
            `cyberbullying_type` column.

    Returns:
        A `(texts, labels)` tuple. `texts` holds the `tweet_text` column and
        `labels` holds the `cyberbullying_type` column, both in file order,
        so `texts[i]` and `labels[i]` come from the same row.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    frame = pd.read_csv(dataFile)
    tweet_texts = frame['tweet_text'].tolist()
    tweet_labels = frame['cyberbullying_type'].tolist()
    # The function returns a 2-tuple; the previous `Union[List, List]`
    # annotation collapsed to plain `List` and did not describe that.
    return tweet_texts, tweet_labels

In [None]:
# Absolute path under /content — presumably the Colab upload location; adjust
# when running elsewhere.
dataFile = '/content/cyberbullying_tweets.csv'
texts, labels = load_data(dataFile)
# Show both lengths as a quick sanity check that the two lists line up.
len(texts), len(labels)

In [None]:
# Rebuild a two-column frame ('text', 'labels') from the parallel lists; the
# rest of the notebook works on this frame.
df = pd.DataFrame({'text':texts, 'labels':labels})
df

In [None]:
# Learn the label vocabulary from the raw label strings and integer-encode
# them in a single call.
labEncoder = LabelEncoder()
labelsEncoded = labEncoder.fit_transform(labels)

In [None]:
# True when at least one column contains a missing value.
bool(df.isna().sum().any())

In [None]:
# Column dtypes, non-null counts and memory footprint of the working frame.
df.info()

In [None]:
# Bar chart of how many tweets carry each label.
fig, ax = plt.subplots(figsize=(12, 7))
sb.countplot(data=df, x='labels', ax=ax)
ax.set_title("Label Distribution")
plt.show()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Add a per-tweet character count; this column drives the length filter in
# the "Reduce the data" section.
df['length'] = df['text'].str.len()
# Box plot of the character counts to show their spread and outliers.
sb.boxplot(data=df, y='length')
plt.title("Length of Tweets")
plt.show()

## Reduce the data

In [None]:
# Keep only short tweets: character count strictly between the two bounds.
# This shrinks the dataset so that training stays fast.
MIN_TWEET_CHARS = 10
MAX_TWEET_CHARS = 30
is_short = (df['length'] > MIN_TWEET_CHARS) & (df['length'] < MAX_TWEET_CHARS)
# .copy() makes the subset an independent frame. Later cells add and overwrite
# columns on `df`; doing that on a bare boolean-mask slice is what triggers
# pandas' SettingWithCopyWarning.
df = df.loc[is_short].copy()
df.info()
# Same call shape as the earlier length box plot so the two are comparable.
sb.boxplot(data=df, y='length')
plt.title("Length of Tweets (reduced data)")
plt.show()

In [None]:
def word_count(x):
    """Return how many whitespace-separated tokens the string `x` contains."""
    tokens = x.split()
    return len(tokens)

# Per-tweet word count, computed row by row with word_count().
df['word_count'] = df['text'].apply(word_count)

In [None]:
# Box plot of the per-tweet word counts.
sb.boxplot(data=df, y='word_count')
plt.title("Word Count in Tweets")
plt.show()

## Preprocessing (Data building)

In [None]:
# Number of rows per label in the current (length-filtered) frame.
df['labels'].value_counts()

In [None]:
# Separate encoder from `labEncoder` above: this one is fitted only on the
# labels that remain in `df` at this point.
le = LabelEncoder()

# Overwrite the string labels with integer class ids; `le` keeps the mapping
# back to the original names. The text column is left as raw strings.
df['labels'] = le.fit_transform(df['labels'])
df.info()

In [None]:
# Hyperparameter grid
# param_grid = {
#     'learning_rate': [1e-5, 2e-5, 3e-5],
#     'batch_size': [8, 16],
#     'num_epochs': [3, 4, 5]
# }

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Iterable of raw strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class ids, aligned with `texts`.
        tokenizer: Callable invoked as
            `tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)`.
            It must return a mapping with `input_ids` and `attention_mask`
            entries whose first dimension is a batch dimension of size 1.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Store plain lists so __getitem__ always indexes by position. A
        # pandas Series is indexed by label, so after filtering or
        # train_test_split `series[0]` can raise KeyError or return another row.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label tensor at position `idx`."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        # squeeze(0) removes only the batch dimension. A bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

## Modelling

In [None]:
# Pretrained checkpoint name; the tokenizer is loaded from the same checkpoint.
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [None]:
# Despite the names, x_train / y_train hold the whole filtered dataset; the
# actual train/validation split happens on the next statement.
x_train = df['text']
y_train = df['labels']

# 80/20 split with a fixed seed so the partition is reproducible. The results
# are pandas Series that keep the original row labels as their index.
train_texts, val_texts, train_labels, val_labels = tts(x_train, y_train, test_size=0.2, random_state=42)

In [None]:
class BERTClassifier(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear layer.

    Args:
        bert_model_name: Checkpoint name or path given to
            `BertModel.from_pretrained`.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = torch.nn.Dropout(p=0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        dropped = self.dropout(encoded.pooler_output)
        return self.fc(dropped)

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            `input_ids`, `attention_mask` and `label` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The mean cross-entropy loss over the batches as a float, or 0.0 when
        `data_loader` yields no batches.
    """
    model.train()
    # The loss module has no per-batch state, so build it once rather than on
    # every iteration.
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    # Guard the division so an empty loader does not raise ZeroDivisionError.
    return total_loss / num_batches if num_batches else 0.0

In [None]:
def evaluate(model, data_loader, device):
    """Score `model` on every batch of `data_loader` without tracking gradients.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`
            that returns class logits.
        data_loader: Iterable of batches with `input_ids`, `attention_mask`
            and `label` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        `(accuracy, report)`: the accuracy score and the text classification
        report computed over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            # Index of the largest logit per row is the predicted class.
            best = logits.argmax(dim=1)
            predicted.extend(best.cpu().numpy())
            expected.extend(gold.cpu().numpy())
    return (
        accuracy_score(expected, predicted),
        classification_report(expected, predicted),
    )

In [None]:
class BERTClassifierWrapper(BaseEstimator, ClassifierMixin):
    """scikit-learn style classifier: frozen BERT encoder plus a linear head.

    Only the linear head is trained. `build_model` switches `requires_grad`
    off for every BERT parameter, and `fit` optimises the head's parameters
    only. The head reads the hidden state of the first ([CLS]) token.

    Args:
        bert_model_name: Checkpoint name or path given to the tokenizer and
            to `BertModel.from_pretrained`.
        num_classes: Number of output logits of the linear head.
        learning_rate: Learning rate of the Adam optimizer used in `fit`.
        batch_size: Batch size used by both `fit` and `predict`.
        num_epochs: Number of passes over the training data in `fit`.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_classes=20, learning_rate=1e-5, batch_size=16, num_epochs=3):
        self.bert_model_name = bert_model_name
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        # NOTE(review): tokenizer and model are built eagerly here, so a later
        # set_params(bert_model_name=..., num_classes=...) does not rebuild
        # them. Only learning_rate, batch_size and num_epochs are re-read in fit().
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.build_model()

    def tokenize_data(self, texts):
        """Tokenize one string or a list of strings into padded 'pt' tensors.

        This method used to be defined twice in the class, and the second
        definition silently replaced the first. The single definition keeps
        the first one's single-string handling and explicit 512-token cap.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self.tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

    def build_model(self):
        """Load BERT, freeze its weights and attach a linear head as `.classifier`."""
        model = BertModel.from_pretrained(self.bert_model_name)
        for param in model.parameters():
            param.requires_grad = False

        # The head is created after the freeze loop, so its parameters stay trainable.
        classifier = nn.Sequential(
            nn.Linear(model.config.hidden_size, self.num_classes)
        )

        model.classifier = classifier
        return model.to(self.device)

    def _cls_logits(self, batch):
        """Return head logits for one batch, computed from the first-token hidden state."""
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = outputs.last_hidden_state[:, 0, :]  # Get [CLS] token output
        return self.model.classifier(cls_state)

    def fit(self, X, y):
        """Train the linear head on texts `X` with integer labels `y`.

        Returns:
            self, following the scikit-learn estimator convention so that
            calls can be chained.
        """
        print(f'Fitting model with learning_rate={self.learning_rate}, batch_size={self.batch_size}, num_epochs={self.num_epochs}')

        self.model.train()
        optimizer = optim.Adam(self.model.classifier.parameters(), lr=self.learning_rate)
        loss_fn = nn.CrossEntropyLoss()

        dataset = TextDataset(X, y, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        # Avoid ZeroDivisionError in the epoch summary when X is empty.
        num_batches = max(len(dataloader), 1)

        for epoch in range(self.num_epochs):
            epoch_loss = 0
            for batch in dataloader:
                optimizer.zero_grad()

                labels = batch['labels'].to(self.device)
                logits = self._cls_logits(batch)

                loss = loss_fn(logits, labels)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

            print(f'Epoch {epoch+1}/{self.num_epochs}, Loss: {epoch_loss/num_batches}')

        return self

    def predict(self, X):
        """Return the predicted class id for every text in `X` as a NumPy array."""
        self.model.eval()
        dataset = TextDataset(X, None, self.tokenizer, is_test=True)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        all_preds = []
        with torch.no_grad():
            for batch in dataloader:
                logits = self._cls_logits(batch)

                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())

        return np.array(all_preds)

class TextDataset(Dataset):
    """Map-style dataset that tokenizes texts on access, with optional labels.

    Args:
        texts: Iterable of raw strings (list, tuple, pandas Series, ...).
        labels: Iterable of class ids aligned with `texts`, or None when
            `is_test` is True.
        tokenizer: Callable invoked as
            `tokenizer(text, padding='max_length', truncation=True, max_length=..., return_tensors='pt')`.
            It must return a mapping of tensors whose first dimension is a
            batch dimension of size 1.
        is_test: When True, items carry no `labels` entry.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 512, the value that was previously hard-coded.

    Raises:
        ValueError: If `labels` is None while `is_test` is False.
    """

    def __init__(self, texts, labels, tokenizer, is_test=False, max_length=512):
        if labels is None and not is_test:
            raise ValueError("labels are required unless is_test=True")
        # Store plain lists so __getitem__ always indexes by position. A
        # pandas Series is indexed by label, so after filtering or a CV split
        # `series[0]` can raise KeyError or return another row.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized tensors (plus `labels` unless `is_test`) at position `idx`."""
        encoded = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        # Drop the size-1 batch dimension the tokenizer adds for a single text.
        item = {key: val.squeeze(0) for key, val in encoded.items()}

        if not self.is_test:
            item['labels'] = torch.tensor(self.labels[idx])

        return item

## Hyperparameter tuning

In [None]:
# Wrap the working frame under the name `dataset` and print it in full as
# plain text. NOTE(review): the next cell repeats this assignment.
dataset = pd.DataFrame(df)
print(dataset)

In [None]:
# Inputs for the grid search, taken from the filtered, label-encoded frame.
dataset = pd.DataFrame(df)
# Plain Python lists rather than Series, so downstream code indexes them by
# position and the frame's non-contiguous row labels do not matter.
Xtrain = dataset['text'].tolist()
Ytrain = dataset['labels'].tolist()

In [None]:
# Grid search parameters. Every list has a single value, so the grid holds
# exactly one candidate (the log reports "1 candidates, totalling 2 fits").
param_grid = {
    'max_seq_length': [128],
    'learning_rate': [5e-5],
    'train_batch_size' : [16],
    'epochs': [3]
}
# validation_fraction=0: the training log reports "validation data size: 0".
# NOTE(review): max_seq_length=64 here differs from the grid's 128; GridSearchCV
# sets the grid value on each candidate, so 64 is presumably never used — confirm.
model = BertClassifier(validation_fraction=0, max_seq_length=64)
# Create GridSearchCV instance: 2-fold CV scored by accuracy, run in a single
# process; error_score='raise' re-raises a failing fit instead of scoring it.
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=2,
                           verbose=1,
                           n_jobs=1,
                           error_score='raise')

# Perform grid search
grid_search.fit(Xtrain, Ytrain)

Building sklearn text classifier...
Building sklearn text classifier...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Building sklearn text classifier...


100%|██████████| 231508/231508 [00:00<00:00, 4110638.09B/s]


Loading bert-base-uncased model...


100%|██████████| 440473133/440473133 [00:17<00:00, 25418336.03B/s]
100%|██████████| 433/433 [00:00<00:00, 1000073.59B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint

train data size: 856, validation data size: 0



Training  :   0%|          | 0/54 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Getting the best parameters after tuning

In [None]:
# Print best parameters and results
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)