In [1]:
!pip install transformers[torch] pandas scikit-learn seaborn
!pip install vaderSentiment
!pip install empath
!pip install textblob

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<2.7,>=2.1->transformers[torch])
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<2.7,>=2.1->transformers

In [2]:
# Section 1: Initial setup & imports

import os
import random
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Union, List
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from empath import Empath
from textblob import TextBlob
import warnings

warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (8, 5)
plt.rcParams['font.size'] = 14

2025-07-23 07:02:43.581293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753254163.948742      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753254164.050814      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Section 2: Data loading and preparation

def load_data(dataFile: str) -> pd.DataFrame:
    """Loads data from a CSV file into a pandas DataFrame."""
    return pd.read_csv(dataFile)

# Load the dataset
dataFile = '/kaggle/input/cyberbullying-classification/cyberbullying_tweets.csv'
df_full = load_data(dataFile) 

texts = df_full['tweet_text'].tolist()
labels = df_full['cyberbullying_type'].tolist()

# Encode labels for multiclass classification
labEncoder = LabelEncoder()
labelsEncoded = labEncoder.fit_transform(labels)
print("Cyberbullying Types (Multiclass):", labEncoder.classes_)
print(f"Total samples: {len(texts)}")

Cyberbullying Types (Multiclass): ['age' 'ethnicity' 'gender' 'not_cyberbullying' 'other_cyberbullying'
 'religion']
Total samples: 47692


In [4]:
# Section 3: EXPLORATORY DATA ANALYSIS (EDA)

print("\nPerforming Exploratory Data Analysis...")

# Create a DataFrame for EDA
df_eda = pd.DataFrame({'text': texts, 'label_name': labels})

## -----------------------------------------------------------------------------
## EDA Part 1: Top Words per Cyberbullying Category (TF-IDF)
## -----------------------------------------------------------------------------
print("\n--- Top Words per Cyberbullying Category (TF-IDF) ---")
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Group texts by the cyberbullying type
grouped_texts = df_eda.groupby('label_name')['text'].apply(lambda x: ' '.join(x))

for category, text_corpus in grouped_texts.items():
    print(f"\nTop 15 words for '{category}':")
    
    # Fit TF-IDF on the corpus for the category
    tfidf_matrix = vectorizer.fit_transform([text_corpus])
    
    # Get feature names and their scores
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray().flatten()
    
    # Create a DataFrame of words and scores, and get the top 15
    df_tfidf = pd.DataFrame({'word': feature_names, 'tfidf': scores})
    top_words = df_tfidf.sort_values(by='tfidf', ascending=False).head(15)
    
    print(top_words['word'].tolist())


Performing Exploratory Data Analysis...

--- Top Words per Cyberbullying Category (TF-IDF) ---

Top 15 words for 'age':
['school', 'high', 'bullied', 'bully', 'girl', 'girls', 'like', 'bullies', 'just', 'people', 'got', 'middle', 'amp', 'don', 'kids']

Top 15 words for 'ethnicity':
['fuck', 'dumb', 'nigger', 'ass', 'black', 'white', 'niggers', 'rt', 'people', 'obama', 'like', 'ur', 'tayyoung_', 'bitch', 'called']

Top 15 words for 'gender':
['rape', 'gay', 'jokes', 'joke', 'rt', 'people', 'sexist', 'just', 'like', 'women', 'bitch', 'female', 'don', 'http', 'funny']

Top 15 words for 'not_cyberbullying':
['mkr', 'http', 'rt', 'bullying', 'just', 'bully', 'like', 'don', 'school', 'kat', 'amp', 'people', 'know', 'time', 'andre']

Top 15 words for 'other_cyberbullying':
['rt', 'http', 'just', 'bully', 'https', 'like', 'don', 'people', 'fucking', 'know', 'mkr', 've', 'think', 'time', 'bullying']

Top 15 words for 'religion':
['muslims', 'muslim', 'idiot', 'christian', 'idiots', 'islamic', 

In [5]:
## -----------------------------------------------------------------------------
## EDA Part 2: Psychological Feature Extraction
## -----------------------------------------------------------------------------
print("\n--- Extracting and Normalizing Psychological Features ---")

# Initialize analyzers
analyzer = SentimentIntensityAnalyzer()
lexicon = Empath()

# Define feature extraction functions
def extract_sentiment_features(text):
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores['neg'], sentiment_scores['neu'], sentiment_scores['pos'], sentiment_scores['compound']

def extract_empath_features(text):
    # Using a predefined set of categories for efficiency
    cats = ["hate", "cheerfulness", "aggression", "sadness", "fear", "positive_emotion", "negative_emotion"]
    empath_scores = lexicon.analyze(text, categories=cats, normalize=True)
    return [empath_scores[cat] for cat in cats] if empath_scores else [0.0] * len(cats)

def extract_textblob_features(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

# Apply feature extraction
df_features = pd.DataFrame(texts, columns=['text'])
sentiment_features = df_features['text'].apply(lambda x: pd.Series(extract_sentiment_features(x)))
sentiment_features.columns = ['neg_sentiment', 'neu_sentiment', 'pos_sentiment', 'compound_sentiment']

empath_features = df_features['text'].apply(lambda x: pd.Series(extract_empath_features(x)))
empath_features.columns = ["hate", "cheerfulness", "aggression", "sadness", "fear", "positive_emotion", "negative_emotion"]

textblob_features = df_features['text'].apply(lambda x: pd.Series(extract_textblob_features(x)))
textblob_features.columns = ['polarity', 'subjectivity']

# Combine all psychological features
psychological_features = pd.concat([sentiment_features, empath_features, textblob_features], axis=1)
psychological_features.fillna(0, inplace=True)

# Normalize psychological features
scaler = MinMaxScaler()
psych_features_scaled = scaler.fit_transform(psychological_features)
print(f"Shape of psychological features matrix: {psych_features_scaled.shape}")



--- Extracting and Normalizing Psychological Features ---
Shape of psychological features matrix: (47692, 13)


In [6]:
# Section 4: Dataset and Model Definition

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, psych_features):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.psych_features = psych_features

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        psych_feature = torch.tensor(self.psych_features[idx], dtype=torch.float32)
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
            'psych_feature': psych_feature
        }

class BERTWithPsychFeatures(nn.Module):
    def __init__(self, bert_model_name, num_classes, num_psych_features):
        super(BERTWithPsychFeatures, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.drop = nn.Dropout(p=0.3)
        
        bert_hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(bert_hidden_size + num_psych_features, num_classes)

    def forward(self, input_ids, attention_mask, psych_feature):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output['pooler_output']
        combined_features = torch.cat((pooled_output, psych_feature), dim=1)
        output = self.drop(combined_features)
        return self.classifier(output)


In [7]:
# Section 5: Utility Functions for Training and Evaluation


from torch.cuda.amp import GradScaler, autocast

def train_epoch_optimized(model, data_loader, optimizer, scheduler, device, scaler):
    """
    Optimized training function using Automatic Mixed Precision (AMP).
    """
    model = model.train()
    
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        psych_feature = batch['psych_feature'].to(device)
        labels = batch['label'].to(device)
        
        # Using autocast for the forward pass. This automatically casts operations to float16 where it is safe, speeding up computation.
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                psych_feature=psych_feature
            )
            loss = nn.CrossEntropyLoss()(outputs, labels)

        # scaler.scale() scales the loss to prevent underflow of small gradients.
        scaler.scale(loss).backward()
        
        # scaler.step() updates the weights and scaler.update() updates the scale factor.
        scaler.step(optimizer)
        scaler.update()
        
        scheduler.step()

def evaluate(model, data_loader, device):
    """
    Evaluation function that returns the lists of actual and predicted labels.
    """
    model.eval()
    predictions = []
    actual = []
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            psych_feature = batch['psych_feature'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                psych_feature=psych_feature
            )

            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual.extend(labels.cpu().tolist())
    

    return actual, predictions

In [8]:
# Section 6: MULTICLASS CLASSIFICATION 

print("\n\n" + "="*60)
print("PART 1: MULTICLASS CLASSIFICATION (6 TYPES)")
print("="*60)

# Model Parameters
bert_model_name = 'bert-base-uncased'   #Uncased base model of BERT is used
max_length = 128
batch_size = 32
num_epochs_multiclass = 4 
learning_rate = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Data Splitting
train_texts, val_texts, train_labels, val_labels, train_psych_features, val_psych_features = train_test_split(
    texts, labelsEncoded, psych_features_scaled, test_size=0.2, random_state=42, stratify=labelsEncoded
)

# Dataloader Setup 
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
num_psych_features = psych_features_scaled.shape[1]

train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length, train_psych_features)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length, val_psych_features)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Model Initialization
model_multiclass = BERTWithPsychFeatures(bert_model_name, num_classes=len(labEncoder.classes_), num_psych_features=num_psych_features).to(device)
optimizer = AdamW(model_multiclass.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs_multiclass
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Initializing the GradScaler for mixed-precision training
scaler = GradScaler()

# Training Loop
for epoch in range(num_epochs_multiclass):
    print(f"\n--- Multiclass Epoch {epoch+1}/{num_epochs_multiclass} ---")
    
    # Using the optimized training function
    train_epoch_optimized(model_multiclass, train_dataloader, optimizer, scheduler, device, scaler)
    
    # 1. Getting the raw lists of labels from the evaluate function
    actual_labels, predicted_labels = evaluate(model_multiclass, val_dataloader, device)
    
    # 2. Now, calculating the accuracy and report using the returned lists
    accuracy = accuracy_score(actual_labels, predicted_labels)
    report = classification_report(actual_labels, predicted_labels, target_names=labEncoder.classes_) 

    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)



PART 1: MULTICLASS CLASSIFICATION (6 TYPES)
Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


--- Multiclass Epoch 1/4 ---
Validation Accuracy: 0.8600
                     precision    recall  f1-score   support

                age       0.98      0.99      0.99      1598
          ethnicity       0.99      0.97      0.98      1592
             gender       0.84      0.92      0.88      1595
  not_cyberbullying       0.83      0.43      0.57      1589
other_cyberbullying       0.64      0.86      0.74      1565
           religion       0.92      0.98      0.95      1600

           accuracy                           0.86      9539
          macro avg       0.87      0.86      0.85      9539
       weighted avg       0.87      0.86      0.85      9539


--- Multiclass Epoch 2/4 ---
Validation Accuracy: 0.8752
                     precision    recall  f1-score   support

                age       0.99      0.98      0.99      1598
          ethnicity       0.99      0.97      0.98      1592
             gender       0.88      0.91      0.90      1595
  not_cyberbullying       

In [9]:
# Section 7: BINARY CLASSIFICATION 

print("\n\n" + "="*60)
print("PART 2: BINARY CLASSIFICATION (Cyberbullying vs. Not Cyberbullying)")
print("="*60)

# Creating Binary Labels
# 0 for 'not_cyberbullying', 1 for all other types
not_cyberbullying_index = list(labEncoder.classes_).index('not_cyberbullying')
labels_binary = [0 if label == not_cyberbullying_index else 1 for label in labelsEncoded]
binary_class_names = ['not_cyberbullying', 'cyberbullying']

# Model Parameters for Binary Task
num_epochs_binary = 2 # Fewer epochs often suffice for binary tasks

# Data Splitting for Binary Task 
train_texts_b, val_texts_b, train_labels_b, val_labels_b, train_psych_b, val_psych_b = train_test_split(
    texts, labels_binary, psych_features_scaled, test_size=0.2, random_state=42, stratify=labels_binary
)

# Dataloader Setup for Binary Task
train_dataset_b = TextClassificationDataset(train_texts_b, train_labels_b, tokenizer, max_length, train_psych_b)
val_dataset_b = TextClassificationDataset(val_texts_b, val_labels_b, tokenizer, max_length, val_psych_b)

train_dataloader_b = DataLoader(train_dataset_b, batch_size=batch_size, shuffle=True)
val_dataloader_b = DataLoader(val_dataset_b, batch_size=batch_size)

# Model Initialization for Binary Task
model_binary = BERTWithPsychFeatures(bert_model_name, num_classes=2, num_psych_features=num_psych_features).to(device)
optimizer_b = AdamW(model_binary.parameters(), lr=learning_rate)
total_steps_b = len(train_dataloader_b) * num_epochs_binary
scheduler_b = get_linear_schedule_with_warmup(optimizer_b, num_warmup_steps=0, num_training_steps=total_steps_b)

# Initializing a new GradScaler for the binary model's training loop
scaler_b = GradScaler()

# Training Loop for Binary Model
for epoch in range(num_epochs_binary):
    print(f"\n--- Binary Epoch {epoch+1}/{num_epochs_binary} ---")
    
    # Using the optimized training function with the new scaler
    train_epoch_optimized(model_binary, train_dataloader_b, optimizer_b, scheduler_b, device, scaler_b)
    
    # Getting the raw predictions from the improved evaluate function
    actual_b, preds_b = evaluate(model_binary, val_dataloader_b, device)
    
    # Calculating accuracy and printing the report with proper target names
    accuracy = accuracy_score(actual_b, preds_b)
    report = classification_report(actual_b, preds_b, target_names=binary_class_names)
    
    print(f"Binary Validation Accuracy: {accuracy:.4f}")
    print(report)



PART 2: BINARY CLASSIFICATION (Cyberbullying vs. Not Cyberbullying)

--- Binary Epoch 1/2 ---
Binary Validation Accuracy: 0.8875
                   precision    recall  f1-score   support

not_cyberbullying       0.75      0.49      0.59      1589
    cyberbullying       0.90      0.97      0.93      7950

         accuracy                           0.89      9539
        macro avg       0.83      0.73      0.76      9539
     weighted avg       0.88      0.89      0.88      9539


--- Binary Epoch 2/2 ---
Binary Validation Accuracy: 0.8917
                   precision    recall  f1-score   support

not_cyberbullying       0.73      0.55      0.63      1589
    cyberbullying       0.91      0.96      0.94      7950

         accuracy                           0.89      9539
        macro avg       0.82      0.76      0.78      9539
     weighted avg       0.88      0.89      0.89      9539

