In [12]:
import pandas as pd
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np

# Download NLTK assets
# The first three resources are fetched in a loop. 'punkt_tab' is requested last,
# on its own line, so the cell still ends with that call's return value.
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name)
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load necessary CSV files
# All three files sit in the same (doubly nested) dataset folder, so the folder is
# spelled out once here instead of being repeated, with stray '//' separators, in
# every path. Change DATA_DIR if the dataset is unpacked somewhere else.
DATA_DIR = "/content/A Comprehensive Dataset for Automated Cyberbullying Detection/A Comprehensive Dataset for Automated Cyberbullying Detection"

# Per-dyad labels: one row per (User1 ID, User2 ID) pair, including CB_Label.
cb_labels_df = pd.read_csv(f"{DATA_DIR}/CB_Labels.csv")
# Per-user demographic attributes, keyed by UserID.
users_df = pd.read_csv(f"{DATA_DIR}/users_data.csv")
# Individual messages exchanged between user pairs (Message text plus a Label column).
communication_df = pd.read_csv(f"{DATA_DIR}/Communication_Data_Among_Users.csv")


In [3]:
# Merge user demographic info into CB_Labels
# users_df is joined twice: once on 'User1 ID' and once on 'User2 ID'.
# In the second join the columns that clash with the ones already attached for
# User1 receive the '_User2' suffix; User1's columns keep their plain names.
merged_df = cb_labels_df.merge(users_df, how='left', left_on='User1 ID', right_on='UserID', suffixes=('', '_User1'))
merged_df = merged_df.merge(users_df, how='left', left_on='User2 ID', right_on='UserID', suffixes=('', '_User2'))
# The join keys from users_df duplicate 'User1 ID' / 'User2 ID', so drop them.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
merged_df = merged_df.drop(columns=['UserID', 'UserID_User2'])

# Count number of messages and aggressive messages per (User1, User2)
# Missing messages are replaced with '' before the string cast: a bare astype(str)
# can turn NaN into the literal text 'nan', which would later be tokenised as an
# ordinary word. Every row is still non-null afterwards, so 'count' is unchanged.
communication_df['Message'] = communication_df['Message'].fillna('').astype(str)
message_counts = communication_df.groupby(['User1 ID', 'User2 ID']).agg(
    num_messages=('Message', 'count'),
    num_aggressive=('Label', 'sum')
).reset_index()

# Merge back with demographic-enriched dataset
# Left join: dyads without any recorded message get NaN counts, which become 0.
merged_df = merged_df.merge(message_counts, on=['User1 ID', 'User2 ID'], how='left')
merged_df[['num_messages', 'num_aggressive']] = merged_df[['num_messages', 'num_aggressive']].fillna(0)

In [4]:
# Preview the first rows of the demographic-enriched dyad table (head() shows 5 rows by default).
merged_df.head()

Unnamed: 0,User1 ID,User2 ID,Total_messages,Aggressive_Count,Intent_to_Harm,Peerness,CB_Label,Age,Gender,School Name,Grade,Age_User2,Gender_User2,School Name_User2,Grade_User2,num_messages,num_aggressive
0,1,2,36,23,0.769444,0.5,1,11,Others,School10,5,15,Male,School5,9,36,23
1,1,3,16,9,0.48125,0.766667,1,11,Others,School10,5,11,Male,School13,5,16,9
2,1,4,17,8,0.447794,0.366667,0,11,Others,School10,5,17,Male,School14,11,17,8
3,1,5,24,15,0.6125,0.366667,0,11,Others,School10,5,17,Male,School6,11,24,15
4,1,6,15,7,0.420833,0.433333,1,11,Others,School10,5,16,Male,School12,10,15,7


# Pre-processing

In [5]:
# NLP Preprocessing Functions
def clean_text(text):
    """Normalise a raw message into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' signs
    (the hashtag word itself is kept), drop every character that is not
    a-z or whitespace, then collapse runs of whitespace and trim the ends.

    Args:
        text: The raw message. Missing values (None or float NaN) yield '';
            any other non-string value is converted with str() first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise crash on .lower() or end up as the word 'nan'.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # 'http\S+' already covers 'https...' links, so no separate https branch is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    # Characters are removed, not replaced by a space, so "don't" becomes "dont".
    text = re.sub(r"[^a-z\s]", '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Shared, module-level NLP resources used by preprocess().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    """Clean a message, remove English stop words and lemmatize what remains.

    The text is first normalised with clean_text(), then split with
    nltk.word_tokenize(). Tokens found in `stop_words` are skipped; every
    other token is passed through `lemmatizer.lemmatize()`.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in nltk.word_tokenize(clean_text(text)):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

In [6]:
# Clean every individual message, then build one text document per dyad.
communication_df['Clean_Message'] = communication_df['Message'].apply(preprocess)

# Join all cleaned messages of a (User1, User2) pair with single spaces.
dyad_messages = (
    communication_df
    .groupby(['User1 ID', 'User2 ID'])['Clean_Message']
    .apply(' '.join)
    .reset_index()
)

# Attach the dyad-level label; the inner join keeps only pairs present in both frames.
lstm_input_df = dyad_messages.merge(
    cb_labels_df[['User1 ID', 'User2 ID', 'CB_Label']],
    on=['User1 ID', 'User2 ID'],
    how='inner',
)


In [7]:
# Preview the per-dyad concatenated text alongside its CB_Label (head() shows 5 rows by default).
lstm_input_df.head()

Unnamed: 0,User1 ID,User2 ID,Clean_Message,CB_Label
0,1,2,bye bye dear bajaj got better work watching ur...,1
1,1,3,bother fuckface suck cock dear gwernol u usele...,1
2,1,4,ron paul ron paul hasnt withdrawn thought id l...,0
3,1,5,vandalism disagree limey frustrating today cra...,0
4,1,6,rt fanwalker sexistbut theory female unable te...,1


# Duplicates and Missing Values

In [8]:
# Dataset size: each row is one (User1, User2) dyad with its concatenated text and label.
print('There are', len(lstm_input_df), 'data in this dataset')

# Rows that are exact copies of an earlier row (identical in every column).
print('Number of Duplicates:', int(lstm_input_df.duplicated().sum()))

# Missing values: per-column counts first, then their grand total.
missing_values = lstm_input_df.isnull().sum()
print('Number of Missing Values by column:\n', missing_values)

print('Number of Missing Values:', missing_values.sum())

There are 9511 data in this dataset
Number of Duplicates: 0
Number of Missing Values by column:
 User1 ID         0
User2 ID         0
Clean_Message    0
CB_Label         0
dtype: int64
Number of Missing Values: 0
