In [30]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import random
import ssl
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from textblob import TextBlob
import re


In [31]:
df = pd.read_csv("cyberbullying_tweets.csv")

In [32]:
def clean_tweet_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbol
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')]) # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # Lemmatization
    return text

df['cleaned_tweet_text'] = df['tweet_text'].apply(clean_tweet_text)
df[['tweet_text', 'cleaned_tweet_text']].head()
df.drop("tweet_text", axis=1, inplace=True)

In [33]:
le = LabelEncoder()
df['cyberbullying_type_encoded'] = le.fit_transform(df['cyberbullying_type'])
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
# Print the mapping
print(class_mapping)

{'age': 0, 'ethnicity': 1, 'gender': 2, 'not_cyberbullying': 3, 'other_cyberbullying': 4, 'religion': 5}


In [34]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2)
X = vectorizer.fit_transform(df['cleaned_tweet_text'])
Y = df['cyberbullying_type_encoded']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [36]:
dtrain = xgb.DMatrix(X_train, label=y_train)
params = {
    'objective': 'multi:softmax',
    'num_class': 6,
    'min_child_weight' : 1,
    'max_depth': 10,
    'eta': 0.3,
    'subsample': 1,
    'colsample_bytree': 0.7
}
epochs = 10
bst = xgb.train(params, dtrain, epochs)

In [37]:
dtest = xgb.DMatrix(X_test, label=y_test)
predictions = bst.predict(dtest)

In [38]:
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 82.22%


In [None]:
print(classification_report(y_test, predictions, target_names=class_mapping))

                     precision    recall  f1-score   support

                age       0.99      0.97      0.98      1603
          ethnicity       0.99      0.98      0.98      1603
             gender       0.91      0.80      0.85      1531
  not_cyberbullying       0.74      0.36      0.48      1624
other_cyberbullying       0.53      0.90      0.67      1612
           religion       0.97      0.93      0.95      1566

           accuracy                           0.82      9539
          macro avg       0.85      0.82      0.82      9539
       weighted avg       0.85      0.82      0.82      9539

