In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled tweets from the CSV next to the notebook and preview
# the first five rows.
data = pd.read_csv(filepath_or_buffer='cyberbullying_tweets.csv')
data.head(5)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:
# Number of tweets per class label.
data['cyberbullying_type'].value_counts()

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64

In [4]:
# Count missing values in every column.
data.isna().sum()

tweet_text            0
cyberbullying_type    0
dtype: int64

In [5]:
# Shared preprocessing resources used by clean_text: a WordNet lemmatizer and
# the NLTK English stop-word list extended with a few extra tokens to drop
# (presumably fragments left over after lemmatising / stripping punctuation).
lemma = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english')) | {'im', 'wa', 'p', 't', 's', 'o', 'e', 'like'}

# Both patterns are compiled once here instead of on every clean_text call.
# Mentions and hashtags use \w so handles containing "_" (e.g. "@Jason_Gio")
# are removed whole, and the retweet marker is matched together with the
# handle that follows it; a bare "RT @" alternative would eat the "@" and
# leave the user name behind as an ordinary token.
_TWEET_NOISE = re.compile(
    r"(#\w+|(?:\bRT\s+)?@\w+|https?://\S+|www\.\S+|\S+\.[a-z]+)"
)
_PUNCTUATION = re.compile(r"[%s]" % re.escape(string.punctuation))


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. drop hashtags, (re)tweet mentions, URLs and ``word.suffix`` tokens;
      2. lower-case;
      3. delete every character in ``string.punctuation``;
      4. lemmatise each whitespace-separated token with the module-level
         ``lemma``;
      5. drop tokens found in the module-level ``STOPWORDS`` set.

    Punctuation is stripped *before* lemmatising so that tokens such as
    "cats," reach the lemmatizer as "cats" rather than being passed through
    unchanged.

    Args:
        text: the tweet as a ``str``.

    Returns:
        The cleaned tweet; an empty string if nothing survives.
    """
    text = _TWEET_NOISE.sub('', text).lower()
    text = _PUNCTUATION.sub('', text)
    # split() with no argument also collapses any runs of whitespace left
    # behind by the substitutions above.
    lemmas = (lemma.lemmatize(word) for word in text.split())
    return " ".join(word for word in lemmas if word not in STOPWORDS)


In [6]:
# Distinct class names present in the label column.
data['cyberbullying_type'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [7]:
# Label encoding
# Class name -> integer id used as the model target.
ENCODE_DICT = {'not_cyberbullying': 0,
               'gender': 1,
               'religion': 2,
               'other_cyberbullying': 3,
               'age': 4,
               'ethnicity': 5}

# Series.replace() on an object column only ends up integer-typed through
# implicit downcasting, which pandas has deprecated; an object-dtype target
# is not a valid label array for scikit-learn. Map explicitly instead.
# Falling back to the value itself keeps the cell safe to re-run after the
# column has already been encoded (same as the replace-based version).
encoded_labels = data['cyberbullying_type'].map(
    lambda label: ENCODE_DICT.get(label, label)
)

# Fail loudly on any label that ENCODE_DICT does not cover rather than
# leaving a mix of strings and ints in the target column.
unknown_labels = set(encoded_labels.unique()) - set(ENCODE_DICT.values())
if unknown_labels:
    raise ValueError(
        f"Unmapped cyberbullying_type labels: {sorted(unknown_labels, key=str)}"
    )

data['cyberbullying_type'] = encoded_labels.astype('int64')
print(data.cyberbullying_type.unique())
data.sample(10)

[0 1 2 3 4 5]


Unnamed: 0,tweet_text,cyberbullying_type
32235,I was a skinny SKINNY kid in high school but a...,4
18903,So u believe in an article written by “Amir Ta...,2
12296,Cele|bitchy | Miley Cyrus jokes about date rap...,1
13492,@LucyWalcott @nomuru2d @Bastille1790 I use the...,1
14465,RT @WeeTaengAE86: Dont like female superior......,1
27442,@SwiggyCares this is pathetic. What the hell i...,3
35342,"One time in high school I told a band girl I ""...",4
35606,can you read? she doesn’t actively use her pla...,4
17220,No my ancestors converted you idiot to escape ...,2
37922,If two parties take great glee trashing a char...,4


In [8]:
# Vectorization
# Bag-of-words counts restricted to the 2000 most frequent terms.
# NOTE(review): clean_text (defined earlier) is never applied, so the
# vectorizer sees the raw tweets; passing preprocessor=clean_text would use
# it, at the cost of changing every downstream number - confirm intent.
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so term selection sees the test tweets too.
CountVector = CountVectorizer(max_features=2000)
# Keep the SciPy sparse matrix returned by fit_transform. Densifying a
# 47692 x 2000 int64 matrix costs roughly 760 MB, while train_test_split and
# RandomForestClassifier both accept sparse input directly.
X = CountVector.fit_transform(data.tweet_text)
y = data.cyberbullying_type.values
print(X.shape, y.shape)

(47692, 2000) (47692,)


In [9]:
# Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=555,
)
print(f"X train data has shape {X_train.shape} and their label's shape {y_train.shape}")
print(f"X test data has shape {X_test.shape} and their label's shape {y_test.shape}")

X train data has shape (38153, 2000) and their label's shape (38153,)
X test data has shape (9539, 2000) and their label's shape (9539,)


In [10]:
# Train the model
# Random forest with library-default hyper-parameters; only the seed is
# pinned so repeated runs build the same forest.
rf = RandomForestClassifier(random_state=42)
rf.fit(X=X_train, y=y_train)

In [11]:
# Predict the held-out rows and report overall accuracy.
y_pred = rf.predict(X_test)
# accuracy_score expects (y_true, y_pred). The value is the same either way
# round, but the conventional order matches the classification_report call
# and avoids copy-paste mistakes with asymmetric metrics.
print(f'Accuracy Score: {accuracy_score(y_test, y_pred):f}')

Accuracy Score: 0.816752


In [12]:
# Per-class precision / recall / F1 on the held-out test set.
report = classification_report(y_true=y_test, y_pred=y_pred)

# Heading and report are emitted on consecutive lines.
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.50      0.53      1585
           1       0.91      0.82      0.87      1538
           2       0.96      0.96      0.96      1650
           3       0.54      0.65      0.59      1582
           4       0.97      0.98      0.98      1567
           5       0.98      0.98      0.98      1617

    accuracy                           0.82      9539
   macro avg       0.82      0.82      0.82      9539
weighted avg       0.82      0.82      0.82      9539

