## Classifying Cyberbullying Tweets using Machine Learning 

In [1]:
#Load Cyberbullying Dataset 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn import metrics


#show all results
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

df1 = pd.read_csv('cyberbullying_tweets.csv')
#df1 = df1.sample(frac=.1) #reduce file size and shuffle rows to retrieve all cyberbullying types back to test
#df1=df1.drop(df1.index[20000:]) #reduces file size to test code
print(df1.columns)
df1.info()

Index(['tweet_text', 'cyberbullying_type'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [2]:
df1.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:
df1.cyberbullying_type.unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [4]:
#convert df1 cyberbullying label to .replace 
df1['cyberbullying_type'].replace(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age', 'ethnicity'],
                        [0, 1, 2, 3,4,5], inplace=True)

In [5]:
from langdetect import detect, detect_langs

#append language 
def det(x):
    try:
        lang = detect(x)
    except:
        lang = 'Other'
    return lang

df1['Lang'] = df1['tweet_text'].apply(det)

In [None]:
df1.head()

### Data Preprocessing

In [6]:
#cleaning of data

#remove stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#lower case and punctuation removal 
import re, string 

def strip_all_entities(tweet_text): 
    tweet_text = tweet_text.replace('\r', '').replace('\n', ' ').lower() #remove \n and \r and lowercase
    tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text) #remove links and mentions
    tweet_text = re.sub(r'[^\x00-\x7f]',r'', tweet_text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation
    table = str.maketrans('', '', banned_list)
    tweet_text = tweet_text.translate(table)
    tweet_text = [word for word in tweet_text.split() if word not in stop_words]
    tweet_text = ' '.join(tweet_text)
    tweet_text =' '.join(word for word in tweet_text.split() if len(word) < 14) # remove words longer than 14 characters
    return tweet_text


tweet_lst = []
for t in df1.tweet_text:
    tweet_lst.append(strip_all_entities(t)) 


df1 = df1.drop_duplicates() #drops duplicates 

df_en = df1[df1['Lang'] == 'en'] #drop none english tweets 

tweet_lst = df_en['tweet_text'].to_list() 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
#tokenize. Preparing the data. https://www.kaggle.com/jonaspptawat/cyberbullying-classification-eda-and-ml#Data-Cleaning
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

def stemmer(tweet_lst):
    tokenized = nltk.word_tokenize(tweet_lst)
    PS = PorterStemmer()
    return ' '.join([PS.stem(words) for words in tokenized])

#Lemmatization 
#NOTE:Stemming seems to work better for this dataset
def lemmatize(tweet_lst):
    tokenized = nltk.word_tokenize(tweet_lst)
    lm = WordNetLemmatizer()
    return ' '.join([lm.lemmatize(words) for words in tokenized])

cv = CountVectorizer(max_features = 2500)

x = cv.fit_transform(tweet_lst).toarray()
y = df_en['cyberbullying_type'].to_list()

print(x.shape)


(44605, 2500)


In [8]:
yArray = np.array(y)
yReshape = yArray.reshape(-1,1)

In [9]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(yReshape)

In [10]:
#split the dataset, train and test sets     

from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 14)
x_trainS, x_testS, y_trainS, y_testS = train_test_split(x, onehot_encoded, test_size = 0.20, random_state = 14)

### Naive Bayes

In [11]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
GN= GaussianNB()
GN.fit(x_train, y_train)

print ("Training set accuracy: {:.4f}". format(GN.score(x_train, y_train)))
print ("Test set accuracy: {:.4f}". format(GN.score(x_test, y_test)))

#confusion matrix 
y_pred= GN.predict(x_test)
from sklearn.metrics import confusion_matrix 
confusion_matrix= confusion_matrix (y_test, y_pred)
print(confusion_matrix)

#precision and recall 
from sklearn.metrics import classification_report 
print (classification_report(y_test, y_pred))


GaussianNB()

Training set accuracy: 0.6846
Test set accuracy: 0.6319
[[ 247   37   55  897   37   32]
 [ 109  759   58  478   54   81]
 [ 204   33 1249   22   47   78]
 [  81   34   15 1133   48   33]
 [ 101   28   19  423 1012   25]
 [  12    9   68   84   82 1237]]
              precision    recall  f1-score   support

           0       0.33      0.19      0.24      1305
           1       0.84      0.49      0.62      1539
           2       0.85      0.76      0.81      1633
           3       0.37      0.84      0.52      1344
           4       0.79      0.63      0.70      1608
           5       0.83      0.83      0.83      1492

    accuracy                           0.63      8921
   macro avg       0.67      0.62      0.62      8921
weighted avg       0.69      0.63      0.63      8921



### K-Neighbors

In [12]:
#K-neighbors

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(x_train, y_train)



#Confusion Matrix
y_pred = classifier.predict(x_test)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

# Precision/Recall
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))



KNeighborsClassifier()

[[ 761   49   10  470   12    3]
 [ 361  899    7  261    4    7]
 [ 544   66  680  324    7   12]
 [ 598   50    5  688    2    1]
 [ 188   14   27  121 1255    3]
 [ 187   21   33  153    8 1090]]
              precision    recall  f1-score   support

           0       0.29      0.58      0.39      1305
           1       0.82      0.58      0.68      1539
           2       0.89      0.42      0.57      1633
           3       0.34      0.51      0.41      1344
           4       0.97      0.78      0.87      1608
           5       0.98      0.73      0.84      1492

    accuracy                           0.60      8921
   macro avg       0.72      0.60      0.62      8921
weighted avg       0.74      0.60      0.64      8921



### Tensorflow

In [13]:
#text classification via TensorFlow

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten

model = Sequential()
model.add(Dense(5, input_shape=[2500], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(600, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(300, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

model.compile(loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

x_trainS=np.asarray(x_trainS)


#model.save_weights('model.h5')


In [15]:
model.fit(x_trainS, y_trainS,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_split=0.1,
    shuffle=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17c7fd001c0>

In [16]:
y_pred = model.predict(x_test)
y_predDecode = np.argmax(y_pred, axis=1) 

In [17]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_predDecode)
print(confusion_matrix)

[[1097    2   56  150    0    0]
 [ 589  870    4   75    0    1]
 [ 109   10 1501   12    0    1]
 [1173    1    5  165    0    0]
 [1097    1    0  117  393    0]
 [  22   20    1  183    0 1266]]
