In [None]:
# --- Imports (standard library first, then third-party) ---------------------
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# --- NLTK resources used by the text-cleaning step ---------------------------
# 'punkt' / 'punkt_tab': tokenizer models behind nltk.word_tokenize. Newer NLTK
#   releases look up 'punkt_tab' while older ones use 'punkt', so both are
#   fetched to keep the notebook runnable on either.
# 'stopwords': English stop-word list.
# 'wordnet' / 'omw-1.4': data required by WordNetLemmatizer.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Location of the labelled tweets CSV. Kept in one named constant so the path can
# be changed in a single place (presumably a mounted Google Drive folder - adjust
# when running elsewhere).
DATA_PATH = "/content/drive/MyDrive/cyberbullying_tweets.csv"

# Display options: show the full content of each cell and every column in output.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Reading dataset as dataframe
df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message if the columns used by later cells are absent.
_missing_columns = {'tweet_text', 'cyberbullying_type'} - set(df.columns)
if _missing_columns:
    raise KeyError(f"{DATA_PATH} is missing expected column(s): {sorted(_missing_columns)}")

In [None]:
# Collapse the fine-grained categories into a binary target:
# 0 = not cyberbullying, 1 = any kind of cyberbullying.
LABEL_TO_BINARY = {
    'not_cyberbullying': 0,
    'gender': 1,
    'religion': 1,
    'other_cyberbullying': 1,
    'age': 1,
    'ethnicity': 1,
    # Identity entries: re-running this cell on an already-recoded column is a
    # no-op instead of turning every label into NaN.
    0: 0,
    1: 1,
}

_binary_labels = df['cyberbullying_type'].map(LABEL_TO_BINARY)

# .map() yields NaN for any value that is not a key above (including missing
# labels); surface that immediately rather than letting NaN reach the models.
_unmapped = df.loc[_binary_labels.isna(), 'cyberbullying_type'].unique()
if len(_unmapped) > 0:
    raise ValueError(f"Unexpected cyberbullying_type value(s): {list(_unmapped)}")

df['cyberbullying_type'] = _binary_labels.astype('int64')

In [None]:
# Preview the first rows to check the label column after it was recoded to 0/1.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was crapilicious! #mkr",0
1,Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc,0
2,@XochitlSuckkks a classy whore? Or more red velvet cupcakes?,0
3,"@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter.",0
4,"@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account. Like Islam, it is all lies.",0


In [None]:
def _cleaner_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only on first use."""
    cached = getattr(_cleaner_resources, "_cached", None)
    if cached is None:
        # Loading the stop-word list on every tweet is wasted work, so the pair
        # is stored on the function object and reused by later calls.
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _cleaner_resources._cached = cached
    return cached


def cleaner(summary):
    """Turn one raw tweet into a list of lower-cased, lemmatized tokens.

    Steps, in order:
      1. Parse with BeautifulSoup (lxml parser, install with ``pip install lxml``)
         and keep only the text, which drops markup and decodes entities such
         as ``&amp;``.
      2. Replace hashtags, @mentions and URL-like tokens (``http://``,
         ``https://``, ``www``) with whitespace; the whole token is removed.
      3. Replace every run of non-alphabetic characters with a single space.
      4. Tokenize, lower-case, drop English stop words, and lemmatize.

    Parameters
    ----------
    summary : str
        Raw tweet text.

    Returns
    -------
    list of str
        The cleaned tokens. An empty list is returned when nothing survives
        cleaning or when ``summary`` is not a string (e.g. a missing value).
    """
    # Missing values reach .apply() as float NaN; treat them as "no tokens".
    if not isinstance(summary, str):
        return []

    stop_words, lemmatizer = _cleaner_resources()

    souped = BeautifulSoup(summary, 'lxml').get_text()
    # For more info on regular expressions visit
    # https://docs.python.org/3/howto/regex.html
    re1 = re.sub(r"(#|@|http://|https://|www)\S*", " ", souped)
    re2 = re.sub("[^A-Za-z]+", " ", re1)

    lower_case = [t.lower() for t in nltk.word_tokenize(re2)]
    return [lemmatizer.lemmatize(t) for t in lower_case if t not in stop_words]

In [None]:
# Clean every tweet into a list of lemmas.
df['cleaned_tweets'] = df.tweet_text.apply(cleaner)

# Drop rows whose cleaned token list is empty. .copy() makes the result an
# independent frame, so the column assignment below writes to `df` itself
# rather than to a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[df['cleaned_tweets'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned summaries....")
print(df[['tweet_text','cleaned_tweets']].head())

# Joining tokens to create strings. TfidfVectorizer does not accept tokens as input.
df['cleaned_tweets'] = df['cleaned_tweets'].map(" ".join)
data = df['cleaned_tweets']
Y = df['cyberbullying_type'] # target column

# min_df=.0005 is a proportion: each n-gram (unigram, bigram or trigram) must occur
# in at least 0.05% of the documents to be kept as a feature. The absolute cut-off
# therefore scales with the corpus, roughly 0.0005 * len(data) documents
# (about 30 for a 60,000-document corpus).
tfidf = TfidfVectorizer(min_df=.0005, ngram_range=(1,3))

# Learn the vocabulary and build the TF-IDF matrix in a single pass.
# NOTE: the vectorizer is fit on every row, so `data_tfidf` carries vocabulary and
# IDF statistics from all documents. Evaluation code that slices this matrix
# directly does not keep its held-out rows fully unseen.
data_tfidf = tfidf.fit_transform(data)

# Save the learned vocabulary, one n-gram per line.
pd.Series(tfidf.get_feature_names_out()).to_csv('tweets_vocabulary.csv', header=False, index=False)
print("Shape of tfidf matrix: ", data_tfidf.shape)

  soup = BeautifulSoup(summary, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'
  soup = BeautifulSoup(summary, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'


Printing top 5 rows of dataframe showing original and cleaned summaries....
                                                                                                            tweet_text  \
0                                                        In other words #katandandre, your food was crapilicious! #mkr   
1  Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc   
2                                                         @XochitlSuckkks a classy whore? Or more red velvet cupcakes?   
3              @Jason_Gio meh. :P  thanks for the heads up, but not too concerned about another angry dude on twitter.   
4              @RudhoeEnglish This is an ISIS account pretending to be a Kurdish account.  Like Islam, it is all lies.   

                                                     cleaned_tweets  
0                                        [word, food, crapilicious]  
1                                                  

In [None]:
print("Implementing SVC.....")
# Implementing Support Vector Classifier
svc_clf = LinearSVC() # linear SVM; C is left at its default of 1.0

# Running cross-validation
# NOTE(review): five of the six source categories were mapped to class 1, so the
# positive class may dominate - consider reporting F1 or balanced accuracy too.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores = []
iteration = 0
for iteration, (train_index, test_index) in enumerate(kf.split(data, Y), start=1):
    print("Iteration ", iteration)
    # Re-fit a vectorizer with the same settings as `tfidf` on the training fold
    # only, so the held-out fold contributes nothing to the vocabulary or the IDF
    # weights (fitting on all rows first would leak test information).
    fold_tfidf = TfidfVectorizer(**tfidf.get_params())
    X_train = fold_tfidf.fit_transform(data.iloc[train_index])
    X_test = fold_tfidf.transform(data.iloc[test_index])
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    svc_clf.fit(X_train, Y_train) # Fitting SVC
    Y_pred = svc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
svc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", svc_mean_accuracy)

Implementing SVC.....
Iteration  1
Cross-validation accuracy:  0.8567493112947658
Iteration  2
Cross-validation accuracy:  0.8635304089849545
Iteration  3
Cross-validation accuracy:  0.8626827717736809
Iteration  4
Cross-validation accuracy:  0.8550540368722187
Iteration  5
Cross-validation accuracy:  0.8546302182665819
Iteration  6
Cross-validation accuracy:  0.859504132231405
Iteration  7
Cross-validation accuracy:  0.8548421275694003
Iteration  8
Cross-validation accuracy:  0.8554472233997457
Iteration  9
Cross-validation accuracy:  0.8598982619754133
Iteration  10
Cross-validation accuracy:  0.8552352691818568
Mean cross-validation accuracy:  0.8577573761550024


In [None]:
print("Implementing NBC.....")
# Implementing Naive Bayes Classifier
nbc_clf = MultinomialNB()

# Running cross-validation
# NOTE(review): five of the six source categories were mapped to class 1, so the
# positive class may dominate - consider reporting F1 or balanced accuracy too.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores = []
iteration = 0
for iteration, (train_index, test_index) in enumerate(kf.split(data, Y), start=1):
    print("Iteration ", iteration)
    # Re-fit a vectorizer with the same settings as `tfidf` on the training fold
    # only, so the held-out fold contributes nothing to the vocabulary or the IDF
    # weights (fitting on all rows first would leak test information).
    fold_tfidf = TfidfVectorizer(**tfidf.get_params())
    X_train = fold_tfidf.fit_transform(data.iloc[train_index])
    X_test = fold_tfidf.transform(data.iloc[test_index])
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    nbc_clf.fit(X_train, Y_train) # Fitting NBC
    Y_pred = nbc_clf.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
nbc_mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", nbc_mean_accuracy)

Implementing NBC.....
Iteration  1
Cross-validation accuracy:  0.8609874973511337
Iteration  2
Cross-validation accuracy:  0.8639542275905913
Iteration  3
Cross-validation accuracy:  0.8582326764144946
Iteration  4
Cross-validation accuracy:  0.8611994066539521
Iteration  5
Cross-validation accuracy:  0.8603517694426785
Iteration  6
Cross-validation accuracy:  0.8622589531680441
Iteration  7
Cross-validation accuracy:  0.8531468531468531
Iteration  8
Cross-validation accuracy:  0.8592623993217465
Iteration  9
Cross-validation accuracy:  0.8548113607460789
Iteration  10
Cross-validation accuracy:  0.8571428571428571
Mean cross-validation accuracy:  0.8591348000978429
