<a href="https://colab.research.google.com/github/Faizul6/Detecting_Cyber_Bullying-Twitter_Tweets-Logistic_Regression-/blob/main/%7CDetecting_Cyber_Bullying%7CTwitter_Tweets%7CLogistic_Regression%7C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np

#basic preprocessing
import string
import nltk

#advanced preprocessing
from nltk.corpus import stopwords
from sklearn import preprocessing

#feature engineering
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

#train test split
from sklearn.model_selection import train_test_split

#importing logistic regression based on codes done in kaggle
from sklearn.linear_model import LogisticRegression

#using basic accuracy score to evaluate
from sklearn.metrics import accuracy_score

# Reading Files

In [2]:
df= pd.read_csv("cyberbullying_tweets.csv")
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:
df.shape

(47692, 2)

# Basic Preprocessing

In [4]:
df['cyberbullying_type'].value_counts()

religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

In [5]:
#removing gender labeled columns
df=df.drop(index=range(7945,15921))
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [6]:
df['cyberbullying_type'].value_counts()

religion               7995
age                    7992
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

In [7]:
df.shape

(39716, 2)

In [8]:
# Checking if there is any null items
df.isnull().sum()

tweet_text            0
cyberbullying_type    0
dtype: int64

In [9]:
# Checking if there is any duplicate items
df.duplicated().sum()
#Fact:There is Duplicates,Removing these duplicates
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

# Advanced Preprocessing

In [10]:
#String Lowercasing
df['tweet_text']=df['tweet_text'].str.lower()
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"in other words #katandandre, your food was cra...",not_cyberbullying
1,why is #aussietv so white? #mkr #theblock #ima...,not_cyberbullying
2,@xochitlsuckkks a classy whore? or more red ve...,not_cyberbullying
3,"@jason_gio meh. :p thanks for the heads up, b...",not_cyberbullying
4,@rudhoeenglish this is an isis account pretend...,not_cyberbullying


In [11]:
#Removing Punctuation
exclude=string.punctuation
def remove_punc(text):
    return text.translate(str.maketrans(' ',' ',exclude))

df['tweet_text']=df['tweet_text'].apply(remove_punc)
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,in other words katandandre your food was crapi...,not_cyberbullying
1,why is aussietv so white mkr theblock imaceleb...,not_cyberbullying
2,xochitlsuckkks a classy whore or more red velv...,not_cyberbullying
3,jasongio meh p thanks for the heads up but no...,not_cyberbullying
4,rudhoeenglish this is an isis account pretendi...,not_cyberbullying


In [12]:
#Removing stopwords "English" as the data set is in English
nltk.download('stopwords')
sw_list=stopwords.words('english')
df['tweet_text']=df['tweet_text'].apply(lambda x:[item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,tweet_text,cyberbullying_type
0,words katandandre food crapilicious mkr,not_cyberbullying
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,xochitlsuckkks classy whore red velvet cupcakes,not_cyberbullying
3,jasongio meh p thanks heads concerned another ...,not_cyberbullying
4,rudhoeenglish isis account pretending kurdish ...,not_cyberbullying


In [13]:
#Spliting into columns
x=df.iloc[:,0:1]
y=df['cyberbullying_type']

In [14]:
x.head()

Unnamed: 0,tweet_text
0,words katandandre food crapilicious mkr
1,aussietv white mkr theblock imacelebrityau tod...
2,xochitlsuckkks classy whore red velvet cupcakes
3,jasongio meh p thanks heads concerned another ...
4,rudhoeenglish isis account pretending kurdish ...


In [15]:
y.head()

0    not_cyberbullying
1    not_cyberbullying
2    not_cyberbullying
3    not_cyberbullying
4    not_cyberbullying
Name: cyberbullying_type, dtype: object

# Label Encoding

In [16]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Train-Test Split

In [17]:
X_train,X_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=1)
X_train.shape

(31764, 1)

# TF-IDF Vectorization

In [18]:
tfidf=TfidfVectorizer()
X_train_TFIDF=tfidf.fit_transform(X_train['tweet_text']).toarray()
X_test_TFIDF=tfidf.transform(X_test['tweet_text']).toarray()
X_train_TFIDF.shape

(31764, 50433)

# Logistic Regression Model

In [19]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_TFIDF,y_train)

# Making Predictions and Calculating basic Accuracy metrics to evaluate model

In [20]:
y_pred = model.predict(X_test_TFIDF)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8362926583553708
