In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# through the local filesystem for the cells that follow.
# NOTE: the google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the comments dataset from the mounted Drive folder into a DataFrame.
# The path is absolute and assumes Drive is mounted at /content/drive.
df = pd.read_csv('/content/drive/MyDrive/datasets/CyberBullying Comments Dataset.csv')

In [None]:
# Data exploration: preview the rows, then summarise nulls and label balance
preview_rows = df.head()
print(preview_rows)

missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

label_counts = df['CB_Label'].value_counts()
print("\nClass distribution:\n", label_counts)

                                                Text  CB_Label
0  damn there is someones nana up here at beach w...         0
1  no kidding! dick clark was a corpse mechanical...         0
2  i read an article on jobros and thought damn w...         0
3  I got one fucking day of sprinkles and now it'...         0
4  I was already listening to Elliott smith  and ...         0

Missing values:
 Text        0
CB_Label    0
dtype: int64

Class distribution:
 CB_Label
0    5550
1    5550
Name: count, dtype: int64


Neither column contains missing values, and the two classes are perfectly balanced (5,550 comments each).

In [None]:
# Preprocessing function
_STOP_WORDS = None  # lazily-built cache of NLTK's English stop words


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Fetch the tokenizer models and stop-word corpus so the notebook also
        # runs on a fresh kernel. 'punkt_tab' is what newer NLTK releases look
        # up for word_tokenize; 'punkt' covers older ones. quiet=True keeps the
        # cell output clean and does not raise if a resource is unavailable.
        for resource in ('punkt', 'punkt_tab', 'stopwords'):
            nltk.download(resource, quiet=True)
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one comment for vectorisation.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, tokenize with NLTK, and drop English stop words.

    Args:
        text: The raw comment string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Look the stop words up once per call (cached across calls) instead of
    # re-reading the corpus list for every token.
    stop_words = _english_stop_words()

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)

    # NOTE(review): punctuation is stripped before matching, so contractions
    # such as "don't" become "dont" and are not removed as stop words.
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every comment element-wise and keep the result next to the raw text
cleaned_comments = df['Text'].map(preprocess_text)
df['cleaned_text'] = cleaned_comments

# Preview the frame with the new column
preview_rows = df.head()
print(preview_rows)

                                                Text  CB_Label  \
0  damn there is someones nana up here at beach w...         0   
1  no kidding! dick clark was a corpse mechanical...         0   
2  i read an article on jobros and thought damn w...         0   
3  I got one fucking day of sprinkles and now it'...         0   
4  I was already listening to Elliott smith  and ...         0   

                                        cleaned_text  
0  damn someones nana beach one dont think ic ste...  
1  kidding dick clark corpse mechanically operate...  
2  read article jobros thought damn cash jobro po...  
3  got one fucking day sprinkles back sunshine do...  
4  already listening elliott smith fucking hate k...  


In [None]:
# Split data
# Features are the preprocessed comment strings; the original `X = X` referenced
# an undefined name and failed on a fresh kernel.
X = df['cleaned_text']
y = df['CB_Label']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# TF-IDF Vectorization
# Cap the vocabulary at 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)
# Learn the vocabulary and IDF weights from the training split only, then encode it.
X_train_tfidf = vectorizer.fit_transform(X_train)
# Encode the test split with the already-fitted vectorizer (no refitting on test data).
X_test_tfidf = vectorizer.transform(X_test)

Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
nb_clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out comments
y_pred_nb = nb_clf.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
print("Naive Bayes Performance:")
nb_report = classification_report(y_test, y_pred_nb)
print(nb_report)

Naive Bayes Performance:
              precision    recall  f1-score   support

           0       0.70      0.73      0.71      1128
           1       0.71      0.67      0.69      1092

    accuracy                           0.70      2220
   macro avg       0.70      0.70      0.70      2220
weighted avg       0.70      0.70      0.70      2220



In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the TF-IDF training matrix
# (max_iter raised to 1000, seed fixed at 42)
lr_clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)

# Predict labels for the held-out comments
y_pred_lr = lr_clf.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
print("Logistic Regression Performance:")
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.71      0.78      0.74      1128
           1       0.74      0.67      0.71      1092

    accuracy                           0.73      2220
   macro avg       0.73      0.72      0.72      2220
weighted avg       0.73      0.73      0.72      2220

