In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:

file_path = '/content/cyberbullying_tweets.csv'
df = pd.read_csv(file_path)


df.head()


Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [3]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words and word.isalnum()]
    return ' '.join(words)

# Apply the preprocessing to the 'tweet_text' column
df['cleaned_text'] = df['tweet_text'].apply(preprocess_text)

# Display the first few rows after preprocessing
df[['tweet_text', 'cleaned_text']].head()


Unnamed: 0,tweet_text,cleaned_text
0,"In other words #katandandre, your food was cra...",words katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,xochitlsuckkks classy whore red velvet cupcakes
3,"@Jason_Gio meh. :P thanks for the heads up, b...",meh p thanks heads concerned another angry dud...
4,@RudhoeEnglish This is an ISIS account pretend...,rudhoeenglish isis account pretending kurdish ...


In [4]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text data
X = vectorizer.fit_transform(df['cleaned_text'])

# Define the target variable
y = df['cyberbullying_type']  # Assuming 'cyberbullying_type' is the target column

# Encode target labels if necessary
y = pd.get_dummies(y, drop_first=True)

# Display the shape of X and y
X.shape, y.shape


((47692, 5000), (47692, 5))

In [6]:
# Define the target variable as a single column
y = df['cyberbullying_type']  # No need for pd.get_dummies

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000, multi_class='auto', solver='lbfgs')

# Train the model
model.fit(X_train, y_train)

# Predictions on the test set
y_pred = model.predict(X_test)

# Classification Report
print(classification_report(y_test, y_pred))


                     precision    recall  f1-score   support

                age       0.96      0.97      0.97      1603
          ethnicity       0.98      0.97      0.97      1603
             gender       0.90      0.82      0.86      1531
  not_cyberbullying       0.60      0.58      0.59      1624
other_cyberbullying       0.61      0.69      0.65      1612
           religion       0.95      0.94      0.94      1566

           accuracy                           0.83      9539
          macro avg       0.83      0.83      0.83      9539
       weighted avg       0.83      0.83      0.83      9539



In [7]:
# Classification Report
print(classification_report(y_test, y_pred))


                     precision    recall  f1-score   support

                age       0.96      0.97      0.97      1603
          ethnicity       0.98      0.97      0.97      1603
             gender       0.90      0.82      0.86      1531
  not_cyberbullying       0.60      0.58      0.59      1624
other_cyberbullying       0.61      0.69      0.65      1612
           religion       0.95      0.94      0.94      1566

           accuracy                           0.83      9539
          macro avg       0.83      0.83      0.83      9539
       weighted avg       0.83      0.83      0.83      9539



In [8]:
import joblib

# Save the trained model
joblib.dump(model, 'cyberbullying_detection_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']