In [66]:
!pip install --upgrade numpy pandas matplotlib seaborn gensim scikit-learn tqdm nltk scipy joblib gdown --quiet

In [67]:
import zipfile
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import gdown  # imported in this cell so it runs on a fresh kernel

# Google Drive id of the zipped cyberbullying dataset.
file_id = "1qf2VHJfHMNzUKpy7KxkadqLb8zWAqxDD"
# Trailing slash: the archive is saved inside this directory under its remote file name.
output_path = "/content/"
downloaded_path = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)
if downloaded_path is None:
    # Do not report success when nothing was downloaded.
    raise RuntimeError("Dataset download failed; check the Drive file id and its sharing settings.")
print("Download complete!")


In [None]:
!unzip cyberbullying_dataset.zip -d ./

In [None]:
imported_df = pd.read_csv('/content/cyberbullying_dataset.csv')
imported_df.columns = imported_df.columns.str.lower()
imported_df.head()

In [None]:
imported_df.shape

# **Exploratory Data Analysis**

In [None]:
imported_df.info()

In [None]:
# dropping the 'miscellaneous' column as it has a lot of null values
new_df = imported_df.drop(columns=['miscellaneous'])
new_df.columns

In [None]:
# check if labels are balanced or imbalanced
new_df['label'].value_counts()

In [None]:
new_df['label'] = new_df['label'].replace({'hatespeech': 'offensive'})
new_df['label'].value_counts()

# **EDA => Univariate Analysis**

**Conclusions**
- The data is slightly imbalanced: the two labels, Normal and Offensive, account for 38.9% and 61.1% of the rows respectively
- The Gender, Religion and Sexual Orientation columns have more than 75% of their cells filled with unspecified information
- The Race column might correlate with the target label, because less than 70% of its values are unspecified (i.e. more than 30% are specified).

In [None]:
new_df.columns

In [None]:
new_df['label'].value_counts()

In [None]:
new_df['race'].value_counts()

In [None]:
new_df['religion'].value_counts()

In [None]:
new_df['gender'].value_counts()

In [None]:
new_df['sexual orientation'].value_counts()

In [None]:
# Value counts of the target and of each demographic column, keyed by plot title.
column_by_title = {
    "Labels": 'label',
    "Gender": 'gender',
    "Race": 'race',
    "Religion": 'religion',
    "Sexual Orientation": 'sexual orientation',
}
categories = {title: new_df[column].value_counts() for title, column in column_by_title.items()}

# One row per category: bar chart on the left, pie chart on the right.
fig, ax = plt.subplots(5, 2, figsize=(14, 25))

for (bar_ax, pie_ax), (category, data) in zip(ax, categories.items()):
    data.plot(kind='bar', ax=bar_ax)
    bar_ax.set_title(f"{category} Distribution")
    bar_ax.set_xlabel("")
    bar_ax.tick_params(axis='x', rotation=0)
    pie_ax.pie(data, labels=data.index, autopct='%1.1f%%', startangle=90)

plt.tight_layout()
plt.show()

# **EDA => Bivariate Analysis**

- We can see clear correlation of different columns with target label
- In the gender column, the ratio of hate speech for Women is much greater than for Men
- In the religion column, Jewish, Hindu and Islam have a higher percentage of offensive comments than of normal comments
- The ratio of offensive comments for Indians, Arabs and Africans is higher
- In the orientation column, Bi-sexuals have a higher ratio of hate speech than of normal comments

In [None]:
gender_label_relation = pd.crosstab(new_df['label'], new_df['gender'], normalize='columns') * 100
gender_label_relation

In [None]:
religion_label_relation = pd.crosstab(new_df['label'], new_df['religion'], normalize='columns') * 100
religion_label_relation

In [None]:
race_label_relation = pd.crosstab(new_df['label'], new_df['race'], normalize='columns') * 100
race_label_relation

In [None]:
orientation_label_relation = pd.crosstab(new_df['label'], new_df['sexual orientation'], normalize='columns') * 100
orientation_label_relation

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(14, 12))

# (title, crosstab) pairs in row-major panel order: top-left, top-right, bottom-left, bottom-right.
heatmap_panels = [
    ("Gender vs Label", gender_label_relation),
    ("Race vs Label", race_label_relation),
    ("Religion vs Label", religion_label_relation),
    ("Sexual Orientation vs Label", orientation_label_relation),
]

for panel_ax, (panel_title, relation) in zip(ax.flat, heatmap_panels):
    sns.heatmap(relation, ax=panel_ax, cmap='Blues')
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# **Preprocessing and Feature Engineering**
- basic preprocessing
- tokenization
- combine all columns into 1 column

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a raw comment into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the result is tokenized with ``word_tokenize``,
    tokens found in ``stop_words`` are dropped and the remaining tokens are
    lemmatized with ``lemmatizer``.

    Args:
        text: The raw comment as a string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(kept_lemmas)


print(preprocess_text("it's just testing. I am Muhammad Hassaan Maqbool conducting the test"))

In [None]:
processed_df = new_df.copy()
# Append the demographic columns to the comment so a single text field carries all features.
processed_df['comment'] = (
    processed_df['comment']
    + ' ' + processed_df['race']
    + ' ' + processed_df['religion']
    + ' ' + processed_df['gender']
    + ' ' + processed_df['sexual orientation']
)
# .copy() makes the two-column frame independent, so the column assignments
# below (and in later cells) do not act on a slice of another DataFrame.
processed_df = processed_df[['comment', 'label']].copy()
processed_df['comment'] = processed_df['comment'].apply(preprocess_text)
processed_df.head()

In [None]:
processed_df['label'] = processed_df['label'].map({'normal': 0, 'offensive': 1})
processed_df['label'].value_counts()

# **Modeling and Evaluation**

In [None]:
from scipy.sparse import csr_matrix

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [None]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test, just_evaluate = False):
    """Optionally fit ``model``, then score it on the test split and print a report.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features (only used when fitting).
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels (only used when fitting).
        y_test: True labels of the test split.
        just_evaluate: When truthy the model is assumed to be fitted already
            and ``fit`` is skipped.

    Returns:
        Tuple ``(accuracy, conf_matrix, precision)`` computed on the test split.
    """
    if not just_evaluate:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print("-" * 50)

    return accuracy, conf_matrix, precision

In [None]:
def tune_model_random_search(model, param_dist, x_train, y_train, n_iter=10, cv=3, n_jobs=-1, verbose=2, random_state=42):
    """Run a randomized hyper-parameter search and return the winning estimator.

    Args:
        model: Estimator to tune.
        param_dist: Mapping of parameter names to candidate values.
        x_train: Training features used for the cross-validated search.
        y_train: Training labels.
        n_iter: Number of parameter settings sampled.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        n_jobs: Parallel jobs forwarded to ``RandomizedSearchCV``.
        verbose: Verbosity forwarded to ``RandomizedSearchCV``.
        random_state: Seed forwarded to ``RandomizedSearchCV``.

    Returns:
        Tuple ``(best_model, best_params)`` taken from the fitted search.
    """
    search_settings = dict(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    fitted_search = RandomizedSearchCV(**search_settings)
    fitted_search.fit(x_train, y_train)
    return fitted_search.best_estimator_, fitted_search.best_params_

In [None]:
def train_and_evaluate_all_models(x_train, x_test, y_train, y_test):
    """Fit and report every baseline classifier on the given split.

    Trains RandomForest, LogisticRegression, SVC, GaussianNB and XGBoost with
    their default settings via ``train_and_evaluate``, which prints the
    metrics for each model.

    Args:
        x_train: Training features, sparse or dense.
        x_test: Test features, sparse or dense.
        y_train: Training labels.
        y_test: Test labels.
    """
    models = {
        'rfc': RandomForestClassifier(n_jobs=-1),
        'lg': LogisticRegression(n_jobs=-1),
        'svc': SVC(),
        'gnb': GaussianNB(),
        'xgb': XGBClassifier(n_jobs=-1),
    }

    def _to_dense(features):
        # Densify sparse matrices and pass already-dense inputs through
        # unchanged, so the function accepts both kinds of features.
        return features.toarray() if hasattr(features, 'toarray') else features

    for model_name, model in tqdm(models.items(), desc="Training models", total=len(models)):
        print(f"Evaluating model: {model.__class__.__name__}")
        if model_name == 'gnb':
            # GaussianNB is the only model here that is fed dense arrays.
            train_and_evaluate(model, _to_dense(x_train), _to_dense(x_test), y_train, y_test, False)
        else:
            train_and_evaluate(model, x_train, x_test, y_train, y_test, False)

# **Using TF-IDF Vectorizer**
**Conclusions**
- XGBoost, RandomForest and SVC performed really well, with accuracy and precision between 84% and 87%
- Naive Bayes performed poorly, with accuracy and precision of approximately 50% and 69% respectively
- Given their initial performance, hyperparameter tuning was tried on XGBoost, Random Forest and SVC. Multiple iterations with randomized hyperparameters were run to find the best combination, but no further improvement was found

In [None]:
tfidf = TfidfVectorizer()

comments, y = processed_df['comment'], processed_df['label']

# Split the raw text first so the TF-IDF vocabulary and IDF weights are learned
# from the training comments only; fitting on the full corpus leaks test data.
# random_state matches the other splits in this notebook for reproducibility.
comments_train, comments_test, y_train, y_test = train_test_split(
    comments, y, test_size=0.2, random_state=64
)
x_train = tfidf.fit_transform(comments_train)
x_test = tfidf.transform(comments_test)

In [None]:
train_and_evaluate_all_models(x_train, x_test, y_train, y_test)

**Hyperparameter Tuning**
Performing tuning on the following algorithms
- Random Forest Classifier
- XGBoost
- SVC

**Random Forest Hyperparameter Tuning**
- Tried multiple iterations with random parameters but didn't see any improvement over the default configuration

In [None]:
# Candidate values sampled by the randomized search for the random forest.
param_rfc = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_features': [None, 'sqrt'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Uses the helper's defaults (n_iter=10, cv=3) and returns the best estimator plus its parameters.
best_model_rfc, best_params_rfc = tune_model_random_search(RandomForestClassifier(), param_rfc, x_train, y_train)

print("Best Parameters:", best_params_rfc)

# just_evaluate=True: score the estimator returned by the search without calling fit again.
train_and_evaluate(best_model_rfc, x_train, x_test, y_train, y_test, True)

**XGBoost Hyperparameter Tuning**

In [None]:
# Candidate values sampled by the randomized search for XGBoost.
param_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}

# Uses the helper's defaults (n_iter=10, cv=3) and returns the best estimator plus its parameters.
best_model_xgb, best_params_xgb = tune_model_random_search(XGBClassifier(), param_xgb, x_train, y_train)

print("Best Parameters:", best_params_xgb)

# just_evaluate=True: score the estimator returned by the search without calling fit again.
train_and_evaluate(best_model_xgb, x_train, x_test, y_train, y_test, True)

**SVC Hyperparameter Tuning**

In [None]:
# Candidate values sampled by the randomized search for the SVC.
# NOTE(review): 'degree' presumably only matters for the 'poly' kernel, so some
# sampled combinations may be effectively equivalent - confirm against the SVC docs.
param_svc = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
    'class_weight': [None, 'balanced']
}

# Uses the helper's defaults (n_iter=10, cv=3) and returns the best estimator plus its parameters.
best_model_svc, best_params_svc = tune_model_random_search(SVC(), param_svc, x_train, y_train)

print("Best Parameters:", best_params_svc)

# just_evaluate=True: score the estimator returned by the search without calling fit again.
train_and_evaluate(best_model_svc, x_train, x_test, y_train, y_test, True)

# **Using Word2Vec**
**Conclusion**
- Couldn't find any improvements as compared to TF-IDF approach
- The maximum accuracy and precision achieved were 80% and 83% respectively, whereas with TF-IDF both accuracy and precision were above 84%

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
def get_comment_vector_from_words(comment, model):
    """Average the Word2Vec vectors of the tokens in ``comment``.

    Args:
        comment: Iterable of tokens.
        model: Object exposing ``wv`` (token lookup and membership test) and
            ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    known_vectors = [model.wv[token] for token in comment if token in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Take an independent copy: columns are added to this frame below, and writing
# into a slice of processed_df would trigger SettingWithCopyWarning.
word2vec_df = processed_df[['comment', 'label']].copy()
# Word2Vec expects token lists, so split each cleaned comment back into tokens.
word2vec_df['comment'] = word2vec_df['comment'].apply(word_tokenize)
word2vec_df.head()

In [None]:
# Train a Word2Vec model on the tokenized comments (sg=1, context window of 5,
# min_count=1 so no token is discarded for being rare).
# NOTE(review): no seed is passed, so the embeddings presumably differ between runs - confirm.
model = Word2Vec(sentences = word2vec_df['comment'], window=5, min_count=1, workers=4, sg=1)

In [None]:
word2vec_df['wv_vectors'] = word2vec_df['comment'].apply(lambda x: get_comment_vector_from_words(x, model))

In [None]:
x, y = csr_matrix(np.vstack(word2vec_df['wv_vectors'])), word2vec_df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=64)

In [None]:
train_and_evaluate_all_models(x_train, x_test, y_train, y_test)

# **GloVe (Twitter Pretrained Model)**
**Conclusion**
- Couldn't see any further improvement, even after using both the Twitter and the Google News corpus pretrained models.
- TF-IDF outperformed both the custom Word2Vec model and GloVe

In [None]:
def get_comment_vector_from_glove(comment, embeddings):
    """Average the pretrained vectors of the tokens in ``comment``.

    Args:
        comment: Iterable of tokens.
        embeddings: Token-to-vector lookup supporting ``in`` and indexing,
            e.g. a plain ``dict`` or an object exposing ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all known tokens, or a zero
        vector with the embeddings' own dimensionality when no token is known.

    Raises:
        ValueError: If no token is known and the dimensionality cannot be
            inferred because ``embeddings`` is empty.
    """
    vectors = [embeddings[word] for word in comment if word in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)

    # The fallback must match the size of THESE embeddings (not of an unrelated
    # global model), otherwise stacking the comment vectors later fails.
    vector_size = getattr(embeddings, 'vector_size', None)
    if vector_size is None:
        if len(embeddings) == 0:
            raise ValueError("Cannot infer the vector size from empty embeddings.")
        vector_size = len(next(iter(embeddings.values())))
    return np.zeros(vector_size)

In [None]:
def load_glove_embeddings(file_path):
    """Load a GloVe text file into a ``{token: vector}`` dictionary.

    Each non-empty line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Args:
        file_path: Path to the UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        Dict mapping each token to a ``float32`` numpy array.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the literal space only: a bare split() also splits on
            # Unicode whitespace, which drops whitespace-like tokens and
            # shifts the first number into the token position.
            parts = line.rstrip().split(' ')
            if len(parts) < 2:
                continue  # blank or malformed line without any vector component
            embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings

**Twitter 27B Corpus**

In [None]:
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [None]:
!unzip glove.twitter.27B.zip -d ./glove_twitter

In [None]:
twitter_glove_path = "glove_twitter/glove.twitter.27B.100d.txt"
twitter_glove_embeddings = load_glove_embeddings(twitter_glove_path)

print(f"Loaded {len(twitter_glove_embeddings)} word vectors.")

In [None]:
word2vec_df['glove_twitter_vectors'] = word2vec_df['comment'].apply(lambda x: get_comment_vector_from_glove(x, twitter_glove_embeddings))

In [None]:
x, y = csr_matrix(np.vstack(word2vec_df['glove_twitter_vectors'])), word2vec_df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=64)

In [None]:
train_and_evaluate_all_models(x_train, x_test, y_train, y_test)

**Google News Corpus (300-dimensional Word2Vec vectors)**

In [None]:
import gdown

# Google Drive id of the zipped GoogleNews vectors archive.
file_id = "1elKVMmcGpvNZJvxwg9enc2B_lOSCjP1t"
output_path = "/content/GoogleNews-vectors-negative300.bin.zip"
downloaded_path = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)
if downloaded_path is None:
    # Do not report success when nothing was downloaded.
    raise RuntimeError("GoogleNews vectors download failed; check the Drive file id and its sharing settings.")
print("Download complete!")

In [None]:
!unzip GoogleNews-vectors-negative300.bin.zip -d ./

In [None]:
google_new_model_path = "GoogleNews-vectors-negative300.bin"
google_news_model = gensim.models.KeyedVectors.load_word2vec_format(google_new_model_path, binary=True)

In [None]:
google_news_model.most_similar("king")

In [None]:
word2vec_df['glove_google_news_vectors'] = word2vec_df['comment'].apply(lambda x: get_comment_vector_from_glove(x, google_news_model))

In [None]:
# Build the features from the Google News vectors computed in the previous cell
# (the Twitter GloVe column was used here by mistake, repeating that experiment).
x, y = csr_matrix(np.vstack(word2vec_df['glove_google_news_vectors'])), word2vec_df['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=64)

In [None]:
train_and_evaluate_all_models(x_train, x_test, y_train, y_test)