# Import essential libraries

In [4]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import joblib
import warnings
import gc
import wget

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Input, Dropout, BatchNormalization
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

from wordcloud import WordCloud

# Suppress warnings
warnings.filterwarnings('ignore')

# Download the helper script (once) and make it importable.
import os
import sys

HELPER_URL = "https://raw.githubusercontent.com/yogawicaksana/helper_prabowo/main/helper_prabowo_ml.py"
HELPER_PATH = "../Artifacts/helper_prabowo_ml.py"

# The target directory must exist before the download can write into it.
os.makedirs(os.path.dirname(HELPER_PATH), exist_ok=True)

# Skip the network round-trip when an earlier run already fetched the file.
# NOTE(review): this imports code fetched from a mutable branch of a remote
# repository -- consider pinning a commit hash or vendoring the module.
if not os.path.exists(HELPER_PATH):
    wget.download(HELPER_URL, out=HELPER_PATH)

# The script lives under ../Artifacts, which is not on the import path by
# default; append it (lowest priority) so the import below can find the module.
helper_dir = os.path.dirname(os.path.abspath(HELPER_PATH))
if helper_dir not in sys.path:
    sys.path.append(helper_dir)

# Import the helper functions
from helper_prabowo_ml import (
    clean_html, punct, remove_digits, remove_links,
    remove_special_characters, remove_, removeStopWords,
    lower, email_address, non_ascii
)

# Data Exploration

In [None]:
# Load the raw dataset and draw a reproducible working sample.
df = pd.read_csv('../Dataset/OriginalDataset.csv')
# Cap the sample size at the number of available rows (sampling more rows than
# exist without replacement raises) and fix the seed so every run works on the
# same subset; 101 is the seed this notebook also uses for the train/test split.
df = df.sample(n=min(20000, len(df)), random_state=101).reset_index(drop=True)
df.head()

In [None]:
# (rows, columns) of the sampled frame.
df.shape

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics (pandas default column selection).
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Count whitespace-separated tokens per text. Calling len() on the string
# itself would give the character count, not the number of words.
df['num_words'] = df['text'].apply(lambda text: len(text.split()))
df['num_words'].describe()

In [None]:
# Bar chart of how many rows carry each label.
ax = sns.countplot(data=df, x='label', palette='Set2')
ax.set_title('Class Distribution')
ax.set_xlabel('Categories')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Character length of every text, followed by its histogram with a KDE overlay.
df['tweet_length'] = df['text'].map(len)

ax = sns.histplot(df['tweet_length'], color='purple', kde=True)
ax.set_title('Tweet Distribution Analysis')
ax.set_xlabel('Tweet Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Spread of text length within each label.
ax = sns.boxplot(data=df, x='label', y='tweet_length', palette='Set3')
ax.set_title('Tweet Distribution Per Category')
ax.set_xlabel('Categories')
ax.set_ylabel('Tweet Length')
plt.show()

In [None]:
# Maximum sequence length in tokens. The same value is assigned again right
# before tokenization and also sizes the model's Input layers.
max_length = 85

In [None]:
def generate_wordcloud_2x2(df):
    """Draw one word cloud per label on a fixed 2x2 grid of subplots.

    Args:
        df: DataFrame with a 'label' column and a string-valued 'text' column.

    Raises:
        IndexError: If ``df['label']`` holds more than four distinct values,
            because the grid only has four panels.
    """
    categories = df['label'].unique()
    fig, axs = plt.subplots(2, 2, figsize=(10, 6))
    axs = axs.flatten()

    # Hide every panel up front so slots without a category (fewer than four
    # labels) stay blank instead of showing an empty set of axes with ticks.
    for panel in axs:
        panel.axis('off')

    for slot, category in enumerate(categories):
        # All texts of this category are merged into one corpus string.
        category_texts = df.loc[df['label'] == category, 'text']
        cloud = WordCloud(
            width=800,
            height=400,
            max_words=200,
            background_color='white',
        ).generate(' '.join(category_texts))
        axs[slot].imshow(cloud, interpolation='bilinear')
        axs[slot].set_title(f'WORD CLOUD FOR {category}')

    plt.tight_layout()
    plt.show()

generate_wordcloud_2x2(df)

In [None]:
# Bag-of-words restricted to a 20-term vocabulary with English stop words removed.
vectorizer = CountVectorizer(max_features=20, stop_words='english')
X = vectorizer.fit_transform(df['text'])

# Column sums of the document-term matrix give the corpus-wide count per term.
common_words = vectorizer.get_feature_names_out()
word_counts = np.asarray(X.sum(axis=0)).ravel()
common_word_df = pd.DataFrame({'word': common_words, 'count': word_counts})

# Most frequent term first.
ranked_words = common_word_df.sort_values(by='count', ascending=False)
ax = sns.barplot(data=ranked_words, x='count', y='word', palette='viridis')
ax.set_title('Top 20 Most Common Words In Tweets')
ax.set_xlabel('Count')
ax.set_ylabel('Words')
plt.show()

# Data Preprocessing

In [None]:
def clean_text(data, col):
    """Run every helper cleaning function over one text column, in place.

    Each helper is applied element-wise to ``data[col]`` in the order listed
    below, and the column is overwritten after every step.

    Args:
        data: DataFrame to clean; it is modified in place.
        col: Name of the text column to clean.

    Returns:
        The same ``data`` object that was passed in.
    """
    # NOTE(review): the order is significant. `lower` runs last (after
    # removeStopWords) and `clean_html` / `email_address` run after the
    # special-character and punctuation steps -- confirm against the helper
    # implementations that this ordering is intended.
    cleaning_steps = (
        remove_,
        remove_digits,
        remove_links,
        remove_special_characters,
        removeStopWords,
        punct,
        email_address,
        non_ascii,
        clean_html,
        lower,
    )
    for step in cleaning_steps:
        data[col] = data[col].apply(func=step)
    return data

In [None]:
# clean_text overwrites the 'text' column on `df` itself and returns that same
# object, so `preprocessed_df` and `df` refer to one DataFrame from here on.
preprocessed_df = clean_text(df,'text')
preprocessed_df.head()

In [None]:
# One Hugging Face checkpoint provides both the tokenizer and the classifier.
checkpoint = 'sreeniketh/cyberbullying_sentiment_dsce_2023'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# from_pt=True asks transformers to build the TensorFlow model from PyTorch weights.
distilbert = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, from_pt=True)

# Data Preparation

In [None]:
# Hold out 30% of the cleaned rows as the test split; the fixed random_state
# makes the split repeatable for a given input frame.
train_df, test_df = train_test_split(preprocessed_df,test_size=0.3,random_state=101)

# Tokenize text data

In [None]:
max_length = 85


def encode_texts(texts):
    """Tokenize a list of strings into fixed-width TensorFlow tensors.

    Args:
        texts: List of raw strings to encode.

    Returns:
        The tokenizer output holding 'input_ids' and 'attention_mask', each
        exactly ``max_length`` tokens wide.
    """
    return tokenizer(text=texts,
                     max_length=max_length,
                     # Pad to exactly max_length. padding=True would only pad
                     # to the longest sequence in the call, which can be
                     # shorter than max_length and differ between the train
                     # and test calls, while the model's Input layers are
                     # declared with a fixed width of max_length.
                     padding='max_length',
                     truncation=True,
                     add_special_tokens=True,
                     return_tensors='tf',
                     return_attention_mask=True,
                     return_token_type_ids=False,
                     verbose=1)


X_train = encode_texts(train_df['text'].tolist())
X_test = encode_texts(test_df['text'].tolist())

# Encode labels and store mapping  

In [None]:
# Fit the encoder on the training labels only, then reuse it for the test labels.
encoder = LabelEncoder()
train_df['label'] = encoder.fit_transform(train_df['label'])
test_df['label'] = encoder.transform(test_df['label'])

# Integer code -> original class name, in the encoder's class order.
encoded_labels = dict(enumerate(encoder.classes_))

encoded_labels

# Model inputs

In [None]:
# Symbolic int32 inputs of width max_length; the layer names match the keys
# of the feature dicts passed to fit/evaluate/predict.
input_ids = Input(shape=(max_length,),name='input_ids',dtype=tf.int32)
attention_mask = Input(shape=(max_length,),name='attention_mask',dtype=tf.int32)

# Extract the DistilBERT classifier outputs (logits)

In [None]:
# `distilbert` was loaded with TFAutoModelForSequenceClassification, so element 0
# of its output is the classification-head logits, not a hidden-state tensor.
embeddings = distilbert(input_ids,attention_mask=attention_mask)[0] # 0 -> logits (first output field when no labels are passed)

# Classification layers

In [None]:
# Three Dense -> BatchNorm -> Dropout stages of shrinking width and dropout
# rate, followed by a 4-unit softmax output layer.
output = Flatten()(embeddings)
for units, drop_rate in ((128, 0.3), (64, 0.2), (32, 0.1)):
    output = Dense(units=units, activation='relu')(output)
    output = BatchNormalization()(output)
    output = Dropout(drop_rate)(output)
output = Dense(units=4, activation='softmax')(output)

# Define and compile model

In [None]:
# Assemble the functional model from the two token inputs to the softmax output.
model = Model(inputs=[input_ids,attention_mask],outputs=output)
# Explicitly mark the layer at index 2 as trainable.
# NOTE(review): presumably index 2 is the DistilBERT layer (after the two
# Input layers) -- confirm against model.summary().
model.layers[2].trainable = True
model.summary()

# Visualize model architecture

In [None]:
# Write the architecture diagram, including tensor shapes, to model.png.
plot_model(model,to_file='model.png',show_shapes=True,dpi=100)

# Optimizer and class weights

In [None]:
# Per-class loss weights keyed by encoded label id; handed to fit() later.
class_weights = dict(enumerate((0.3, 0.25, 0.15, 0.3)))

# AdamW with gradient-norm clipping.
adam = AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
    epsilon=2e-7,
    clipnorm=1.0,
)

# Sparse loss/metric: targets are integer class ids, not one-hot vectors.
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=[SparseCategoricalAccuracy()],
)

# Learning rate decay function

In [None]:
def time_based_decay(epoch, lr, decay_rate=1e-5, decay_epoch=10):
    """Return the learning rate for ``epoch`` under time-based decay.

    The incoming rate is scaled by ``1 / (1 + decay_rate * epoch / decay_epoch)``.
    The signature stays compatible with a two-argument ``schedule(epoch, lr)``
    callback because the extra parameters have defaults.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate currently in effect.
        decay_rate: Strength of the decay (default 1e-5).
        decay_epoch: Number of epochs over which ``decay_rate`` is spread
            (default 10). Must be non-zero.

    Returns:
        The decayed learning rate as a float.

    Raises:
        ZeroDivisionError: If ``decay_epoch`` is 0.
    """
    scale = 1 / (1 + decay_rate * epoch / decay_epoch)
    return lr * scale

# Callbacks

In [None]:
# Early stopping and checkpointing both track validation accuracy (higher is better).
monitored_metric = 'val_sparse_categorical_accuracy'

# Stop after 5 epochs without improvement and roll back to the best weights.
es = EarlyStopping(
    monitor=monitored_metric,
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
# Recompute the learning rate each epoch via time_based_decay.
ls = LearningRateScheduler(time_based_decay, verbose=1)
# Save the model only when the monitored metric improves.
mc = ModelCheckpoint(
    filepath='cyberbullying_classifier.keras',
    monitor=monitored_metric,
    save_best_only=True,
    mode='max',
    verbose=1,
)

# Model Training

In [None]:
# Train for up to 15 epochs; the EarlyStopping callback may end the run sooner.
# `r.history` keeps the per-epoch metrics used by the plots below.
# NOTE(review): the test split doubles as validation data here, so early
# stopping and checkpoint selection see the same rows that are later used for
# the final evaluation -- consider a separate validation split.
r = model.fit(x={'input_ids': X_train['input_ids'], 'attention_mask': X_train['attention_mask']},
              y=train_df.label,
              batch_size=256,
              epochs=15,
              class_weight=class_weights,
              validation_data=({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']},test_df.label),
              callbacks=[es,ls,mc]
             )

# Plot training loss and accuracy

In [None]:
# Training loss (red) against held-out loss (blue), one point per epoch.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(r.history['loss'], 'r', label='Train Loss')
ax.plot(r.history['val_loss'], 'b', label='Test Loss')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Graph')
ax.legend();

In [None]:
# Training accuracy (red) against held-out accuracy (blue), one point per epoch.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(r.history['sparse_categorical_accuracy'], 'r', label='Train Accuracy')
ax.plot(r.history['val_sparse_categorical_accuracy'], 'b', label='Test Accuracy')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy Graph')
ax.legend();

# Evaluate model performance

In [None]:
# evaluate() returns the compiled loss followed by the compiled metric
# (sparse categorical accuracy), both computed on the test split.
loss, acc = model.evaluate({'input_ids': X_test['input_ids'],'attention_mask': X_test['attention_mask']},test_df.label)
print("Sparse Categorical Crossentropy Loss:", round(loss,2))
# Accuracy is reported as a percentage.
print("Sparse Categorical Accuracy:", round(acc*100,2))

In [None]:
# Class probabilities for the test split, reduced to the most likely class id.
test_probabilities = model.predict({'input_ids': X_test['input_ids'],'attention_mask': X_test['attention_mask']})
test_predictions = np.argmax(test_probabilities, axis=1)

# Pin the matrix to the full set of encoded ids so its shape always matches
# the class-name tick labels, even if a class is absent from both the true
# and the predicted labels.
class_names = list(encoder.classes_)
class_ids = np.arange(len(class_names))
cm = confusion_matrix(test_df.label, test_predictions, labels=class_ids)

plt.figure(figsize=(10, 6))
# Show the original class names on both axes instead of bare integer codes.
heatmap = sns.heatmap(cm, annot=True, fmt='d', cmap='viridis', xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

In [None]:
# Per-class precision/recall/F1 on the test split; rows are keyed by the
# encoded integer label ids.
print("Classification Report:")
print(classification_report(test_df.label,test_predictions))

# Save essential files

In [None]:
def _pickle_to(path, obj, message):
    """Pickle ``obj`` to ``path`` and print ``message`` once the file is closed."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)
    print(message)


# Cleaned dataset as CSV (without the index column).
preprocessed_df.to_csv('../Dataset/PreprocessedDataset.csv', index=False)
print("Preprocessed dataframe saved as CSV")

# Tokenized splits and the fitted label encoder.
_pickle_to('../Artifacts/X_train.pkl', X_train, "X_train saved as pickle")
_pickle_to('../Artifacts/X_test.pkl', X_test, "X_test saved as pickle")
_pickle_to('../Artifacts/LabelEncoder.pkl', encoder, "Label encoder saved using pickle")

# Trained model in the native Keras format.
model.save('../Artifacts/CyberbullyingClassifier.keras')
print("Model saved as .keras")

# Per-epoch metrics recorded by fit().
_pickle_to('../Artifacts/TrainingHistory.pkl', r.history, "Training history saved as pickle")

# Load essential files

In [None]:
def _unpickle_from(path, message):
    """Return the object pickled at ``path`` and print ``message`` afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load artifact files
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    print(message)
    return loaded


# Cleaned dataset.
preprocessed_df = pd.read_csv('../Dataset/PreprocessedDataset.csv')
print("Preprocessed dataframe loaded from CSV")

# Tokenized splits, label encoder and training history.
X_train = _unpickle_from('../Artifacts/X_train.pkl', "X_train loaded from pickle")
X_test = _unpickle_from('../Artifacts/X_test.pkl', "X_test loaded from pickle")
encoder = _unpickle_from('../Artifacts/LabelEncoder.pkl', "Label encoder loaded using pickle")
training_history = _unpickle_from('../Artifacts/TrainingHistory.pkl', "Training history loaded from pickle")

In [None]:
# Imports needed only for restoring the saved model.
from tensorflow.keras.models import load_model
from transformers import TFDistilBertForSequenceClassification

# Map the transformer layer's class name to its class so that load_model can
# resolve it when rebuilding the saved model.
custom_objects = {
    'TFDistilBertForSequenceClassification': TFDistilBertForSequenceClassification
}

# Rebinds `model` to the instance restored from disk.
model = load_model('../Artifacts/CyberbullyingClassifier.keras', custom_objects=custom_objects)

model.summary()