In [None]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the labelled dataset from disk
df = pd.read_csv(r'path\to\cyberbullying_data.csv')

# Features come from the 'text' column, targets from the 'is_cyberbullying' column
X, y = df['text'].values, df['is_cyberbullying'].values

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Tokenize the text
max_words = 10000  # You can adjust this
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
# Fit on the training split only, so the word index is built without test data
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) at the end so every row has exactly max_length entries
max_length = 100  # You can adjust this based on your data
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

# Convert labels to numpy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# tokenizer.word_index lists every word seen while fitting, but because num_words
# is set, texts_to_sequences only emits indices below max_words (rarer words are
# mapped to the OOV index). Cap the size so the embedding table does not carry
# rows that can never be looked up.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
print(f"Vocabulary Size: {vocab_size}")

In [None]:
# Define the model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D

def create_model(vocab_size, embed_size, max_length):
    """Build and compile a bidirectional-LSTM binary classifier with attention.

    Args:
        vocab_size: Number of rows in the embedding table (largest token index + 1).
        embed_size: Embedding dimension; also used as the LSTM units per direction.
        max_length: Number of token ids in each input sequence.

    Returns:
        A compiled Keras ``Model`` taking ``(batch, max_length)`` token ids and
        producing a ``(batch, 1)`` sigmoid output. It is compiled with the Adam
        optimizer, binary cross-entropy loss and an accuracy metric.
    """
    inputs = Input(shape=(max_length,))
    # The sequence length is already fixed by Input; Embedding's `input_length`
    # argument is deprecated in Keras 3, so it is not passed.
    x = Embedding(vocab_size, embed_size)(inputs)
    x = Dropout(0.25)(x)
    x = Bidirectional(LSTM(embed_size, return_sequences=True))(x)

    # Simple attention mechanism: score every time step, then normalise the
    # scores across the time axis so each position gets its own weight.
    # (Pooling the scores to a single value before the softmax would discard
    # which position each score belongs to.)
    scores = Dense(1, activation='tanh')(x)               # (batch, max_length, 1)
    attention = tf.keras.layers.Softmax(axis=1)(scores)   # weights sum to 1 over time

    # The trailing size-1 axis broadcasts each step's weight over its features
    x = x * attention
    x = GlobalMaxPooling1D()(x)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Instantiate the compiled network and print its layer summary
embed_size = 100  # You can adjust this
model = create_model(
    vocab_size=vocab_size,
    embed_size=embed_size,
    max_length=max_length,
)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation loss has not improved for 3 epochs, keeping the best weights
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

# Fit on the padded training data, with 20% of it set aside for validation
history = model.fit(
    x=X_train_padded,
    y=y_train,
    validation_split=0.2,
    batch_size=32,  # You can adjust this
    epochs=10,  # You can adjust this
    callbacks=[early_stopping],
)

# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test)
print(f"Test accuracy: {accuracy:.4f}")

In [None]:
from sklearn.metrics import classification_report

# Sigmoid outputs above 0.5 are mapped to label 1, everything else to label 0
predictions = model.predict(X_test_padded)
is_positive = predictions > 0.5
predicted_labels = is_positive.astype(int).flatten()

# Summarise the test-split predictions against the true labels
report = classification_report(y_test, predicted_labels)
print(report)

In [None]:
# Classify a single piece of new text with the freshly trained model
def predict_cyberbullying(text):
    """Label one text using the notebook-level ``tokenizer`` and ``model``.

    The text is converted to token ids, padded/truncated at the end to the
    notebook-level ``max_length``, and passed to ``model.predict``.

    Returns:
        A ``(label, prediction)`` tuple: ``label`` is "Cyberbullying" when the
        model output exceeds 0.5 and "Not Cyberbullying" otherwise;
        ``prediction`` is the raw model output for this text.
    """
    token_ids = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post', truncating='post')
    prediction = model.predict(model_input)[0][0]
    if prediction > 0.5:
        label = "Cyberbullying"
    else:
        label = "Not Cyberbullying"
    return label, prediction

# Try the classifier on a sample sentence; the value printed as "Confidence" is the
# raw model output returned by predict_cyberbullying
text = "Hahaha what a loser!"
label, confidence = predict_cyberbullying(text)
print(
    f"Text: {text}",
    f"Prediction: {label}",
    f"Confidence: {confidence:.4f}",
    sep="\n",
)

In [None]:
# Plot training vs. validation curves side by side: loss on the left, accuracy on the right
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

# (axes, training key, validation key, training label, validation label, title, y-axis label)
panels = (
    (loss_ax, 'loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    (accuracy_ax, 'accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
)
for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Write the trained network to a .keras file in the working directory
model_path = 'trained_model.keras'
model.save(model_path)
print('Model saved!')

In [None]:
# Pickle the fitted tokenizer so the same word index can be reloaded later
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.Pickler(tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL).dump(tokenizer)

In [None]:
# Load the model
# The path is a raw string: in a normal literal the "\t" in "\to" and
# "\trained_model" would be interpreted as tab characters and corrupt the path.
new_model = tf.keras.models.load_model(r'path\to\trained_model.keras')

In [None]:
# Load tokenizer
# The path is a raw string: in a normal literal the "\t" in "\to" and
# "\tokenizer" would be interpreted as tab characters and corrupt the path.
# NOTE: pickle.load can execute arbitrary code, so only unpickle a tokenizer
# file that you produced yourself or otherwise trust.
with open(r'path\to\tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Function to predict on new text
def predict_cyberbullying(text, max_length=None):
    """Classify one text with the reloaded ``tokenizer`` and ``new_model``.

    This definition replaces the earlier one-argument ``predict_cyberbullying``
    in the notebook namespace, so ``max_length`` is optional to keep calls of
    the form ``predict_cyberbullying(text)`` working after this cell has run.

    Args:
        text: A single string to classify.
        max_length: Length the token sequence is padded/truncated to. When
            omitted, the sequence length of the loaded model's input
            (``new_model.input_shape[1]``) is used.

    Returns:
        A ``(label, prediction)`` tuple: ``label`` is "Cyberbullying" when the
        model output exceeds 0.5 and "Not Cyberbullying" otherwise;
        ``prediction`` is the raw model output for this text.
    """
    if max_length is None:
        # The padded width has to match what the model was built with
        max_length = new_model.input_shape[1]
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    prediction = new_model.predict(padded)[0][0]
    return "Cyberbullying" if prediction > 0.5 else "Not Cyberbullying", prediction

In [None]:
# Try the reloaded model on a sample sentence; the value printed as "Confidence" is
# the raw model output returned by predict_cyberbullying
text = "Go and cry in a corner"
label, confidence = predict_cyberbullying(text, max_length=100)
print(
    f"Text: {text}",
    f"Prediction: {label}",
    f"Confidence: {confidence:.4f}",
    sep="\n",
)