<a href="https://colab.research.google.com/github/meghan-a1/CyberBullyingDetection/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
import warnings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [11]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Loading the dataset
data = pd.read_csv('/content/drive/MyDrive/Dataset.csv')

# Extract tweets and labels
tweets = data['Comment'].values
labels = data['label'].values  # 1 for cyberbullying, 0 for not

In [13]:
# Initialize the tokenizer and fit it on the tweets
tweets=data['Comment'].astype(str).fillna('')
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweets)

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(tweets)

# Pad sequences to ensure uniform input size with pre-padding
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

# Assign X and y
X = padded_sequences
y = labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
# LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=X.shape[1]))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# Train the model on the training data
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 71ms/step - accuracy: 0.7547 - loss: 0.5209 - val_accuracy: 0.8476 - val_loss: 0.3431
Epoch 2/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 72ms/step - accuracy: 0.8881 - loss: 0.2696 - val_accuracy: 0.8458 - val_loss: 0.3460
Epoch 3/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 70ms/step - accuracy: 0.9126 - loss: 0.2154 - val_accuracy: 0.8404 - val_loss: 0.3676
Epoch 4/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 81ms/step - accuracy: 0.9257 - loss: 0.1822 - val_accuracy: 0.8378 - val_loss: 0.4102
Epoch 5/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 69ms/step - accuracy: 0.9371 - loss: 0.1546 - val_accuracy: 0.8284 - val_loss: 0.4384
Epoch 6/10
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 70ms/step - accuracy: 0.9462 - loss: 0.1365 - val_accuracy: 0.8312 - val_loss: 0.5215
Epoch 7/10
[1m4

In [18]:
# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.8240 - loss: 0.7650
Test Accuracy: 0.8224


In [17]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on the test set
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary predictions

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step
Confusion Matrix:
[[1085  443]
 [ 445 3027]]

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      1528
           1       0.87      0.87      0.87      3472

    accuracy                           0.82      5000
   macro avg       0.79      0.79      0.79      5000
weighted avg       0.82      0.82      0.82      5000

