In [3]:
import pandas as pd
import numpy as np
import re
# Load the dataset
url = "https://raw.githubusercontent.com/AsukaaNao/datasets/refs/heads/main/cyberbullying_tweets.csv"
data = pd.read_csv(url)

categories=data['cyberbullying_type'].unique()
print(categories)

['not_cyberbullying' 'gender' 'religion' 'other_cyberbullying' 'age'
 'ethnicity']


In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE

# Load the dataset
url = "https://raw.githubusercontent.com/AsukaaNao/datasets/refs/heads/main/cyberbullying_tweets.csv"
data = pd.read_csv(url)
data['tweet_text'] = data['tweet_text'].astype(str)

# Text preprocessing
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove non-alphabetical characters
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Tokenize
    text = [word for word in text if word not in stop_words]  # Remove stopwords
    return " ".join(text)

data['tweet_text'] = data['tweet_text'].apply(preprocess_text)

# Extract content and label
contents = data['tweet_text']
labels = data['cyberbullying_type']

# Define categories
categories = ['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age', 'ethnicity']

# Encode labels
labels = pd.Categorical(labels, categories=categories).codes

# Convert to one-hot encoding
labels = to_categorical(labels)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(contents, labels, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Check class distribution
print(f"Class distribution : {np.bincount(y_train.argmax(axis=1))}")

# # Oversample minority class
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train)

# # Check new class distribution
# print(f"Class distribution after SMOTE: {np.bincount(y_train_resampled.argmax(axis=1))}")

# Class weights for imbalanced dataset
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(categories)),
    y=y_train.argmax(axis=1)
)
class_weights = dict(enumerate(class_weights))
print(f"Class Weights: {class_weights}")

# Build the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(categories), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

# Early stopping (uncomment if you want to stop early based on validation loss)
# early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train_padded, y_train,
    # X_train_resampled, y_train_resampled,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
    class_weight=class_weights,
    # callbacks=[early_stopping],
    verbose=2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test, verbose=2)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Prediction function
def predict_cyberbullying(content):
    content = preprocess_text(content)
    seq = tokenizer.texts_to_sequences([content])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    return categories[np.argmax(prediction)]

# Test the prediction function
sample_text = "You are so dumb and useless!"
print(f"Prediction for sample text: {predict_cyberbullying(sample_text)}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Class distribution : [6321 6442 6432 6211 6389 6358]
Class Weights: {0: 1.005985339872383, 1: 0.987089930663355, 2: 0.9886245854063018, 3: 1.0238018569205174, 4: 0.9952783429853393, 5: 1.0001310684701687}




Epoch 1/20
1193/1193 - 73s - 61ms/step - accuracy: 0.7508 - loss: 0.5957 - val_accuracy: 0.8218 - val_loss: 0.4452
Epoch 2/20
1193/1193 - 69s - 58ms/step - accuracy: 0.8553 - loss: 0.3728 - val_accuracy: 0.8373 - val_loss: 0.4226
Epoch 3/20
1193/1193 - 71s - 60ms/step - accuracy: 0.8823 - loss: 0.3055 - val_accuracy: 0.8337 - val_loss: 0.4428
Epoch 4/20
1193/1193 - 70s - 59ms/step - accuracy: 0.8974 - loss: 0.2637 - val_accuracy: 0.8270 - val_loss: 0.5145
Epoch 5/20
1193/1193 - 70s - 59ms/step - accuracy: 0.9079 - loss: 0.2317 - val_accuracy: 0.8265 - val_loss: 0.5570
Epoch 6/20
1193/1193 - 72s - 61ms/step - accuracy: 0.9149 - loss: 0.2144 - val_accuracy: 0.8237 - val_loss: 0.5784
Epoch 7/20
1193/1193 - 75s - 63ms/step - accuracy: 0.9210 - loss: 0.1917 - val_accuracy: 0.8216 - val_loss: 0.6197
Epoch 8/20
1193/1193 - 72s - 61ms/step - accuracy: 0.9261 - loss: 0.1737 - val_accuracy: 0.8180 - val_loss: 0.7220
Epoch 9/20
1193/1193 - 68s - 57ms/step - accuracy: 0.9301 - loss: 0.1645 - val_a

In [5]:
# Prediction function
def predict_cyberbullying(content):
    content = preprocess_text(content)
    seq = tokenizer.texts_to_sequences([content])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    predicted_label_index = np.argmax(prediction)
    return categories[predicted_label_index]

In [19]:
# Test the prediction function
sample_text = "idiot sandwich"
print(f"Prediction for sample text: {predict_cyberbullying(sample_text)}")

sample_text = "moslem terrorist stfu"
print(f"Prediction for sample text: {predict_cyberbullying(sample_text)}")

sample_text = "bruh fucking retartd"
print(f"Prediction for sample text: {predict_cyberbullying(sample_text)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Prediction for sample text: other_cyberbullying
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Prediction for sample text: religion
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction for sample text: other_cyberbullying


# data as x_test (blm ganti)

In [7]:
# import pandas as pd
# import numpy as np
# import re
# import nltk
# from sklearn.preprocessing import LabelEncoder
# from sklearn.utils.class_weight import compute_class_weight
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
# from tensorflow.keras.callbacks import EarlyStopping

# # Load the dataset
# url = "https://raw.githubusercontent.com/AsukaaNao/datasets/refs/heads/main/trolling_data.csv"
# data = pd.read_csv(url)
# data['content'] = data['content'].astype(str)

# # Text preprocessing
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop_words = set(stopwords.words('english'))

# def preprocess_text(text):
#     text = re.sub(r"http\S+", "", text)  # Remove URLs
#     text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove non-alphabetical characters
#     text = text.lower()  # Convert to lowercase
#     text = text.split()  # Tokenize
#     text = [word for word in text if word not in stop_words]  # Remove stopwords
#     return " ".join(text)

# data['content'] = data['content'].apply(preprocess_text)

# # Extract content and label
# contents = data['content']
# labels = data['label']

# # Encode labels
# label_encoder = LabelEncoder()
# labels = label_encoder.fit_transform(labels)

# # Tokenization
# tokenizer = Tokenizer(num_words=10000)
# tokenizer.fit_on_texts(contents)

# contents_seq = tokenizer.texts_to_sequences(contents)

# # Padding sequences
# max_length = 100
# contents_padded = pad_sequences(contents_seq, maxlen=max_length, padding='post')

# # Class weights for imbalanced dataset
# class_weights = compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(labels),
#     y=labels
# )
# class_weights = dict(enumerate(class_weights))
# print(f"Class Weights: {class_weights}")

# # Build the model
# model = Sequential([
#     Embedding(input_dim=10000, output_dim=128, input_length=max_length),
#     Bidirectional(LSTM(128, return_sequences=False)),
#     Dropout(0.5),
#     Dense(64, activation='relu'),
#     Dropout(0.5),
#     Dense(1, activation='sigmoid')
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Print model summary
# model.summary()

# # Early stopping
# # early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# # Train the model using the entire dataset as training data
# history = model.fit(
#     contents_padded, labels,
#     epochs=20,
#     batch_size=32,
#     class_weight=class_weights,
#     # callbacks=[early_stopping],
#     verbose=2
# )

# # Evaluate the model using the same data
# test_loss, test_accuracy = model.evaluate(contents_padded, labels, verbose=2)
# print(f"Test Loss: {test_loss}")
# print(f"Test Accuracy: {test_accuracy}")

# # Predict on the entire dataset
# predictions = model.predict(contents_padded)
# predicted_labels = ["Trolling" if pred > 0.5 else "Not Trolling" for pred in predictions.flatten()]

# # Save the predictions as a DataFrame
# predictions_df = pd.DataFrame({
#     "Content": data['content'],
#     "Actual Label": ["Trolling" if lbl == 1 else "Not Trolling" for lbl in labels],
#     "Predicted Label": predicted_labels
# })

# # Save the DataFrame to a CSV file
# predictions_df.to_csv("trolling_predictions.csv", index=False)
# print("Predictions have been saved to 'trolling_predictions.csv'.")


In [None]:
# # Export the model
# model.save("cyberv1_detection_model.h5")
# print("Model saved successfully!")

In [9]:
# import pickle

# # Save the tokenizer
# with open("tokenizerv1.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)
# print("Tokenizer saved successfully!")

In [10]:
# print(data['label'].value_counts())

# import model

In [11]:
# import pickle
# # Load the tokenizer
# with open("tokenizerv1.pkl", "rb") as f:
#     tokenizer = pickle.load(f)
# print("Tokenizer loaded successfully!")


In [12]:
# from tensorflow.keras.models import load_model
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Load the model
# loaded_model = load_model("cyberv1_detection_model.h5")
# print("Model loaded successfully!")

# # Test prediction using the loaded model
# sample_text = "I think this is a bad idea."
# prediction = loaded_model.predict(pad_sequences(tokenizer.texts_to_sequences([sample_text]), maxlen=100, padding='post'))
# print("Prediction:", "Cyberbullying" if prediction[0][0] > 0.5 else "Not Cyberbullying")


# Predict

In [13]:
# # Test the prediction function
# sample_text = "you are handsome"
# print(f"Prediction for sample text: {predict_trolling(sample_text)}")

# sample_text = "fuck you, i hate you so much"
# print(f"Prediction for sample text: {predict_trolling(sample_text)}")


# sample_text = "you are not handsome"
# print(f"Prediction for sample text: {predict_trolling(sample_text)}")

In [14]:
# # Sample tweets with and without cursing
# tweets = [
#     "you are handsome",
#     "you are not handsome",
#     "You're such a loser, I can't believe anyone likes you.",
#     "Wow, that was an amazing game last night!",
#     "Why the hell would you even say that? You're so stupid.",
#     "This is the best day of my life. Thank you all for making it special!",
#     "Shut the fuck up already!",
#     "I just got promoted at work! Feeling so proud.",
#     "What an awful, pathetic excuse of a human you are.",
#     "The weather today is absolutely beautiful. Loving it!",
#     "You're a complete idiot, no one cares about you.",
#     "Celebrating my anniversary with my wonderful partner tonight. Love you!",
#     "What the fuck were you thinking, moron?",
#     "Feeling grateful for my family and friends.",
#     "I hate you so much, you absolute piece of garbage.",
#     "Just finished a marathon! So tired but feeling great.",
#     "This place is a fucking disaster. Who runs this dump?",
#     "Went hiking today and saw the most beautiful sunset.",
#     "You're so annoying, I wish you'd disappear forever.",
#     "Baking cookies for my friends. Can't wait to share them!",
#     "Can't believe how stupid you are, get a life.",
#     "Going to the movies tonight with friends. So excited!",
#     "You're so fucking dumb, it hurts my brain to listen to you.",
#     "Had a great meeting today at work. Feeling inspired.",
#     "Why don't you just shut your damn mouth already?",
#     "Feeling so blessed for this opportunity. Thank you, everyone!",
#     "You really fucked it up this time, didn't you?",
#     "The flowers in my garden are blooming beautifully this season.",
#     "You're a worthless piece of shit, just go away.",
#     "Enjoying a quiet evening reading my favorite book.",
#     "What the hell is wrong with you, seriously?",
#     "Had an amazing workout session today! Feeling great.",
#     "Fuck you and everything you stand for.",
#     "Loving this new recipe I tried. It's so delicious!",
#     "You're a joke, and nobody takes you seriously.",
#     "Spent the day volunteering at the animal shelter. So rewarding!",
#     "Why the fuck would anyone think you're competent?",
#     "Catching up on my favorite TV series tonight.",
#     "You're just a sad excuse for a person.",
#     "Had a fantastic time at the concert last night!",
#     "You're fucking pathetic, just quit already.",
#     "Grateful for all the good things happening in my life right now.",
#     "You're full of shit, and everyone knows it.",
#     "Taking my dog for a walk in the park. Such a peaceful evening.",
#     "What the fuck is your problem? Get a clue.",
#     "So happy to announce that I'll be starting my dream job next month!",
#     "You're a fucking embarrassment to everyone around you.",
#     "Planning a surprise birthday party for my best friend. Can't wait!",
#     "Nobody gives a shit about you or your stupid opinions.",
#     "Had the best pizza ever today. Life is good.",
#     "Fuck off and leave me alone, asshole.",
#     "Can't wait to see my family this weekend!",
#     "You're the worst kind of person, and I can't stand you.",
#     "Learning a new skill today. Feeling productive and happy.",
#     "You're a fucking waste of space.",
#     "Visited the museum today and saw some amazing artwork.",
#     "You're so full of yourself, it's fucking hilarious.",
#     "Spent the afternoon gardening. It was so relaxing.",
#     "You're a dumbass, and everyone knows it.",
#     "Excited to start a new project at work tomorrow.",
#     "Why don't you just fuck off already?",
#     "Had a wonderful picnic by the lake with friends.",
#     "You're fucking useless and always will be.",
#     "Just finished painting my room. It looks amazing!",
#     "You're a fucking liar, and nobody trusts you.",
#     "Feeling so accomplished after completing that big task today.",
#     "Why the fuck do you even try? You're terrible.",
#     "So proud of my little brother for graduating today!",
#     "You're a selfish piece of shit, and everyone hates you.",
#     "Made a new friend today at the park. Feeling great.",
#     "You're a fucking idiot, and it's embarrassing to know you.",
#     "Excited to attend the conference next week. Lots to learn!",
#     "You're a complete asshole, and everyone knows it.",
#     "Took some beautiful photos of nature during my hike today.",
#     "Why are you so fucking incompetent? It's infuriating.",
#     "Enjoyed a lovely dinner with my family tonight.",
#     "You're the biggest fucking moron I've ever met.",
#     "Started a new workout plan today. Feeling motivated!",
#     "You're so full of shit, it's ridiculous.",
#     "Attended a fantastic workshop on personal growth today.",
#     "Why don't you just shut the fuck up already?",
#     "Feeling happy and at peace with my life right now.",
#     "You're a piece of shit, and no one likes you.",
#     "Had a great time exploring the city with my best friend.",
#     "What the fuck is wrong with you? You're insufferable.",
#     "Baked some homemade bread today. It smells amazing.",
#     "You're such a fucking hypocrite, it's unreal.",
#     "Feeling relaxed after a long day at the spa.",
#     "You're a complete piece of shit, and nobody wants you around.",
#     "Spent the evening watching the stars. So beautiful and calming.",
#     "You're the most annoying fucking person I've ever dealt with.",
#     "Planning my dream vacation. Can't wait to travel again!"
# ]


In [15]:
# import pandas as pd
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Convert tweets to a DataFrame for better handling
# tweets_df = pd.DataFrame(tweets, columns=["content"])

# # Tokenize and pad sequences
# sequences = tokenizer.texts_to_sequences(tweets_df['content'])
# padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')

# # Make predictions
# predictions = loaded_model.predict(padded_sequences)

# # Add predictions to the DataFrame
# tweets_df['prediction'] = ["Cyberbullying" if pred > 0.5 else "Not Cyberbullying" for pred in predictions]

# # Print results
# print(tweets_df)


In [16]:
# # Save the DataFrame to a CSV file
# tweets_df.to_csv("tweet_predictions.csv", index=False)

# print("Tweets and predictions have been saved to 'tweet_predictions.csv'.")
