In [5]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load datasets
twitter_data = pd.read_csv('Hate_speech_folder/twitter_parsed_dataset.csv')
aggressive_data = pd.read_csv('Hate_speech_folder/aggression_parsed_dataset.csv')
kaggle_data = pd.read_csv('Hate_speech_folder/kaggle_parsed_dataset.csv')
youtube_parsed =pd.read_csv('Hate_speech_folder/youtube_parsed_dataset.csv')
attack_data = pd.read_csv("Hate_speech_folder/attack_parsed_dataset.csv")
toxic_data = pd.read_csv("Hate_speech_folder/toxicity_parsed_dataset.csv")

# Combine datasets (assuming they all have 'Text' and 'oh_label' columns)
combined_data = pd.concat([twitter_data[['Text', 'oh_label']],
                            aggressive_data[['Text', 'oh_label']],
                            youtube_parsed[['Text', 'oh_label']],
                            toxic_data[['Text', 'oh_label']],
                            attack_data[['Text', 'oh_label']],
                            kaggle_data[['Text', 'oh_label']]])

# Handle NaN values in 'oh_label'
combined_data['oh_label'].fillna(0, inplace=True)
combined_data['oh_label'] = combined_data['oh_label'].astype(int)

# Tokenize and pad text data
texts = combined_data['Text'].astype(str).values
labels = combined_data['oh_label'].values

max_words = 10000  # Adjust as needed
max_len = 100      # Adjust as needed

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split the combined data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=3, validation_split=0.2)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

# Function to predict if a text is offensive
def predict_offensive(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    return 'Offensive' if prediction[0][1] > 0.5 else 'Not Offensive'

# Example prediction
print(predict_offensive('Fuck you'))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['oh_label'].fillna(0, inplace=True)


Epoch 1/3




[1m8411/8411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 21ms/step - accuracy: 0.9088 - loss: 0.2444 - val_accuracy: 0.9310 - val_loss: 0.1946
Epoch 2/3
[1m8411/8411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 17ms/step - accuracy: 0.9347 - loss: 0.1707 - val_accuracy: 0.9391 - val_loss: 0.1588
Epoch 3/3
[1m8411/8411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 15ms/step - accuracy: 0.9408 - loss: 0.1553 - val_accuracy: 0.9398 - val_loss: 0.1599
[1m2629/2629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9423 - loss: 0.1551
Test Loss: 0.15797485411167145
Test Accuracy: 0.9412170648574829
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Offensive


In [6]:
model.save("Cyber_bullying_detection_model.keras")

In [7]:
import pickle

# Tokenize and pad text data
texts = combined_data['Text'].astype(str).values
labels = combined_data['oh_label'].values

max_words = 10000  # Adjust as needed
max_len = 100      # Adjust as needed

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Save the tokenizer to a file
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
