In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

# Column that holds the raw text, and the columns that hold the per-category
# labels (assumed to be numeric indicator columns, one per category).
text_column = 'Text'
label_columns = ["racial_hate", "Religious_caste_hate", "sexual_orientation_hate", "gender_based_hate",
                 "disability_hate", "political_hate", "social_caste_hate", "age_based_hate",
                 "nationality_hate", "appearance_based_hate", "none"]

# Load the dataset from an Excel file
dataset_raw = pd.read_excel('your_dataset.xlsx')

# Fail early with a readable message if the spreadsheet lacks an expected
# column, instead of an opaque indexing error further down.
required_columns = [text_column] + label_columns
missing_columns = [column for column in required_columns if column not in dataset_raw.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Rows with an empty text cell or an incomplete label row cannot be trained on:
# the tokenizer expects strings and a NaN label would turn the loss into NaN.
dataset = dataset_raw.dropna(subset=required_columns)

# Split the dataset into training and testing sets (80/20, fixed seed).
# The text is cast to str so that cells Excel parsed as numbers are still
# valid tokenizer input.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset[text_column].astype(str), dataset[label_columns], test_size=0.2, random_state=42)

# Fit the vocabulary on the training texts only (tokenizer created with
# num_words=10000), then turn both splits into sequences of word indices.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Pad every sequence to the length of the longest sequence found in either split.
longest_train = max(map(len, train_sequences))
longest_test = max(map(len, test_sequences))
max_sequence_length = max(longest_train, longest_test)
train_data, test_data = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (train_sequences, test_sequences)
)

# Model: embedding -> LSTM -> one sigmoid unit per category.
# The first Embedding argument (10000) mirrors the num_words value the
# tokenizer was created with.
model = Sequential()
model.add(Embedding(10000, 100, input_length=max_sequence_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Sigmoid instead of softmax: softmax scores sum to 1, so at most one category
# could ever exceed a 0.5 decision threshold. Independent sigmoid units let
# several categories be active for the same text (multi-label output).
model.add(Dense(len(label_columns), activation='sigmoid'))

# binary_crossentropy treats every category as its own yes/no decision, which
# is the loss that matches sigmoid outputs in multi-label classification.
# NOTE(review): with this loss Keras is expected to report 'accuracy' per
# label rather than per sample - confirm before comparing with older runs.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model. The test split is also passed as validation data, so the
# per-epoch validation numbers are computed on the same rows that are later
# used for the final evaluation.
model.fit(train_data, train_labels, validation_data=(test_data, test_labels), epochs=10, batch_size=32)

# Persist the trained network so it can be reloaded without retraining.
model.save('/content/hate_speech_classification_model_updated.h5')

# Persist the fitted tokenizer as well.
# NOTE(review): to_json() is documented to return a JSON string already, and
# that string is JSON-encoded once more here, so the file holds an encoded
# string rather than a bare object. Loaders presumably json.load() it first -
# confirm against the loading code before changing this.
tokenizer_json = tokenizer.to_json()
with open('/content/tokenizer_updated.json', 'w', encoding='utf-8') as tokenizer_file:
    json.dump(tokenizer_json, tokenizer_file, ensure_ascii=False)

# Note on decoding predictions: this is treated as multi-label classification,
# so there is no single "winning" class to look up. One option is to compare
# each output probability against a threshold to decide which labels apply.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [None]:
# Score the trained model on the held-out split. evaluate() hands back the loss
# followed by the single metric the model was compiled with.
test_metrics = model.evaluate(test_data, test_labels)
test_loss, test_accuracy = test_metrics
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Making predictions with the model
def make_predictions(text, threshold=0.5):
    """Return the category names the model assigns to a single piece of text.

    Args:
        text: Raw text to classify.
        threshold: Score a category must strictly exceed to be reported.
            Defaults to 0.5, the cut-off that used to be hard-coded here.

    Returns:
        A list of names from ``label_columns``, in column order, whose
        predicted score is greater than ``threshold``. The list is empty when
        no score exceeds the threshold.
    """
    # Encode with the fitted tokenizer and pad to the length used for training.
    encoded_text = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=max_sequence_length)

    # predict() yields one row of scores per input; exactly one text was passed.
    scores = model.predict(encoded_text)[0]

    # Keep the names of the categories whose score clears the threshold.
    return [name for name, score in zip(label_columns, scores) if score > threshold]

# Example usage: classify one hand-written sample and print the category names
# returned by make_predictions (the list may be empty if no score passes the
# threshold applied inside that function).
new_text = "Stats don`t represent the problem. Race baiting and attitude is. Who`s doing the crimes ? Ohh I bet Trayvon is still the little innocent boy too. Don`t speak with a lisp, a sure sign of a left wing democrat ! "
predicted_labels = make_predictions(new_text)
print(f"Predicted Labels for '{new_text}': {predicted_labels}")


Test Loss: 0.8999078869819641, Test Accuracy: 0.8299999833106995
Predicted Labels for 'Stats don`t represent the problem. Race baiting and attitude is. Who`s doing the crimes ? Ohh I bet Trayvon is still the little innocent boy too. Don`t speak with a lisp, a sure sign of a left wing democrat ! ': ['racial_hate']
