In [1]:
######################################################################### Data fetching

import pandas as pd

# Assuming your CSV file is named 'data.csv' and is in the same directory as your script
file_name = 'dataset.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_name)

# Select only the "Comment" and "Hateful" columns
selected_df = df[['Comment', 'Hateful']]

# Display the selected DataFrame
print(selected_df)


                                                 Comment  Hateful
0      Damn I thought they had strict gun laws in Ger...        0
1      I dont care about what it stands for or anythi...        0
2                      It's not a group it's an idea lol        0
3                              So it's not just America!        0
4      The dog is a spectacular dancer considering he...        0
...                                                  ...      ...
27778  you's a muthaf***in lie &#8220;@LifeAsKing: @2...        1
27779  you've gone and broke the wrong heart baby, an...        0
27780  young buck wanna eat!!.. dat nigguh like I ain...        1
27781              youu got wild bitches tellin you lies        1
27782  ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...        0

[27783 rows x 2 columns]


In [2]:
##########################################################################################33Data Preprocessing

# Create X and Y directly from the DataFrame
X = selected_df['Comment'].values
y = selected_df['Hateful'].values
# Display X and y (optional)
# print("X (Comments):", X)
# print("y (Hateful Labels):", y)


In [3]:
# from keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize the Tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the comments
tokenizer.fit_on_texts(X)

# Convert the comments to sequences of tokens
X_tokenized = tokenizer.texts_to_sequences(X)

# Display the tokenized sequences (optional)
print("Tokenized sequences:")
print(X_tokenized)


Tokenized sequences:
[[153, 2, 324, 43, 135, 14208, 1131, 1984, 11, 3433], [2, 160, 312, 66, 63, 25, 2708, 24, 73, 361, 161, 9329, 7, 2, 16, 3, 14209], [74, 48, 1, 1222, 74, 100, 1044, 62], [41, 74, 48, 36, 643], [3, 486, 12, 1, 14210, 14211, 3434, 64, 154, 276, 475, 1045], [30, 653, 160, 489, 4931, 6, 1182, 30, 653, 489, 4931, 51, 438, 1182], [72, 195, 590, 77, 34, 108, 7, 2897], [4293, 7, 102, 30, 74, 9330, 86, 3, 818, 2096, 2898, 25, 53, 9330, 86, 3, 818], [78, 1, 2899, 2, 5788, 23], [531, 596, 17, 49, 5789, 7, 1646, 389, 34, 41, 410], [614, 3, 14212, 12, 36, 78, 1524, 78, 3, 1525], [23, 12, 3, 953, 72, 34, 88], [537, 3, 262, 930, 235, 506, 1, 262, 62], [43, 46, 3, 4294, 17, 643], [2, 71, 9, 309], [2900, 494, 2097], [99, 3, 309, 14213], [32, 378, 184, 43, 14214, 14215, 251], [131, 7120, 1046, 11, 537, 12, 1291, 363], [63, 1, 3435, 581, 1, 2901, 1327, 7, 91, 2517], [2, 1817, 451, 3, 819, 154, 1, 325, 7, 65, 28, 3800, 2518, 2902, 914], [85, 310, 1, 720, 635, 597, 339, 24, 361, 14216],

In [4]:
from sklearn.model_selection import train_test_split

# Prepare your data
X_train, X_test, y_train, y_test = train_test_split(X_tokenized, y, test_size=0.2, random_state=42)

# Display the shapes of the train and test sets (optional)
print("X_train shape:", len(X_train))
print("X_test shape:", len(X_test))
print("y_train shape:", len(y_train))
print("y_test shape:", len(y_test))



X_train shape: 22226
X_test shape: 5557
y_train shape: 22226
y_test shape: 5557


In [5]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Pad sequences to ensure uniform length
max_sequence_length = 50  # Adjust as needed
X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length)


In [6]:
##################################################################################### Modelling

# Define the neural network architecture
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_padded, y_train, batch_size=64, epochs=5, validation_data=(X_test_padded, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)


# Save the trained model
# model.save('trained_model.h5')

# Download the saved model to your local machine
#files.download('trained_model.h5')




Epoch 1/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 131ms/step - accuracy: 0.8415 - loss: 0.3561 - val_accuracy: 0.9529 - val_loss: 0.1330
Epoch 2/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 102ms/step - accuracy: 0.9702 - loss: 0.0871 - val_accuracy: 0.9493 - val_loss: 0.1433
Epoch 3/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 119ms/step - accuracy: 0.9892 - loss: 0.0366 - val_accuracy: 0.9354 - val_loss: 0.1740
Epoch 4/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 124ms/step - accuracy: 0.9944 - loss: 0.0197 - val_accuracy: 0.9300 - val_loss: 0.2563
Epoch 5/5
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 125ms/step - accuracy: 0.9963 - loss: 0.0123 - val_accuracy: 0.9313 - val_loss: 0.2786
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.9301 - loss: 0.2849


In [8]:
model.save_weights('model.weights.h5') 

In [9]:
model_json = model.to_json()
with open('model_config.json', 'w') as json_file:
    json_file.write(model_json)

In [10]:
import pickle

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [11]:

with open('training_history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

In [11]:
# ####################################################################################### Testing Input

# # Define a function to preprocess the input string
# def preprocess_input_string(input_string, tokenizer, max_sequence_length):
#     # Tokenize the input string
#     input_sequence = tokenizer.texts_to_sequences([input_string])
#     # Pad the input sequence
#     input_sequence_padded = pad_sequences(input_sequence, maxlen=max_sequence_length)
#     return input_sequence_padded

# # Define a function to get the result from the model
# def get_model_prediction(model, input_sequence):
#     # Predict the class label (0 or 1) for the input sequence
#     prediction = model.predict(input_sequence)
#     # Interpret the prediction (e.g., thresholding)
#     result = "Hate Speech" if prediction > 0.5 else "Non-Hate Speech"
#     return result


# # Example input string

# # input_string = "Alhamdulillah"
# input_string = "Niggas"

# # Preprocess the input string
# input_sequence = preprocess_input_string(input_string, tokenizer, max_sequence_length)

# # Get the result from the model
# result = get_model_prediction(model, input_sequence)

# print("Input:", input_string)
# print("Result:", result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Input: Niggas
Result: Hate Speech
