In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset. on_bad_lines='skip' (python engine) drops rows that cannot
# be parsed instead of aborting the whole load.
train_df = pd.read_csv('preprocessed_dataset1.csv', on_bad_lines='skip', engine='python')

# Discard rows with a missing comment or label: pandas reads an empty text
# cell as NaN (a float), which the text tokenizer cannot process, and a NaN
# label cannot be one-hot encoded.
train_df = train_df.dropna(subset=['comment_text', 'toxic'])
texts = train_df['comment_text'].astype(str)
labels = train_df['toxic']

# Split data into training and validation sets.
# stratify keeps the toxic / non-toxic ratio the same in both splits, so the
# validation metrics are not skewed by an unlucky draw of the minority class.
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Hyper-parameters shared by the preprocessing steps and the network.
# vocab_size is used both as the tokenizer's num_words and as the embedding's
# input dimension; keeping it in one place guarantees the two stay in sync.
vocab_size = 5000
# Every comment is padded / truncated to exactly this many tokens.
max_length = 100
# Binary task (toxic vs. non-toxic) modelled as a 2-way softmax.
num_classes = 2

# Create a tokenizer; it is fitted on the training split only so that no
# vocabulary statistics leak in from the validation data.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# Pad sequences to a fixed length (index 0 is the padding value)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length)

# One-hot encode labels. num_classes is passed explicitly so both arrays
# always have the width expected by the output layer, even if one split
# happens to contain a single class.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_val_cat = to_categorical(y_val, num_classes=num_classes)

# Build the model: embedding -> max over time -> dense classifier head.
# The Embedding `input_length` argument is deprecated in Keras 3 and is not
# needed here because GlobalMaxPooling1D collapses the time dimension.
model = Sequential([
    Embedding(vocab_size, 128),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network for three epochs, reporting validation metrics after each one
model.fit(
    X_train_pad,
    y_train_cat,
    validation_data=(X_val_pad, y_val_cat),
    batch_size=32,
    epochs=3,
)

# Score the validation split: turn per-class probabilities and one-hot
# targets back into class indices before handing them to scikit-learn
y_pred = model.predict(X_val_pad)
y_pred_class = y_pred.argmax(axis=1)
y_val_class = y_val_cat.argmax(axis=1)

validation_accuracy = accuracy_score(y_val_class, y_pred_class)
validation_report = classification_report(y_val_class, y_pred_class)
print('Accuracy:', validation_accuracy)
print('Classification Report:')
print(validation_report)

# Smoke test: push one hand-written comment through the same
# tokenize -> pad -> predict pipeline used for the dataset
sample_text = ['you  are so kind']
print('Sample Input:', sample_text)

sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=max_length)
prediction = model.predict(sample_pad)

# Index of the highest-probability class for each sample
predicted_class = prediction.argmax(axis=1)
print('Output:', predicted_class)



Epoch 1/3
2736/2736 ━━━━━━━━━━━━━━━━━━━━ 43s 15ms/step - accuracy: 0.9161 - loss: 0.2452 - val_accuracy: 0.9529 - val_loss: 0.1298
Epoch 2/3
2736/2736 ━━━━━━━━━━━━━━━━━━━━ 39s 14ms/step - accuracy: 0.9589 - loss: 0.1162 - val_accuracy: 0.9551 - val_loss: 0.1293
Epoch 3/3
2736/2736 ━━━━━━━━━━━━━━━━━━━━ 48s 17ms/step - accuracy: 0.9626 - loss: 0.0981 - val_accuracy: 0.9557 - val_loss: 0.1308
684/684 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step
Accuracy: 0.9556774046150331
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     19199
           1       0.89      0.73      0.80      2686

    accuracy                           0.96     21885
   macro avg       0.92      0.86      0.89     21885
weighted avg       0.95      0.96      0.95     21885

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 