In [None]:
# Download the Jigsaw toxic-comment dataset archive with the Kaggle CLI.
# In the recorded run the zip was saved under /content.
!kaggle datasets download -d julian3833/jigsaw-toxic-comment-classification-challenge

Dataset URL: https://www.kaggle.com/datasets/julian3833/jigsaw-toxic-comment-classification-challenge
License(s): CC0-1.0
Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
 79% 42.0M/53.4M [00:00<00:00, 90.9MB/s]
100% 53.4M/53.4M [00:00<00:00, 100MB/s] 


In [None]:
!unzip /content/jigsaw-toxic-comment-classification-challenge.zip

Archive:  /content/jigsaw-toxic-comment-classification-challenge.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: test_labels.csv         
  inflating: train.csv               


In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training split extracted from the Kaggle archive.
df = pd.read_csv(filepath_or_buffer='/content/train.csv')

In [None]:
# Peek at the first five rows to confirm the id, text and label columns loaded.
df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# (rows, columns) of the full training frame before any subsampling.
df.shape

(159571, 8)

In [None]:
# Split the frame by the binary `toxic` label.
toxic_1 = df.loc[df['toxic'] == 1]
toxic_0 = df.loc[df['toxic'] == 0]

# Draw 1,000 rows from each class (fixed seed) to get a small, balanced subset.
toxic_1_sample = toxic_1.sample(n=1000, random_state=42)
toxic_0_sample = toxic_0.sample(n=1000, random_state=42)

# Stack both classes, shuffle the rows so the classes are interleaved,
# and replace the original index with a fresh 0..N-1 range.
df = (
    pd.concat([toxic_1_sample, toxic_0_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(df.head())

                 id                                       comment_text  toxic  \
0  37c6fb098a585ee5  "\n\n Non-canonical Oz works by ""Royal Histor...      0   
1  a3d98cdb97254fb0  Just because you enjoy sucking an occasional n...      1   
2  ed0ac16f87ab1e48  The site is obviously not encyclopedic (which,...      0   
3  341e18f9ffda370f         Also, I like to have sex with little boys.      1   
4  598ebda7901d913b  "\n\n FYI \n\nYou may be interested to know th...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        1       0       1              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [None]:
# Keep only the comment text and the binary `toxic` target.
# .copy() makes the result an independent frame: a later cell adds a new column
# to `df`, and assigning into a bare column slice triggers pandas'
# SettingWithCopyWarning (currently hidden by warnings.filterwarnings('ignore')).
df = df[['comment_text', 'toxic']].copy()

In [None]:
# Show the first five rows of the reduced two-column frame.
df.head(n=5)


Unnamed: 0,comment_text,toxic
0,"""\n\n Non-canonical Oz works by """"Royal Histor...",0
1,Just because you enjoy sucking an occasional n...,1
2,"The site is obviously not encyclopedic (which,...",0
3,"Also, I like to have sex with little boys.",1
4,"""\n\n FYI \n\nYou may be interested to know th...",0


In [None]:
# Print column dtypes and non-null counts to check for missing values
# in the sampled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_text  2000 non-null   object
 1   toxic         2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 31.4+ KB


In [None]:
# Fit the encoder on the `toxic` column, then store the encoded classes in a
# separate `Label` column that is later used as the training target.
label_encoder = LabelEncoder().fit(df['toxic'])
df['Label'] = label_encoder.transform(df['toxic'])


In [None]:
# Build the word -> integer-id vocabulary from every comment in the sampled frame.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split,
# so words that only occur in test comments are part of it - confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=df['comment_text'])

In [None]:
# Convert each comment to a list of vocabulary ids.
sequences = tokenizer.texts_to_sequences(df['comment_text'])

# Pad every sequence to the length of the longest comment in the sample.
max_sequence_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Target vector as a plain NumPy array.
y = df['Label'].to_numpy()

In [None]:
# +1 because the tokenizer's word ids start at 1; id 0 is what pad_sequences fills with.
vocab_size = 1 + len(tokenizer.word_index)

# Hold out 20% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Binary toxicity classifier: token embedding -> single GRU layer -> sigmoid unit.
embedding_dim = 100
gru_model = Sequential([
    # `input_length` is intentionally not passed: Keras 3 deprecates the argument
    # (it only emits a warning and is ignored). The sequence length is inferred
    # from the padded input the first time the model is called.
    Embedding(vocab_size, embedding_dim),
    GRU(128),
    # Single sigmoid output = predicted probability of the positive (toxic) class.
    Dense(1, activation='sigmoid')
])
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train with early stopping instead of a fixed 40 epochs: in the recorded run the
# validation loss climbs from epoch 2 onward while training accuracy reaches 1.0,
# i.e. the model overfits almost immediately.
# Validation data is carved out of the training set (validation_split) so the
# held-out test set is not used for model selection and stays unbiased for
# the final evaluate() call.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,  # roll back to the epoch with the lowest val_loss
)
history = gru_model.fit(
    X_train,
    y_train,
    epochs=40,  # upper bound; early stopping normally ends training sooner
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 51ms/step - accuracy: 0.5771 - loss: 0.6709 - val_accuracy: 0.8000 - val_loss: 0.4387
Epoch 2/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.8765 - loss: 0.3254 - val_accuracy: 0.7925 - val_loss: 0.4354
Epoch 3/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.9792 - loss: 0.0888 - val_accuracy: 0.8200 - val_loss: 0.4944
Epoch 4/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.9997 - loss: 0.0122 - val_accuracy: 0.8400 - val_loss: 0.6392
Epoch 5/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.9981 - loss: 0.0049 - val_accuracy: 0.8400 - val_loss: 0.6599
Epoch 6/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 0.8400 - val_loss: 0.7191
Epoch 7/40
[1m50/50[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7c98fa099f00>

In [None]:
# Report [loss, accuracy] of the trained model on the held-out test split.
gru_model.evaluate(x=X_test, y=y_test)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.8504 - loss: 1.1590


[1.0787733793258667, 0.8525000214576721]

In [None]:

# Sanity check on a hand-written hostile comment.
custom_text = ["You're completely useless and your opinions are garbage. No one cares about what you have to say."]

# Apply the same preprocessing as for training: token ids, then pad to the
# sequence length the model was trained on.
custom_sequence = tokenizer.texts_to_sequences(custom_text)
custom_padded = pad_sequences(custom_sequence, maxlen=max_sequence_length)

prediction = gru_model.predict(custom_padded)
print(prediction)

# prediction[0][0] is the sigmoid output for the single input text;
# anything above 0.5 is reported as toxic.
print("This comment is toxic." if prediction[0][0] > 0.5 else "This comment is non-toxic.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[[0.9999678]]
This comment is toxic.


In [None]:

# Sanity check on a hand-written polite comment.
custom_text = ["Thank you for sharing your insights! I found your explanation very helpful and clear."]

# Tokenize with the fitted vocabulary and pad to the training sequence length.
custom_sequence = tokenizer.texts_to_sequences(custom_text)
custom_padded = pad_sequences(custom_sequence, maxlen=max_sequence_length)

prediction = gru_model.predict(custom_padded)
print(prediction)

# Threshold the sigmoid output of the only sample at 0.5.
print("This comment is toxic." if prediction[0][0] > 0.5 else "This comment is non-toxic.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[[3.333689e-07]]
This comment is non-toxic.
