In [48]:
import sys
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [49]:
# !unzip '/content/drive/MyDrive/Dataset/archive.zip' -d '/content/drive/MyDrive/Dataset/glove'

In [50]:
EMBEDDING_FILE='/content/drive/MyDrive/Dataset/glove/glove.6B.50d.txt'
TRAIN_DATA_FILE='/content/drive/MyDrive/Dataset/train(1).csv'

In [51]:
train_data = pd.read_csv(TRAIN_DATA_FILE)

In [52]:
train_data.head(7)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0


In [53]:
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '', text) # Remove IP addresses
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
train_data['comment_text'] = train_data['comment_text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [54]:
X_train = train_data["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train_data[list_classes].values
del train_data
# list_sentences_test = test["comment_text"].fillna("_na_").values

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [56]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [57]:
X_t

array([[    0,     0,     0, ...,   702,  1134,   719],
       [    0,     0,     0, ...,  9677,   451,  1271],
       [   50,  7833,  6538, ...,  1845,   386,  5535],
       ...,
       [    0,     0,     0, ...,    93, 15938,  4641],
       [    0,     0,     0, ...,  5202,  2249,  3167],
       [    0,     0,     0, ...,  1108,  3426,   796]], dtype=int32)

In [58]:
def get_coefs(word,*arr):
  return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [59]:
embeddings_list = list(embeddings_index.values())
all_embs = np.stack(embeddings_list)

In [60]:
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [61]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

# sigmoid

In [15]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x) # softmax
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.fit(X_t, y_train, batch_size=32, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7cbe9b120e80>

In [17]:
y_predict = model.predict([X_te], batch_size=1024, verbose=1)



In [18]:
from sklearn.metrics import accuracy_score

# print('Accuracy Score: ', accuracy_score(y_test, y_predict))

In [19]:
y_test

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0]])

In [20]:
y_predict

array([[2.4653517e-01, 4.2641937e-04, 1.3273870e-02, 4.8185340e-03,
        5.2660234e-02, 1.4039984e-02],
       [2.7344652e-04, 8.1704236e-08, 3.1968670e-05, 5.6563198e-07,
        3.1772783e-05, 2.2806498e-06],
       [8.2458714e-03, 1.4695460e-05, 1.5547239e-03, 8.7321030e-05,
        1.7027589e-03, 1.5934784e-04],
       ...,
       [1.7363590e-03, 3.3814177e-07, 8.0356294e-05, 3.9491506e-06,
        1.5948206e-04, 9.9110102e-06],
       [9.6482271e-01, 9.1782987e-02, 7.2374964e-01, 4.9303487e-02,
        6.8023688e-01, 3.2814901e-02],
       [1.5431989e-03, 6.1915750e-07, 1.4629702e-04, 3.9061802e-06,
        1.8868085e-04, 2.4040857e-05]], dtype=float32)

In [21]:
# Binarize y_predict using a threshold of 0.5
y_predict_binarized = (y_predict >= 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_predict_binarized)

print("Accuracy:", accuracy)

Accuracy: 0.9216669277769074


In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict_binarized))

              precision    recall  f1-score   support

           0       0.86      0.75      0.80      3056
           1       0.71      0.07      0.12       321
           2       0.84      0.80      0.82      1715
           3       0.00      0.00      0.00        74
           4       0.73      0.70      0.72      1614
           5       0.64      0.33      0.44       294

   micro avg       0.82      0.70      0.75      7074
   macro avg       0.63      0.44      0.48      7074
weighted avg       0.80      0.70      0.73      7074
 samples avg       0.07      0.06      0.06      7074



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# softmax

In [62]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="softmax")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [63]:
model.fit(X_t, y_train, batch_size=32, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7cbea2b52e30>

In [None]:
y_predict = model.predict([X_te], batch_size=1024, verbose=1)

In [None]:
y_predict

In [None]:
# Binarize y_predict using a threshold of 5
y_predict_binarized = (y_predict >= 5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_predict_binarized)

print("Accuracy:", accuracy)

In [None]:
print(classification_report(y_test, y_predict_binarized))