In [17]:
import sys
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [18]:
# !unzip '/content/drive/MyDrive/Dataset/archive.zip' -d '/content/drive/MyDrive/Dataset/glove'

In [19]:
EMBEDDING_FILE='/content/drive/MyDrive/Dataset/glove/glove.6B.50d.txt'
TRAIN_DATA_FILE='/content/drive/MyDrive/Dataset/train(1).csv'

In [20]:
train_data = pd.read_csv(TRAIN_DATA_FILE)

In [21]:
train_data.head(7)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0


In [22]:
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '', text) # Remove IP addresses
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
train_data['comment_text'] = train_data['comment_text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
X_train = train_data["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train_data[list_classes].values
del train_data
# list_sentences_test = test["comment_text"].fillna("_na_").values

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [25]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [26]:
X_t

array([[    0,     0,     0, ...,   702,  1134,   719],
       [    0,     0,     0, ...,  9677,   451,  1271],
       [   50,  7833,  6538, ...,  1845,   386,  5535],
       ...,
       [    0,     0,     0, ...,    93, 15938,  4641],
       [    0,     0,     0, ...,  5202,  2249,  3167],
       [    0,     0,     0, ...,  1108,  3426,   796]], dtype=int32)

In [27]:
def get_coefs(word,*arr):
  return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [28]:
embeddings_list = list(embeddings_index.values())
all_embs = np.stack(embeddings_list)

In [29]:
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

In [30]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [32]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
model.fit(X_t, y_train, batch_size=32, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7ff0787a3f40>

In [35]:
y_predict = model.predict([X_te], batch_size=1024, verbose=1)



In [36]:
from sklearn.metrics import accuracy_score

print('Accuracy Score: ', accuracy_score(y_test, y_predict))

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [38]:
y_test

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0]])

In [37]:
y_predict

array([[2.6530322e-01, 9.1741531e-04, 2.1044640e-02, 7.0091402e-03,
        6.6417813e-02, 2.6162550e-02],
       [2.8513299e-04, 1.5844807e-07, 7.0396643e-05, 3.4710938e-06,
        1.4671444e-05, 4.5594784e-06],
       [9.6194055e-03, 3.4679037e-05, 2.0043480e-03, 3.3234339e-04,
        1.2599294e-03, 2.9777328e-04],
       ...,
       [1.9713447e-03, 1.0318071e-06, 2.8411651e-04, 1.5611193e-05,
        1.4070026e-04, 2.8625596e-05],
       [9.3902183e-01, 1.2065660e-01, 4.7514296e-01, 9.8312408e-02,
        6.6858751e-01, 6.2587894e-02],
       [1.3786304e-02, 8.7200697e-06, 1.3291858e-03, 9.2155453e-05,
        1.5335295e-03, 1.5936441e-04]], dtype=float32)

In [40]:
# Binarize y_predict using a threshold of 0.5
y_predict_binarized = (y_predict >= 0.5).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_predict_binarized)

print("Accuracy:", accuracy)

Accuracy: 0.920695597681341
