In [None]:
import zipfile

# Directory that holds the zipped competition files.
INPUT_DIR = '/kaggle/input/jigsaw-toxic-comment-classification-challenge'

# Extract every archive into the current working directory. The `with` block
# closes each ZipFile handle as soon as its contents have been written out.
for archive_name in ('train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip'):
    with zipfile.ZipFile(f'{INPUT_DIR}/{archive_name}') as archive:
        archive.extractall()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
! pip install contractions

In [None]:
import contractions
from sklearn.feature_extraction.text import CountVectorizer
import string
import nltk
from nltk.tokenize import word_tokenize
import re 
from nltk.corpus import stopwords

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

from sklearn import metrics

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

# Import the corpus reader under its own name so that rebinding `stopwords`
# to a plain list below does not break this cell when it is run a second time.
from nltk.corpus import stopwords as stopwords_corpus

# English stop words, keeping 'not' out of the list so that negations survive
# stop-word removal.
stopwords = [word for word in stopwords_corpus.words('english') if word != 'not']

In [None]:
# Load the extracted training file, using the 'id' column as the index,
# and preview the first rows.
df = pd.read_csv("/kaggle/working/train.csv", index_col = 'id')
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the raw training frame.
df.info()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Print the value distribution of every integer-typed column, each followed
# by a separator line.
for _, int_column in df.select_dtypes("int").items():
    print(int_column.value_counts())
    print("----------------------")

In [None]:
def drop_stop_words(text):
    """Tokenize ``text`` and rejoin it without the tokens listed in ``stopwords``.

    Membership is checked on the lower-cased token, while surviving tokens
    keep their original casing. The kept tokens are joined with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(text) if token.lower() not in stopwords]
    return " ".join(kept_tokens)


def remove_elongation(text):
    """Collapse any character repeated three or more times in a row to a single occurrence.

    Runs of exactly two identical characters are left untouched.
    """
    return re.sub(r"(.)\1{2,}", r"\1", text)


def processing(text):
    """Clean one raw comment before tokenization.

    Steps, in order: strip URLs, drop ``#`` characters and surrounding
    whitespace, expand contractions, remove ASCII punctuation, collapse runs
    of non-word characters to one space, remove stop words, drop every
    character that is not an ASCII letter or whitespace, remove the standalone
    words ``user``/``username`` (with the whitespace after them), and collapse
    characters repeated three or more times.

    Returns the cleaned string. If any step raises, the error is printed and
    the unmodified input is returned.
    """
    original = text  # kept so the except branch really returns the input
    try:
        text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs

        text = re.sub(r'#', '', text).strip()  # remove hash signs

        text = contractions.fix(text)  # expand contractions
        text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation

        text = re.sub(r'\W+', ' ', text)  # collapse non-word runs to a space

        text = drop_stop_words(text)
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep ASCII letters and whitespace only

        # \b stops the pattern from eating the tail of longer words
        # (e.g. "abuser " must not become "ab").
        text = re.sub(r'\buser(?:name)?\s', '', text)
        text = remove_elongation(text)

        return text
    except Exception as e:
        print("Error processing text:", e)
        return original  # Return the original text if an error occurs

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning below.
df_copy = df.copy()

In [None]:
# Run the cleaning pipeline over every training comment.
df_copy['comment_text'] = df_copy["comment_text"].apply(processing)

In [None]:
# Per-column count of missing values after cleaning.
df_copy.isnull().sum()

In [None]:
# Number of fully duplicated rows now that the comment text has been cleaned.
df_copy.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row.
df_copy = df_copy.drop_duplicates()

In [None]:
# Column names of the cleaned frame as a plain list.
df_copy.columns.tolist()

In [None]:
# Dtypes and non-null counts after cleaning and de-duplication.
df_copy.info()

In [None]:
def length_plot(data):
    """Plot a 40-bin histogram of per-sentence word counts, then print their average and maximum.

    ``data`` is an iterable of strings; each one is split on whitespace to
    count its words.
    """
    word_counts = list(map(lambda sentence: len(sentence.split()), data))
    plt.hist(word_counts, bins=40)
    plt.show()
    print(f'Avg: {np.average(word_counts)}')
    print(f'Max: {np.max(word_counts)}\n')


length_plot(df_copy["comment_text"])

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Build the vocabulary from the cleaned training comments.
tok = Tokenizer()
tok.fit_on_texts(df_copy['comment_text'])
# word -> integer id mapping learned by the tokenizer.
words = tok.word_index

In [None]:
# Number of distinct words in the fitted vocabulary.
len(words)

In [None]:
# Encode every cleaned training comment as a list of integer word ids.
sequence = tok.texts_to_sequences(df_copy['comment_text'])

In [None]:
# Reverse lookup: integer id -> word.
idx2word = {index: word for word, index in words.items()}

In [None]:
def reconstruct(tokens, index_to_word=None):
    """Turn a sequence of integer token ids back into a space-joined string.

    Parameters
    ----------
    tokens : iterable of int
        Token ids to decode.
    index_to_word : mapping of int -> str, optional
        Lookup table to decode with. When omitted, the module-level
        ``idx2word`` is used.

    Returns
    -------
    str
        The decoded words joined by single spaces.

    Raises
    ------
    KeyError
        If a non-zero id is missing from the lookup table.
    """
    if index_to_word is None:
        index_to_word = idx2word
    # Id 0 is never assigned to a word by the Keras Tokenizer and is the
    # default fill value of pad_sequences, so skip it to allow decoding
    # padded rows as well.
    return " ".join(index_to_word[token] for token in tokens if token != 0)
# Sanity check: decode the first encoded comment back into text.
reconstruct(sequence[0])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed length of every encoded comment, in tokens.
max_sequence_len = 150

# Pad short comments at the end so all rows have max_sequence_len entries.
x_padded = pad_sequences(
    sequence,
    maxlen=max_sequence_len,
    padding="post",
)
x_padded

In [None]:
# Every column except the first one is used as the target matrix.
# NOTE(review): assumes the first column is 'comment_text' and the rest are
# the label columns — confirm against the CSV column order.
df_label = df_copy.iloc[:, 1:]
df_label

In [None]:
# Hold out 10% of the rows; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_padded,
    df_label,
    random_state=42,
    test_size=0.1,
)

In [None]:
# Dimensions of the training feature matrix.
x_train.shape

In [None]:
# Display the held-out feature matrix.
x_test

In [None]:
# One more than the number of known words; used as the Embedding input size.
# NOTE(review): the +1 presumably accounts for id 0, which the Keras Tokenizer
# does not assign to any word — confirm.
VOCAB_SIZE = len(words) + 1
VOCAB_SIZE

In [None]:
# Size of each word embedding vector.
VECTOR_FEATURES = 128

# Two stacked bidirectional LSTM layers on top of a trainable embedding,
# followed by a small dense head with six sigmoid outputs.
lstm_bi = tf.keras.models.Sequential()
lstm_bi.add(tf.keras.layers.Embedding(VOCAB_SIZE, VECTOR_FEATURES))
lstm_bi.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)))
lstm_bi.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
lstm_bi.add(tf.keras.layers.Dropout(0.1))
lstm_bi.add(tf.keras.layers.Dense(64, activation='relu'))
lstm_bi.add(tf.keras.layers.Dropout(0.2))
lstm_bi.add(tf.keras.layers.Dense(6, activation='sigmoid'))

In [None]:
# NOTE(review): this AUC metric is created but not passed to compile() below.
# Adding it to `metrics` would make evaluate() return three values, so the
# cell that unpacks (loss, accuracy) would have to change with it.
roc_auc_metric = tf.keras.metrics.AUC()

lstm_bi.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by current Keras releases.
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-03),
    metrics=["accuracy"],
)

In [None]:
# Train for five epochs, reporting loss/accuracy on the held-out split after
# each one.
history = lstm_bi.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=128,
    epochs=5,
)

In [None]:
# Score the trained model on the held-out split and report both numbers.
test_loss, test_acc = lstm_bi.evaluate(x_test, y_test)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

In [None]:
# Load the extracted competition test file and show its column summary.
test_df = pd.read_csv('/kaggle/working/test.csv')
test_df.info()

In [None]:
# Apply the same cleaning pipeline that was used on the training comments.
test_df['comment_text'] = test_df["comment_text"].apply(processing)
test_df

In [None]:
# Encode and pad the test comments exactly like the training data, then
# predict the six label probabilities for each row.
test_sequences = tok.texts_to_sequences(test_df['comment_text'])
test = pad_sequences(test_sequences, maxlen=max_sequence_len, padding="post")
predicted = lstm_bi.predict(test)

In [None]:
# Name the six prediction columns.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predict_df = pd.DataFrame(predicted, columns=label_columns)

In [None]:
# Put the comment ids next to their predicted probabilities, column-wise.
sub = pd.concat([test_df['id'], predict_df], axis=1)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("submission.csv", index= False)