In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,Embedding,Input,Activation,LSTM,GlobalMaxPool1D,Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, optimizers, layers
from sklearn.metrics import roc_auc_score
import re
import warnings
warnings.simplefilter(action="ignore")


In [None]:
# List the contents of the Jigsaw competition input directory.
!ls /kaggle/input/jigsaw-toxic-comment-classification-challenge/


In [None]:
# Read the zipped training CSV into a DataFrame and preview its first rows.
train_csv = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
train_df = pd.read_csv(train_csv)
train_df.head()


In [None]:
# Read the zipped test CSV into a DataFrame and preview its first rows.
test_csv = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
test_df = pd.read_csv(test_csv)
test_df.head()


In [None]:
# Show the raw text of the training comment stored under row label 0.
train_df.loc[0, 'comment_text']


In [None]:
# Column sums of the six label columns, displayed as a one-column table
# (i.e. the number of flagged comments per label, assuming 0/1 labels).
pd.DataFrame(
    {'Count': train_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].sum()}
)


In [None]:
# Same per-label column sums, kept in `temp` so the next cell can plot them.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
temp = pd.DataFrame({'Count': train_df[label_columns].sum()})


In [None]:
# Bar chart of the per-label counts held in `temp`.
# `x` and `y` are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the previous positional call raises TypeError there
# (and a FutureWarning on 0.11). The keyword form works on every version.
ax = sns.barplot(x=temp.index, y=temp['Count'])
# Label the axes so the figure can be read on its own; the trailing `;`
# keeps the notebook from echoing the return value of `set`.
ax.set(xlabel='Label', ylabel='Count', title='Comments per label');


In [None]:
from nltk import word_tokenize
# Tokenize every comment with NLTK and store the token lists next to the raw text.
train_df['tokenized_text'] = train_df['comment_text'].apply(word_tokenize)
# Number of tokens per comment, in row order.
lengths = train_df['tokenized_text'].apply(len).tolist()
# Display the raw text of the comment that produced the most tokens.
longest_position = np.argmax(lengths)
train_df['comment_text'].iloc[longest_position]


In [None]:
import plotly.express as px
# Histogram of the per-comment token counts stored in `lengths`.
px.histogram(lengths)


In [None]:
from nltk.corpus import stopwords
def process_text(data, stop_words=None):
    """Add a cleaned ``processed_text`` column derived from ``comment_text``.

    Each comment goes through these steps, in order:

    1. every run of whitespace (newlines, tabs, ...) becomes a single space;
    2. ``http://`` / ``https://`` URLs are replaced by the placeholder ``urls``
       (the scheme is matched case-sensitively, before lower-casing);
    3. the text is lower-cased;
    4. every character that is not an ASCII letter or a space is removed;
    5. stop words are dropped and the remaining words are joined by single
       spaces, which also trims leading/trailing blanks.

    Stop words are matched after step 4, so entries that contain an
    apostrophe or other punctuation can never match.

    Args:
        data: DataFrame with a ``comment_text`` column. It is modified in
            place: the ``processed_text`` column is added to (or overwritten
            on) this very object. Missing comments are treated as ``""``.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The same ``data`` object, now carrying ``processed_text``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Set membership is O(1); scanning a list for every word is not.
    stop_set = frozenset(stop_words)

    # Raw strings: '\S' inside a normal literal is an invalid escape sequence.
    whitespace_re = re.compile(r'\s+')
    url_re = re.compile(r'http://\S+|https://\S+')
    non_letter_re = re.compile(r'[^A-Za-z ]+')

    def _clean(text):
        """Apply the five cleaning steps to a single comment string."""
        # Turn tabs/newlines into spaces first; otherwise the letter filter
        # below would delete them and glue neighbouring words together.
        text = whitespace_re.sub(' ', text)
        text = url_re.sub('urls', text).lower()
        text = non_letter_re.sub('', text)
        return ' '.join(word for word in text.split() if word not in stop_set)

    # One pass over the column instead of five row-wise DataFrame.apply calls.
    data['processed_text'] = data['comment_text'].fillna('').map(_clean)
    return data


In [None]:
# Clean both splits, training frame first. process_text adds `processed_text`
# to the frame it receives and returns that same frame, so `train` / `test`
# refer to the same objects as `train_df` / `test_df`.
train, test = (process_text(frame) for frame in (train_df, test_df))


In [None]:
# A comment can clean down to an empty string (e.g. when it holds nothing
# but stop words, digits or punctuation). For those rows fall back to the
# raw comment text so no sample is left empty.
for frame in (train, test):
    frame["processed_text"] = [
        raw if len(cleaned) == 0 else cleaned
        for raw, cleaned in zip(frame["comment_text"], frame["processed_text"])
    ]


In [None]:
# Preprocessing hyper-parameters. They are named once and reused below so the
# tokenizer and both padding calls cannot drift apart (previously `num_words`
# was defined but the literals 30000 / 300 were repeated instead).
num_words = 30000  # `num_words` cap handed to the Keras tokenizer
max_len = 300      # `maxlen` used when padding both splits

tokenizer = Tokenizer(num_words=num_words)
# The vocabulary is fitted on the training text only; the same fitted
# tokenizer then encodes both the training and the test comments.
tokenizer.fit_on_texts(train['processed_text'])
train_tokens = tokenizer.texts_to_sequences(train['processed_text'])
test_tokens = tokenizer.texts_to_sequences(test['processed_text'])

# Bring every sequence to the same `max_len` so they stack into 2-D arrays.
train_seq = pad_sequences(train_tokens, maxlen=max_len)
test_seq = pad_sequences(test_tokens, maxlen=max_len)


In [None]:
# Print the shape of each padded matrix (train first, then test) on one line.
print(*(matrix.shape for matrix in (train_seq, test_seq)))


In [None]:
# Target matrix: the six label columns of the training frame, in this order.
train_labels = train.loc[
    :,
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
]


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping


In [None]:
# Checkpoint callback: a file is written only for epochs whose `val_auc`
# beats the best value seen so far; the epoch number is baked into the name.
filepath = '/kaggle/working/best_model_weights-{epoch:02d}.hdf5'
save_model_callback = ModelCheckpoint(
    filepath=filepath,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    verbose=1,
)


In [None]:
# Stop training once validation AUC stops improving.
#  * mode='max': AUC is better when larger. It is stated explicitly, like on
#    the checkpoint callback, instead of relying on the 'auto' name inference.
#  * min_delta=1e-3: the previous 0.1 is a tenth of AUC's whole 0-1 range, so
#    practically no epoch after the first could register as an improvement
#    and training always ended as soon as `patience` ran out.
#  * restore_best_weights=True: when training is stopped early, the model is
#    rolled back to the weights of its best epoch rather than the last one.
earlystop = EarlyStopping(
    monitor='val_auc',
    mode='max',
    min_delta=1e-3,
    patience=2,
    verbose=1,
    restore_best_weights=True,
)


In [None]:
# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()

# Take the input dimensions from the preprocessing artefacts instead of
# repeating the literals 30000 / 300, so the network always matches the
# tokenizer vocabulary cap and the padded sequence length.
vocab_size = tokenizer.num_words
seq_len = train_seq.shape[1]

# Embedding -> LSTM (full sequence output) -> max over time -> small dense
# head with light dropout -> six independent sigmoid outputs, one per label.
input_layer = Input(shape=(seq_len,))
x = Embedding(vocab_size, 200)(input_layer)
x = LSTM(60, return_sequences=True)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
output_layer = Dense(6, activation="sigmoid")(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()


In [None]:
# Both callbacks monitor `val_auc`, so the AUC metric's name is pinned
# explicitly. With the string shortcut 'AUC' Keras generates the name itself,
# and re-running this cell without clearing the session produces `auc_1`,
# which leaves `val_auc` missing from the logs the callbacks read.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)
# 20% of the training data is held out for validation; the checkpoint and
# early-stopping callbacks are evaluated on it after every epoch.
model.fit(
    train_seq,
    train_labels,
    batch_size=128,
    validation_split=0.2,
    epochs=5,
    callbacks=[save_model_callback, earlystop],
)


In [None]:
# Read the zipped sample-submission CSV shipped with the competition data.
sample_submission_csv = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip"
sample_submission = pd.read_csv(sample_submission_csv)


In [None]:
# Join the test rows with the submission template on their shared `id` column.
df_test = test.merge(sample_submission, on="id")


In [None]:
# Preview the first five rows of the merged frame.
df_test.head(n=5)


In [None]:
# Score the padded test sequences with the trained model.
y_pred = model.predict(x=test_seq)


In [None]:
# Overwrite the six label columns of the merged frame with the model's
# predictions, then preview the result.
prediction_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
df_test[prediction_columns] = y_pred
df_test.head()


In [None]:
# Drop the text columns so only the submission columns remain, then write the
# CSV. Rebinding `df_test` (instead of `inplace=True`) together with
# errors="ignore" keeps this cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
df_test = df_test.drop(columns=["comment_text", "processed_text"], errors="ignore")
df_test.to_csv("sample_submission.csv", index=False)
