<a href="https://colab.research.google.com/github/SaumilShah-7/Toxic-Comment-Classification-Challenge-Kaggle/blob/master/Toxic_Comment_Classification_(LSTM_%2B_GRU_%26_Fasttext_%2B_Glove).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import gc
from tqdm.notebook import tqdm_notebook as tqdm

import tensorflow as tf
print(tf.__version__)

from tensorflow.keras.preprocessing import text, sequence

In [None]:
# Extract the competition archives into the working directory
# (-q: quiet, -o: overwrite existing files without prompting).
# Only train.csv and test.csv are read by the cells shown in this notebook.
!unzip -q -o '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
!unzip -q -o '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
!unzip -q -o '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'

In [None]:
# Read the two extracted competition files into DataFrames.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# One (rows, columns) tuple per line: train first, then test.
print(train.shape, test.shape, sep='\n')

In [None]:
import regex as re
!pip install Unidecode
from unidecode import unidecode

# Matches every character that is neither an ASCII letter nor an apostrophe.
words_only = re.compile(r'[^A-Za-z\']')


def clean_text(x):
    """Transliterate ``x`` with ``unidecode`` and blank out non-word characters.

    Every character of the transliterated text other than A-Z, a-z and the
    apostrophe is replaced by a single space (one space per character, so runs
    of removed characters become runs of spaces).
    """
    return words_only.sub(' ', unidecode(x))


# Store the cleaned comment next to the raw one on both splits.
train['clean_text'] = train['comment_text'].apply(clean_text)
test['clean_text'] = test['comment_text'].apply(clean_text)

In [None]:
# Show the row labelled 1 before and after cleaning.
print(train.loc[1, 'comment_text'])
print(train.loc[1, 'clean_text'])

In [None]:
# Series.fillna returns a new Series and leaves the column untouched, so the result
# must be assigned back for the placeholder to actually be stored.
train['clean_text'] = train['clean_text'].fillna('something')
# Rows whose cleaned text is exactly the placeholder token.
print(train[train.clean_text == 'something'])

test['clean_text'] = test['clean_text'].fillna('something')
print(test[test.clean_text == 'something'])

In [None]:
# Vocabulary cap: passed to the Tokenizer as `num_words` and used as the upper bound
# on the number of embedding-matrix rows.
max_features = 250000

In [None]:
# Build the vocabulary from the cleaned train and test comments together.
t = text.Tokenizer(num_words=max_features)
t.fit_on_texts([*train['clean_text'], *test['clean_text']])

# Number of distinct tokens seen (not limited by num_words).
print(len(t.word_index))

In [None]:
word_index = t.word_index

# Preview a handful of entries only; echoing the whole vocabulary dict floods the
# notebook output and bloats the saved file.
dict(list(word_index.items())[:20])

In [None]:
# Convert each cleaned comment into its list of token ids (train first, then test).
X_train, X_test = (
    t.texts_to_sequences(frame['clean_text']) for frame in (train, test)
)

print(X_train[0])

In [None]:
# Token count of every training sequence (the name `l` is deleted by a later cell).
l = [len(tokens) for tokens in X_train]
print(
    'Min: %d, Mean: %d, Q3: %d, Max: %d'
    % (min(l), sum(l)/len(l), np.percentile(l, 75), max(l))
)

In [None]:
# Pick the label columns by name rather than by position: the positional slice [2:-1]
# is only right when `clean_text` has already been appended as the last column, and it
# silently drops a real label otherwise.
non_label_columns = {'id', 'comment_text', 'clean_text'}
toxicity_columns = [col for col in train.columns if col not in non_label_columns]

# The model's output layer has 6 units, one per label.
assert len(toxicity_columns) == 6, toxicity_columns
print(toxicity_columns)

In [None]:
# Fixed sequence length fed to the network.
maxlen = 900

# Pad/truncate both splits to `maxlen` using pad_sequences' default settings.
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)
)

# Label matrix: one column per entry of toxicity_columns.
y_train = train[toxicity_columns].to_numpy()

print(x_train.shape, y_train.shape)
print(x_test.shape)
print(toxicity_columns)

In [None]:
# Persist the padded inputs and labels so they can be freed and reloaded later.
np.save('x_train.npy', x_train)
np.save('x_test.npy', x_test)
np.save('y_train.npy', y_train)

# The vocabulary is needed again when the embedding matrix is built.
with open('word_index.pickle', 'wb') as handle:
  pickle.dump(word_index, handle, pickle.HIGHEST_PROTOCOL)

In [None]:
# Drop the in-memory copies of everything that was just written to disk (plus the
# tokenizer and the length list) before the large embedding files are loaded.
del X_train, X_test
del x_train, x_test, y_train
del t, word_index, l

gc.collect()

In [None]:
# Pre-trained embedding text files read by build_matrix / build_matrix_1.
ft_path = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
# NOTE(review): the dataset folder name says "100d" while the file is the 200d variant;
# confirm the path exists. The 200 columns are what embed_size = 500 (300 + 200) expects.
gl_path = '../input/glovetwitter27b100dtxt/glove.twitter.27B.200d.txt'

In [None]:
def get_coefs(word, *arr):
  """Split one parsed embedding line into ``(token, vector)``.

  ``word`` is returned unchanged; the remaining positional arguments are
  converted to a 1-D float32 NumPy array.
  """
  vector = np.asarray(arr, dtype=np.float32)
  return word, vector

In [None]:
# import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format(ft_path)

# words = model.index2word

# w_rank = {}
# for i,word in enumerate(words):
#     w_rank[word] = i

# WORDS = w_rank

# del model, words, w_rank
# gc.collect()

In [None]:
# def words(text): return re.findall(r'\w+', text.lower())

# def P(word): 
#     "Probability of `word`."
#     # use inverse of rank as proxy
#     # returns 0 if the word isn't in the dictionary
#     return - WORDS.get(word, 0)

# def correction(word): 
#     "Most probable spelling correction for word."
#     return max(candidates(word), key=P)

# def candidates(word): 
#     "Generate possible spelling corrections for word."
#     return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

# def known(words): 
#     "The subset of `words` that appear in the dictionary of WORDS."
#     return set(w for w in words if w in WORDS)

# def edits1(word):
#     "All edits that are one edit away from `word`."
#     letters    = 'abcdefghijklmnopqrstuvwxyz'
#     splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
#     deletes    = [L + R[1:]               for L, R in splits if R]
#     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
#     replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
#     inserts    = [L + c + R               for L, R in splits for c in letters]
#     return set(deletes + transposes + replaces + inserts)

# def edits2(word): 
#     "All edits that are two edits away from `word`."
#     return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [None]:
# Reload the vocabulary that an earlier cell pickled and then deleted from memory.
with open('word_index.pickle', 'rb') as handle:
    word_index = pickle.load(handle)

# Keras Tokenizer ids start at 1 (0 is the padding id) and ids >= num_words are dropped,
# so the largest id that can appear is min(max_features - 1, len(word_index)). The
# embedding table needs one row more than that; without the "+ 1" the last word falls
# outside the matrix whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Row width: the matrix builders put fastText in columns [:300] and GloVe in [300:].
embed_size = 500
# Used by getword to strip everything except ASCII letters from a token.
word_process = re.compile(r'[^A-Za-z]')

def getword(embeddings_keys, word):
    """Return the spelling of ``word`` under which it appears in ``embeddings_keys``.

    Variants are tried in this order: the word as given, lower-case, upper-case,
    capitalised, the word with every non-letter removed (``word_process``), and
    finally, for words of 2-15 characters, the result of the notebook's
    ``correction`` spell-checker when that function is defined.

    Args:
        embeddings_keys: any container supporting ``in``; a dict or set is far
            faster than a list because several membership tests run per word.
        word: the token to look up.

    Returns:
        The first variant found in ``embeddings_keys``, or ``None`` when no
        variant is present.
    """
    for candidate in (word, word.lower(), word.upper(), word.capitalize()):
        if candidate in embeddings_keys:
            return candidate

    # Compute the letters-only form once instead of once for the test and once for the return.
    letters_only = word_process.sub('', word)
    if letters_only in embeddings_keys:
        return letters_only

    # `correction` lives in an optional (currently commented-out) cell. Look it up
    # defensively so a missing spell-checker means "not found" rather than NameError.
    spell_fix = globals().get('correction')
    if spell_fix is not None and 1 < len(word) <= 15:
        x = spell_fix(word)
        if x in embeddings_keys:
            return x

    return None

# Number of leading columns of each row that hold the fastText vector; the rest hold GloVe.
_FT_DIM = 300


def _load_embeddings(path):
    """Read a whitespace-separated embedding text file into a ``{token: float32 vector}`` dict."""
    # The context manager closes the (multi-GB) file deterministically; the explicit
    # encoding avoids depending on the platform's default codec.
    with open(path, encoding='utf-8') as handle:
        return dict(get_coefs(*line.strip().split()) for line in handle if line.strip())


def _fill_row(matrix, row, token, embeddings_ft, embeddings_gl):
    """Copy ``token``'s vectors into ``matrix[row]``; return False (row untouched) if fastText lacks it."""
    ft_vector = embeddings_ft.get(token)
    if ft_vector is None:
        return False
    matrix[row, :_FT_DIM] = ft_vector
    gl_vector = embeddings_gl.get(token)
    if gl_vector is not None:
        # GloVe is optional: when it has no entry the trailing columns stay zero.
        matrix[row, _FT_DIM:] = gl_vector
    return True


def build_matrix(nb_words, embed_size):
    """Build the embedding matrix, resolving unknown words through ``getword``.

    Reads the module-level ``word_index``, ``ft_path`` and ``gl_path``.

    Args:
        nb_words: number of rows; vocabulary ids >= ``nb_words`` are ignored.
        embed_size: number of columns (fastText width + GloVe width).

    Returns:
        ``(matrix, corrected, words_not_found)``: the ``(nb_words, embed_size)``
        array, the list of ``(original, matched_variant)`` pairs for words that
        matched under a different spelling, and the list of words with no match
        (their rows receive the vectors of the token "something" when available).
    """
    embeddings_ft = _load_embeddings(ft_path)
    embeddings_gl = _load_embeddings(gl_path)

    corrected = []
    words_not_found = []
    matrix = np.zeros((nb_words, embed_size))

    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            # Skip rather than stop, so the result does not rely on the dict being sorted by id.
            continue
        # Pass the dict itself: membership tests are O(1), unlike on a list of its ~2M keys.
        word2 = getword(embeddings_ft, word)
        if word2 is not None and _fill_row(matrix, i, word2, embeddings_ft, embeddings_gl):
            if word2 != word:
                corrected.append((word, word2))
        else:
            words_not_found.append(word)
            _fill_row(matrix, i, "something", embeddings_ft, embeddings_gl)

    return matrix, corrected, words_not_found


def build_matrix_1(nb_words, embed_size, correction_map):
    """Build the embedding matrix using a precomputed ``correction_map`` for unknown words.

    Reads the module-level ``word_index``, ``ft_path`` and ``gl_path``.

    Args:
        nb_words: number of rows; vocabulary ids >= ``nb_words`` are ignored.
        embed_size: number of columns (fastText width + GloVe width).
        correction_map: dict mapping a vocabulary word to the replacement word
            whose vectors should be used when the word itself has no fastText vector.

    Returns:
        ``(matrix, corrected, words_not_found)`` with the same meaning as in
        ``build_matrix``. A word whose replacement has no fastText vector either
        is reported as not found instead of writing NaN into its row.
    """
    embeddings_ft = _load_embeddings(ft_path)
    embeddings_gl = _load_embeddings(gl_path)

    corrected = []
    words_not_found = []
    matrix = np.zeros((nb_words, embed_size))

    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            # Skip rather than stop, so the result does not rely on the dict being sorted by id.
            continue
        if _fill_row(matrix, i, word, embeddings_ft, embeddings_gl):
            continue
        word2 = correction_map.get(word)
        if word2 is not None and _fill_row(matrix, i, word2, embeddings_ft, embeddings_gl):
            corrected.append((word, word2))
        else:
            words_not_found.append(word)
            _fill_row(matrix, i, "something", embeddings_ft, embeddings_gl)

    return matrix, corrected, words_not_found

In [None]:
# Mapping consumed by build_matrix_1: vocabulary word -> replacement word whose vectors
# are used when the word itself has no fastText entry.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load this
# from a dataset you trust.
with open('../input/mapping/correction_map_final.pickle', 'rb') as handle:
    correction_map = pickle.load(handle)

print(len(correction_map))

In [None]:
# Alternative that resolves unknown words on the fly instead of via the precomputed map:
# embedding_matrix, corrected, words_not_found = build_matrix(nb_words, embed_size)
embedding_matrix, corrected, words_not_found = build_matrix_1(
    nb_words,
    embed_size,
    correction_map,
)

print(embedding_matrix.shape)

In [None]:
print(len(corrected))
# Show a sample only: the full list may be very long and would flood the cell output.
print(corrected[:50])

In [None]:
print(len(words_not_found))
# Show a sample only: the full list may be very long and would flood the cell output.
print(words_not_found[:50])

In [None]:
# Write the matrix to disk, then release it and the two diagnostic lists.
np.save('embedding_matrix.npy', embedding_matrix)

del embedding_matrix
del words_not_found, corrected
gc.collect()

In [None]:
# Bring the preprocessed inputs, labels and embedding matrix back from disk.
x_train, x_test, y_train = (
    np.load(filename) for filename in ('x_train.npy', 'x_test.npy', 'y_train.npy')
)
embedding_matrix = np.load('embedding_matrix.npy')

In [None]:
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Dense, Conv1D, LSTM
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras import optimizers

def get_model():
  """Assemble and compile the BiLSTM -> BiGRU classifier.

  Uses the notebook-level ``maxlen``, ``nb_words``, ``embed_size`` and
  ``embedding_matrix``. The network embeds the token ids with the pre-built
  matrix, applies spatial dropout, runs a bidirectional LSTM followed by a
  bidirectional GRU, and feeds the concatenation of average pooling, one GRU
  state output and max pooling into a 6-unit sigmoid layer.

  Returns:
    The compiled Keras ``Model`` (binary cross-entropy, Adam with value clipping at 1).
  """
  token_ids = Input(shape=(maxlen, ))

  embedded = Embedding(nb_words, embed_size, weights=[embedding_matrix])(token_ids)
  embedded = SpatialDropout1D(0.5)(embedded)

  lstm_seq = Bidirectional(LSTM(40, return_sequences=True))(embedded)
  # NOTE(review): a Bidirectional GRU with return_state=True yields the sequence plus two
  # state tensors (presumably forward and backward); only the first state is used below.
  gru_seq, x_h, _unused_state = Bidirectional(
      GRU(40, return_sequences=True, return_state=True)
  )(lstm_seq)
  # Disabled experiment kept from the original notebook:
  # x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

  avg_pool = GlobalAveragePooling1D()(gru_seq)
  max_pool = GlobalMaxPooling1D()(gru_seq)
  features = concatenate([avg_pool, x_h, max_pool])
  probabilities = Dense(6, activation="sigmoid")(features)

  model = Model(inputs=token_ids, outputs=probabilities)
  model.compile(
      loss='binary_crossentropy',
      optimizer=optimizers.Adam(clipvalue=1.),
      metrics=['accuracy'],
  )
  return model

In [None]:
model = get_model()

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()
plot_model(model, show_shapes=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for validation; the fixed seed keeps the split repeatable.
x_tra, x_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)

In [None]:
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from sklearn.metrics import roc_auc_score

class RocCallback(Callback):
  """Keras callback that scores ROC-AUC on held-out data and checkpoints the best model.

  After every epoch the model predicts on the validation inputs, the score from
  sklearn's ``roc_auc_score`` (default arguments) is printed, and the full model
  is saved to ``filepath`` whenever the score exceeds the best seen so far.

  Args:
    validation_data: ``(x_val, y_val)`` pair used for scoring.
    filepath: where the best model is written; defaults to ``'best_model.h5'``.
  """

  def __init__(self, validation_data, filepath='best_model.h5'):
    # Let the Keras base class set up its own attributes before adding ours.
    super().__init__()
    self.x_val = validation_data[0]
    self.y_val = validation_data[1]
    self.max_score = 0
    self.filepath = filepath

  def on_epoch_end(self, epoch, logs=None):
    """Score the current weights and save the model if the score is a new best."""
    # `logs` defaults to None rather than a shared mutable dict; it is not used here.
    y_pred_val = self.model.predict(self.x_val)
    roc_val = roc_auc_score(self.y_val, y_pred_val)
    print(' - Roc-auc_val: %.6f \n' % roc_val)
    if roc_val > self.max_score:
      self.model.save(self.filepath)
      # `epoch` is 0-based, hence the +1 in the message.
      print('Saving model weights at Epoch: %d, Roc-auc_val: %.6f \n'  % (epoch+1, roc_val))
      self.max_score = roc_val
    return

roc = RocCallback(validation_data=(x_val, y_val))

In [None]:
# from sklearn.model_selection import KFold
# import tensorflow.keras.backend as K

# num_folds = 5
# batch_size = 128
# epochs = 2

# predict = np.zeros((test.shape[0],6))
# oof_predict = np.zeros((train.shape[0],6))
# scores = []

# kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)

# for train_index, val_index in kf.split(x_train):
#   kf_y_train, kf_y_val = y_train[train_index], y_train[val_index]
#   kf_x_train, kf_x_val = x_train[train_index], x_train[val_index]
  
#   K.clear_session()

#   model = get_model()
#   ra_val = RocCallback(validation_data=(kf_x_val, kf_y_val))
#   model.fit(kf_x_train, kf_y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[ra_val])

#   model.load_weights('best_model.h5')
#   predict += model.predict(x_test, batch_size=batch_size, verbose=1) / num_folds

#   oof_predict[val_index] = model.predict(kf_x_val, batch_size=batch_size, verbose=1)
#   cv_score = roc_auc_score(kf_y_val, oof_predict[val_index])
#   scores.append(cv_score)
#   print('score: ', cv_score)

# print('Done')
# print('Total CV score is %.6f' % np.mean(scores))

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 2

# Train on the 95% split; `roc` scores and checkpoints the model after every epoch.
hist = model.fit(
    x_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=[roc],
    verbose=1,
)

In [None]:
# Restore the checkpoint written by the ROC callback and score the test set with it.
model = load_model('best_model.h5')
# model.save_weights('best_model_weights.h5')
y_pred = model.predict(
    x_test,
    batch_size=128,
)

In [None]:
# Submission layout: the id column followed by one probability column per label.
submid = test[['id']].copy()
label_probabilities = pd.DataFrame(y_pred, columns=toxicity_columns)

submission = pd.concat([submid, label_probabilities], axis=1)
submission.to_csv('submission.csv', index=False)