In [1]:
# Toxic Comments - preprocess+embed+Lstm+Gru

import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd
import os

from nltk.corpus import stopwords
import gc
from keras import backend as K
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

from unidecode import unidecode
import time
# English stop-word list from NLTK, held as a set.
# NOTE(review): no later cell visible in this notebook references eng_stopwords -
# confirm whether it is still needed.
eng_stopwords = set(stopwords.words("english"))

# Index every file below the Kaggle input directory by its bare file name.
# If two directories contain the same file name, the one visited last wins.
data_paths = {}
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        data_paths[file_name] = full_path
        print(full_path)

Using TensorFlow backend.


/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec
/kaggle/input/glove-twitter-27b-200d-txt/glove.twitter.27B.200d.txt
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv


In [2]:
# Read the three competition tables through the file-name -> path lookup.
train_df, test_df, sub_df = (
    pd.read_csv(data_paths[csv_name])
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print('Train shape:', train_df.shape)
print('Columns in Train:', train_df.columns)

Train shape: (159571, 8)
Columns in Train: Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


In [3]:
drop_col = ['id', 'is_clean']
text_col = ['comment_text']
num_col = ['total_len', 'sent_count','word_count', 'capitals', 'punct_count', 'smilies_count',
           'unique_word_count', 'unique_word_percent']
# Columns that the cleaning / feature cells of this notebook add to train_df later on.
# They are excluded explicitly so that re-running this cell after those cells
# does not silently turn them into label columns.
engineered_col = ['clean_text', 'total_length', 'capitals', 'caps_vs_length',
                  'num_words', 'num_unique_words', 'words_vs_unique']
non_label_col = set(text_col + drop_col + num_col + engineered_col)
# Whatever remains in train_df is treated as a target label.
label_col = [col for col in train_df.columns if col not in non_label_col]
label_col

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
import re
# Matches every character that is NOT a letter, a space or one of . - ? ! , # @ %
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)
def clean_text(x):
    """Transliterate ``x`` with ``unidecode``, then delete every character the pattern matches."""
    return special_character_removal.sub('', unidecode(x))

# Missing comments are replaced by a single space *before* cleaning; converting NaN
# with str() would otherwise turn it into the literal text "nan".
# NOTE(review): the tokenizer cell reads `comment_text` (through text_col), not
# `clean_text` - confirm which column is meant to feed the model.
train_df['clean_text'] = train_df['comment_text'].fillna(' ').astype(str).apply(clean_text)
test_df['clean_text'] = test_df['comment_text'].fillna(' ').astype(str).apply(clean_text)

In [5]:
# Replace every NaN in both frames with a single space.
train_df, test_df = (frame.fillna(' ') for frame in (train_df, test_df))

In [6]:
def add_features(df):
    """Add length, capitalisation and vocabulary statistics of ``comment_text`` to ``df``.

    The new columns are written onto ``df`` itself (the frame is modified in
    place) and the same frame is returned.

    Columns added:
        total_length      number of characters in the comment
        capitals          number of upper-case characters
        caps_vs_length    capitals / total_length
        num_words         number of runs of non-whitespace characters
        num_unique_words  number of distinct whitespace-separated tokens
        words_vs_unique   num_unique_words / num_words

    Both ratios are NaN when the denominator is zero (empty or whitespace-only
    comment); callers are expected to fill those values as they see fit.

    Args:
        df: DataFrame with a string column named ``comment_text``.

    Returns:
        ``df`` with the six columns above added.
    """
    comments = df['comment_text']

    df['total_length'] = comments.apply(len)
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column-wise division: an empty comment yields NaN (0 / 0) instead of the
    # ZeroDivisionError a per-row float division would raise.
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    # Raw string so that \S reaches the regex engine without an invalid-escape warning.
    df['num_words'] = comments.str.count(r'\S+')
    df['num_unique_words'] = comments.apply(lambda comment: len(set(comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    return df


# add_features writes the new columns onto the frame it receives and returns that frame.
train_df, test_df = add_features(train_df), add_features(test_df)

In [7]:
from sklearn.preprocessing import StandardScaler
# The two ratio features fed to the model next to the token sequences.
ratio_cols = ['caps_vs_length', 'words_vs_unique']

# Undefined ratios (NaN) are set to 0 first; the scaler statistics are learned
# from the training frame only and then applied to both frames.
ss = StandardScaler()
train_val_counts = ss.fit_transform(train_df[ratio_cols].fillna(0))
test_counts = ss.transform(test_df[ratio_cols].fillna(0))

print(train_val_counts.shape, test_counts.shape)

(159571, 2) (153164, 2)


In [8]:
from keras.preprocessing import text, sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy

# initialize params
max_features = 200000  # value handed to the Tokenizer as num_words
maxlen = 520           # length every sequence is padded / truncated to

# raw inputs and targets
train_texts = train_df[text_col].values.ravel()
test_texts = test_df[text_col].values.ravel()
y_train_val = train_df[label_col].values

# build vocab (fitted on the training texts only)
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)

# texts -> integer sequences -> fixed-length padded arrays
X_train_val = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)

print(X_train_val.shape, y_train_val.shape, X_test.shape)

(159571, 520) (159571, 6) (153164, 520)


In [10]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 NumPy array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector

word_index = tokenizer.word_index
# Keras Tokenizer indices start at 1 (row 0 stays the all-zero padding vector), so a
# vocabulary of N words needs N + 1 rows. Without the +1 a vocabulary smaller than
# max_features would leave no row for its highest index.
nb_words = min(max_features, len(word_index) + 1)

# FASTTEXT

FASTTEXT_FILE = data_paths['crawl-300d-2M.vec']
fasttext_size = 300

# Only vectors of words that occur in our vocabulary are kept, so the whole
# embedding file never has to sit in memory at once.
fasttext_index = {}
with open(FASTTEXT_FILE, encoding='utf-8') as EMBEDDING_FILE:
    for line in EMBEDDING_FILE:
        # Split from the right so exactly `fasttext_size` numbers are peeled off,
        # even if the token itself contains a space.
        fields = line.rstrip().rsplit(' ', fasttext_size)
        # Drops lines without a full vector, e.g. the "<count> <dim>" header
        # line that fastText .vec files start with.
        if len(fields) != fasttext_size + 1: continue
        if fields[0] not in word_index: continue
        word, vector = get_coefs(*fields)
        fasttext_index[word] = vector
print('completed loading fasttext vector file')

# initialize embedding matrix; words without a pretrained vector keep a zero row
fasttext_matrix = np.zeros((nb_words, fasttext_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = fasttext_index.get(word)
    if embedding_vector is not None: fasttext_matrix[i] = embedding_vector

print('completed loading fasttext embeddings')

del fasttext_index
gc.collect()

completed loading fasttext vector file
completed loading fasttext embeddings


0

In [11]:
GLOVE_FILE = data_paths['glove.twitter.27B.200d.txt']
glove_size = 200

# Only vectors of words that occur in our vocabulary are kept, so the whole
# embedding file never has to sit in memory at once.
glove_index = {}
with open(GLOVE_FILE, encoding='utf-8') as EMBEDDING_FILE:
    for line in EMBEDDING_FILE:
        # Split from the right so exactly `glove_size` numbers are peeled off,
        # even if the token itself contains a space.
        fields = line.rstrip().rsplit(' ', glove_size)
        # Drops any line that does not carry a full `glove_size`-dimensional vector.
        if len(fields) != glove_size + 1: continue
        if fields[0] not in word_index: continue
        word, vector = get_coefs(*fields)
        glove_index[word] = vector
print('completed loading glove vector file')

# initialize embedding matrix; words without a pretrained vector keep a zero row
glove_matrix = np.zeros((nb_words, glove_size))
for word, i in word_index.items():
    # Guard against the matrix height itself, so no index can fall outside it.
    if i >= nb_words: continue
    embedding_vector = glove_index.get(word)
    if embedding_vector is not None: glove_matrix[i] = embedding_vector

print('completed loading glove embeddings')

del glove_index
gc.collect()

completed loading glove vector file
completed loading glove embeddings


0

In [12]:
# Each vocabulary row becomes [fastText vector | GloVe vector].
embed_size = fasttext_size + glove_size # i.e. 300+200
embedding_matrix = np.hstack((fasttext_matrix, glove_matrix))
# The per-source matrices are no longer needed once stacked.
del fasttext_matrix, glove_matrix
gc.collect()

0

- references:
    - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52644#latest-319962
    - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52564#latest-610859

In [13]:
from keras.models import Model
from keras import optimizers, callbacks, regularizers
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Conv1D, Layer
from keras.layers import GRU,LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNGRU, CuDNNLSTM
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score

class RocAucEvaluation(Callback):
    """Keras callback that tracks ROC-AUC on held-out data and checkpoints the best weights.

    At the end of every ``interval``-th epoch the model being trained predicts on
    ``X_val`` and the predictions are scored with ``roc_auc_score``.  When the
    score beats the best one seen so far, the weights are saved to
    ``weights_path``; otherwise a counter is increased and training is stopped
    once it exceeds ``patience``.

    Args:
        validation_data: ``(X_val, y_val)`` pair; it must unpack into exactly
            two items.
        interval: evaluate only on epochs whose zero-based index is a multiple
            of this value.
        weights_path: file the best weights are written to.
        patience: number of non-improving evaluations tolerated before
            training is stopped.
    """

    def __init__(self, validation_data=(), interval=1, weights_path="best_weights.h5", patience=3):
        # Initialise the Callback base class itself; super(Callback, self)
        # would start the lookup *after* Callback and skip its __init__.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.weights_path = weights_path
        self.patience = patience
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the held-out data and update checkpoint / stop state."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("*** New High Score (previous: %.6f) \n" % self.max_score)
                # Save the model this callback is attached to rather than
                # whatever a global named `model` happens to point at.
                self.model.save_weights(self.weights_path)
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > self.patience:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True

In [15]:
def get_model(train_counts): # fasttext_size, fasttext_matrix
    """Build and compile the two-input BiLSTM -> BiGRU classifier.

    Inputs of the returned model, in order:
        1. padded token-id sequences of length ``maxlen``
        2. numeric side features with ``train_counts.shape[1]`` columns

    The token ids go through a frozen embedding initialised from
    ``embedding_matrix``, spatial dropout, a bidirectional CuDNN LSTM and a
    bidirectional CuDNN GRU.  Average- and max-pooled GRU outputs, one GRU
    state and the side features are concatenated and fed to a 6-unit sigmoid
    layer.

    Args:
        train_counts: 2-D array of side features; only its column count is used.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer).
    """
    # Take both embedding dimensions from the weight matrix itself so the layer
    # always matches the weights it is initialised with.
    vocab_rows, vector_dim = embedding_matrix.shape

    count_input = Input(shape=(train_counts.shape[1],))
    inp = Input(shape=(maxlen,))
    x = Embedding(vocab_rows, vector_dim, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.5)(x)
    x = Bidirectional(CuDNNLSTM(40, return_sequences=True))(x)
    # NOTE(review): with return_state=True a Bidirectional GRU is expected to return
    # (outputs, forward state, backward state); a GRU has no cell state. Only the
    # forward state is passed on to the classifier - confirm that the backward
    # state is left out on purpose.
    x, fwd_h, bwd_h = Bidirectional(CuDNNGRU(40, return_sequences=True, return_state = True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, fwd_h, max_pool,count_input])
    out = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=[inp,count_input], outputs=out)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.Adam(lr=0.001, clipvalue = 1.0),
                  metrics=['accuracy'])
    return model

# model = get_model()

In [16]:
num_folds = 10 # folds

# Running average of the per-fold test predictions (one column per label).
y_test_predict = np.zeros((test_df.shape[0],6))

# Both callbacks monitor `val_loss`, which only exists when fit() is given validation data.
early_stop = callbacks.EarlyStopping(patience=3, monitor='val_loss', verbose=1)
reduce_lr = callbacks.ReduceLROnPlateau(patience=1, factor=0.5, monitor='val_loss', verbose=1)

kf = KFold(n_splits = num_folds, shuffle = True, random_state = 2019)

for train_index, val_index in kf.split(X_train_val):

    kfold_y_train, kfold_y_val = y_train_val[train_index], y_train_val[val_index]
    kfold_X_train, kfold_X_train_counts = X_train_val[train_index], train_val_counts[train_index]
    kfold_X_valid, kfold_X_val_counts = X_train_val[val_index], train_val_counts[val_index]
    kfold_val_inputs = [kfold_X_valid, kfold_X_val_counts]

    # Release the previous fold's graph before building a fresh model.
    gc.collect()
    K.clear_session()

    model = get_model(kfold_X_train_counts)

    ra_val = RocAucEvaluation(validation_data=(kfold_val_inputs, kfold_y_val), interval = 1)

    # validation_data is passed so that Keras computes `val_loss` every epoch;
    # without it early_stop and reduce_lr have nothing to monitor and never act.
    model.fit([kfold_X_train,kfold_X_train_counts], kfold_y_train,
              batch_size = 64, epochs = 10, verbose=1,
              validation_data = (kfold_val_inputs, kfold_y_val),
              callbacks = [ra_val, early_stop, reduce_lr])
    gc.collect()

    # Restore the checkpoint with the best validation ROC-AUC of this fold.
    model.load_weights("best_weights.h5")

    y_test_predict += model.predict([X_test,test_counts], batch_size = 256, verbose=1) / num_folds

Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.979051 

*** New High Score (previous: 0.000000) 

Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.983700 

*** New High Score (previous: 0.979051) 

Epoch 3/10

 ROC-AUC - epoch: 4 - score: 0.987255 

*** New High Score (previous: 0.985944) 

Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.987226 

Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.988158 

*** New High Score (previous: 0.987255) 

Epoch 7/10

 ROC-AUC - epoch: 9 - score: 0.988256 

Epoch 10/10

 ROC-AUC - epoch: 9 - score: 0.991497 

Epoch 10/10

 ROC-AUC - epoch: 3 - score: 0.991490 

*** New High Score (previous: 0.990491) 

Epoch 4/10

 ROC-AUC - epoch: 6 - score: 0.991680 

Epoch 7/10

 ROC-AUC - epoch: 4 - score: 0.991665 

*** New High Score (previous: 0.991583) 

Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.990330 

*** New High Score (previous: 0.989814) 

Epoch 6/10

 ROC-AUC - epoch: 2 - score: 0.988961 

*** New High Score (previous: 0.984419) 

Epoch 3/10

In [17]:
from IPython.display import FileLink

sub_name = 'bi-lstm-gru-kfold-sub.csv'

# Every column after the first one receives the fold-averaged test predictions.
sub_df.iloc[:, 1:] = y_test_predict
display(sub_df.head())

# Write the submission file and show a download link as the cell output.
sub_df.to_csv(sub_name, index = None)
FileLink(sub_name)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996907,0.461211,0.981336,0.142995,0.955312,0.485726
1,0000247867823ef7,0.000383,7e-06,4.6e-05,3e-06,3.8e-05,7e-06
2,00013b17ad220c46,0.000113,8e-06,4.2e-05,2e-06,3e-05,9e-06
3,00017563c3f7919a,8.2e-05,4e-06,1.8e-05,8e-06,1.8e-05,3e-06
4,00017695ad8997eb,0.001362,3.4e-05,0.000216,3.8e-05,0.00013,2.1e-05


In [19]:
# import matplotlib.pyplot as plt
# # plot training & validation results
# df = pd.DataFrame()
# df['train_loss'] = hist.history['loss']
# df['val_loss'] = hist.history['val_loss']
# df.index = np.arange(1,len(df)+1,1)
    
# # draw Loss
# df[['train_loss', 'val_loss']].plot()
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.show()