In [1]:
# Toxic Comments - preprocess+embed+gru+conv1d

import warnings
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd
import os

from nltk.corpus import stopwords
import gc
from keras import backend as K
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

from unidecode import unidecode
import time
eng_stopwords = set(stopwords.words("english"))

data_paths = {}
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        data_paths[filename] = os.path.join(dirname, filename)
        print(os.path.join(dirname, filename))

Using TensorFlow backend.


/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec
/kaggle/input/glove-twitter-27b-200d-txt/glove.twitter.27B.200d.txt
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv
/kaggle/input/toxic-data-preprocessing/__output__.json
/kaggle/input/toxic-data-preprocessing/test_preprocessed.csv
/kaggle/input/toxic-data-preprocessing/train_preprocessed.csv


In [2]:
train_df = pd.read_csv(data_paths['train.csv'])
test_df = pd.read_csv(data_paths['test.csv'])
sub_df = pd.read_csv(data_paths['sample_submission.csv'])
print('Train shape:', train_df.shape)
print('Columns in Train:', train_df.columns)

Train shape: (159571, 8)
Columns in Train: Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


In [3]:
drop_col = ['id', 'is_clean']
text_col = ['comment_text']
num_col = ['total_len', 'sent_count','word_count', 'capitals', 'punct_count', 'smilies_count',
           'unique_word_count', 'unique_word_percent']
label_col = [col for col in train_df.columns if col not in text_col + drop_col]
label_col

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
def add_features(df):
    
    df['total_length'] = df['comment_text'].apply(len)
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df.comment_text.str.count('\S+')
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  

    return df


train_df = add_features(train_df)
test_df = add_features(test_df)

train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [5]:
import re
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)
def clean_text(x):
    x_ascii = unidecode(x)
    x_clean = special_character_removal.sub('',x_ascii)
    return x_clean

train_df['clean_text'] = pd.read_csv(data_paths['train_preprocessed.csv'])['comment_text']
test_df['clean_text'] = pd.read_csv(data_paths['test_preprocessed.csv'])['comment_text']

train_df = train_df.fillna(' ')
test_df = test_df.fillna(' ')

In [6]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

train_val_counts = train_df[['caps_vs_length', 'words_vs_unique']].fillna(0)
test_counts = test_df[['caps_vs_length', 'words_vs_unique']].fillna(0)

ss.fit(train_val_counts)
train_val_counts = ss.transform(train_val_counts)
test_counts = ss.transform(test_counts)

print(train_val_counts.shape, test_counts.shape)

(159571, 2) (153164, 2)


In [7]:
from keras.preprocessing import text, sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy

X_train_val = train_df[text_col].values.ravel()
y_train_val = train_df[label_col].values
X_test = test_df[text_col].values.ravel()


# intialize param
max_features = 90000
maxlen = 250

# build vocab
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_val)

# sequences
X_train_val = tokenizer.texts_to_sequences(X_train_val)
X_test = tokenizer.texts_to_sequences(X_test)

# padded sequences
X_train_val = sequence.pad_sequences(X_train_val, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

print(X_train_val.shape, y_train_val.shape, X_test.shape)

(159571, 250) (159571, 6) (153164, 250)


In [9]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# FASTTEXT

FASTTEXT_FILE = data_paths['crawl-300d-2M.vec']
fasttext_size = 300

EMBEDDING_FILE = open(FASTTEXT_FILE)

fasttext_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in EMBEDDING_FILE)
EMBEDDING_FILE.close()
print('completed loading fasttext vector file')

# intialize embedding matrix
fasttext_matrix = np.zeros((nb_words, fasttext_size))
# 
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = fasttext_index.get(word)
    if embedding_vector is not None: fasttext_matrix[i] = embedding_vector
        
print('completed loading fasttext embeddings')
        
del fasttext_index
gc.collect()

completed loading fasttext vector file
completed loading fasttext embeddings


0

- references:
    - https://www.kaggle.com/fizzbuzz/toxic-data-preprocessing
    - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52581#latest-302637
    - https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52647#latest-300704
- trained embeddings + GRU(16) = 0.9753, 0.9752
- pretrained fasttext embeddings + GRU(128) + Conv1d(64) = 0.9843, 0.9840
- improved preprocessing + pretrained fasttext embeddings + GRU(128) + Conv1d(64) = 0.9849, 0.9851

In [12]:
from keras.models import Model
from keras import optimizers, callbacks, regularizers
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Conv1D, Layer
from keras.layers import GRU,LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNGRU, CuDNNLSTM
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=1)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("### New High Score (previous: %.6f) ###\n" % self.max_score)
                model.save_weights("best_weights.h5")
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > 3:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True

In [14]:
def get_model(train_counts):
        inp = Input(shape=(maxlen, ))
        count_input = Input(shape=(train_counts.shape[1],))
        x = Embedding(max_features, fasttext_size, weights=[fasttext_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.5)(x)
        x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x) #80
        x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool, count_input])
        out = Dense(6, activation="sigmoid")(conc)
    
        model = Model(inputs=[inp,count_input], outputs=out)
        model.compile(loss='binary_crossentropy',
                      optimizer=optimizers.Adam(lr=0.001),
                      metrics=['accuracy'])
        return model

# model = get_model()

In [15]:
num_folds = 10 # folds

y_test_predict = np.zeros((test_df.shape[0],6))

reduce_lr = callbacks.ReduceLROnPlateau(patience=1, factor = 0.5, monitor='val_loss', verbose=1)

kf = KFold(n_splits = num_folds, shuffle = True, random_state = 2019)

for train_index, val_index in kf.split(X_train_val):

    kfold_y_train, kfold_y_val = y_train_val[train_index], y_train_val[val_index]
    kfold_X_train, kfold_X_train_counts = X_train_val[train_index], train_val_counts[train_index]
    kfold_X_valid, kfold_X_val_counts = X_train_val[val_index], train_val_counts[val_index]
    
    gc.collect()
    K.clear_session()
    
    model = get_model(kfold_X_train_counts)
    
    ra_val = RocAucEvaluation(validation_data=([kfold_X_valid,kfold_X_val_counts], kfold_y_val), interval = 1)
    
    model.fit([kfold_X_train,kfold_X_train_counts], kfold_y_train, 
              batch_size = 32, epochs = 10, verbose=1, callbacks = [ra_val, reduce_lr])
    gc.collect()
    
    model.load_weights("best_weights.h5")
    
    y_test_predict += model.predict([X_test,test_counts], batch_size = 256, verbose=1) / num_folds

Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.980418 

### New High Score (previous: 0.000000) ###

Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.983300 

### New High Score (previous: 0.980418) ###

Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.985729 

### New High Score (previous: 0.983300) ###

Epoch 4/10

 ROC-AUC - epoch: 5 - score: 0.984280 

Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.986346 

### New High Score (previous: 0.985729) ###

Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.985107 

Epoch 8/10

 ROC-AUC - epoch: 8 - score: 0.982835 

Epoch 9/10

 ROC-AUC - epoch: 3 - score: 0.990295 

### New High Score (previous: 0.989509) ###

Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.989892 

Epoch 5/10

 ROC-AUC - epoch: 3 - score: 0.985643 

Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.987211 

### New High Score (previous: 0.986210) ###

Epoch 5/10

In [16]:
sub_df.iloc[:,1:] = y_test_predict
display(sub_df.head())
from IPython.display import FileLink
sub_name = 'bi-gru-conv1d-kfold-preprocessed.csv'
sub_df.to_csv(sub_name, index = None)
FileLink(sub_name)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.99662,0.400527,0.987518,0.155343,0.967755,0.534644
1,0000247867823ef7,0.001511,3.1e-05,0.000238,1.6e-05,0.000252,3.9e-05
2,00013b17ad220c46,0.00076,6.1e-05,0.000221,2.7e-05,0.000273,6.2e-05
3,00017563c3f7919a,0.000261,1.2e-05,0.000119,2.2e-05,0.000188,1e-05
4,00017695ad8997eb,0.005477,0.000129,0.000899,0.000119,0.000673,7.4e-05


In [18]:
# import matplotlib.pyplot as plt
# # plot training & validation results
# df = pd.DataFrame()
# df['train_loss'] = hist.history['loss']
# df['val_loss'] = hist.history['val_loss']
# df.index = np.arange(1,len(df)+1,1)
    
# # draw Loss
# df[['train_loss', 'val_loss']].plot()
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.show()

In [20]:
#

In [21]:
# import pandas as pd
# import sys
# from scipy.stats import ks_2samp

# first_file = sys.argv[1]
# second_file = sys.argv[2]

# def corr(first_file, second_file):
#     # assuming first column is `class_name_id`
#     first_df = pd.read_csv(first_file, index_col=0)
#     second_df = pd.read_csv(second_file, index_col=0)
#     class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

#     for class_name in class_names:
#         # all correlations
#         print('\n Class: %s' % class_name)
#         print(' Pearson\'s correlation score: %0.6f' %
#               first_df[class_name].corr(
#                   second_df[class_name], method='pearson'))
#         print(' Kendall\'s correlation score: %0.6f' %
#               first_df[class_name].corr(
#                   second_df[class_name], method='kendall'))
#         print(' Spearman\'s correlation score: %0.6f' %
#               first_df[class_name].corr(
#                   second_df[class_name], method='spearman'))
#         ks_stat, p_value = ks_2samp(first_df[class_name].values,
#                                     second_df[class_name].values)
#         print(' Kolmogorov-Smirnov test:    KS-stat = %.6f    p-value = %.3e\n'
#               % (ks_stat, p_value))

# corr(first_file, second_file)