In [1]:
import os

# Expose only device 0 to CUDA; this runs before Keras is imported further down.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [2]:
import pandas as pd

# Labelled training comments.
train = pd.read_csv(filepath_or_buffer="train.csv")

# Prototyping shortcut: 157975 original total, so models can be compared on a fraction of that.
# train = train.sample(frac=0.1)

# Share of the training data held out for validation;
# change to 1% for training on complete training set.
validation_fraction = 0.1

In [3]:
# Unlabelled test comments and the submission template that predictions are written into later.
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Series.fillna returns a new Series rather than modifying its receiver, so the
# result has to be used: a bare `train["comment_text"].fillna("fillna")` leaves
# the NaN entries in place, `.str.lower()` keeps them as NaN, and they are then
# handed to the tokenizer alongside the real text.
X_train = train["comment_text"].fillna("fillna").str.lower()
# One 0/1 column per toxicity label, as a plain array.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test["comment_text"].fillna("fillna").str.lower()

In [4]:
%%time

from keras.preprocessing import text, sequence

# Vocabulary cap passed to the tokenizer as num_words; 30000 was tried before switching to None.
max_features = None
# Width of every word vector; should match embedding file.
embed_size = 300

# Fit a single vocabulary on train and test comments together, then encode each set with it.
all_comments = [*X_train, *X_test]
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(all_comments)
X_train_tokenized, X_test_tokenized = (
    tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test)
)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


CPU times: user 23.8 s, sys: 204 ms, total: 24 s
Wall time: 24 s


In [5]:
# Length every tokenized comment is padded/truncated to.
# Earlier notes: 150, 128 and 100 were tried; "100, 200, 256 worsened validation AUC score".
max_sequence_length = 300

X_train_padded, X_test_padded = (
    sequence.pad_sequences(tokenized, maxlen=max_sequence_length)
    for tokenized in (X_train_tokenized, X_test_tokenized)
)

In [6]:
import numpy as np

# Word -> integer index mapping learned by the tokenizer (len(word_index) == 394787).
word_index = tokenizer.word_index

# Vocabulary size used to dimension the embedding matrices: capped by
# max_features when that is set, otherwise the whole vocabulary.
if max_features:
    word_count = min(max_features, len(word_index))
else:
    word_count = len(word_index)
# In case you don't want to use pre-trained embeddings:
# embedding_matrix = np.random.uniform(-1.0, 1.0, (word_count + 1, embed_size))

In [7]:
%%time

import numpy as np
import bcolz


def process_fasttext_line(word, *arr):
    """Convert the split fields of one embeddings line into a ``(word, vector)`` pair.

    Args:
        word: first field of the line, returned unchanged.
        *arr: remaining fields, converted to a float32 NumPy array.

    Returns:
        Tuple ``(word, vector)``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

def load_embeddings(matrix, embeddings_index, word_index):
    """Copy known word vectors into ``matrix`` in place, one row per word index.

    Args:
        matrix: 2-D array that is written to; row ``i`` receives the vector of
            the word whose index is ``i``.
        embeddings_index: mapping from word to vector.
        word_index: mapping from word to row number.

    Returns:
        The same ``matrix`` object. Rows of words missing from
        ``embeddings_index`` are left untouched.

    Note:
        Reads the module-level ``max_features``; when it is truthy, words whose
        index is ``>= max_features`` are skipped.
    """
    for token, row in word_index.items():
        within_vocabulary = not (max_features and row >= max_features)
        if within_vocabulary:
            vector = embeddings_index.get(token)
            if vector is not None:
                matrix[row] = vector
    return matrix

def _read_embeddings_index(embeddings_filename, line_processing_function, embedding_dim):
    """Parse a whitespace-separated embeddings text file into a ``{word: vector}`` dict.

    Lines that are blank, that the line processor rejects (returns ``None``),
    or whose vector does not have ``embedding_dim`` entries are skipped.
    """
    embeddings_index = {}
    # The context manager closes the file even when a line fails to parse;
    # the encoding is pinned so the result does not depend on the platform default.
    with open(embeddings_filename, encoding="utf-8") as embeddings_file:
        for line in embeddings_file:
            fields = line.split()
            if not fields:
                continue
            parsed = line_processing_function(*fields)
            if parsed is None:
                # A processor may report an unparsable line by returning nothing.
                continue
            word, vector = parsed
            if len(vector) != embedding_dim:
                # E.g. a "<count> <dim>" header line: its one-element "vector"
                # would otherwise be broadcast across a whole matrix row.
                continue
            embeddings_index[word] = vector
    return embeddings_index


def build_embedding_matrix(matrix, bcolz_rootdir, embeddings_filename, line_processing_function, word_index):
    """Return the embedding matrix for ``word_index``, using an on-disk bcolz cache.

    If ``bcolz_rootdir`` can be opened, the cached array is returned as is and
    the other arguments are ignored - so a cache built for a different
    vocabulary is NOT refreshed. Otherwise the embeddings file is parsed,
    ``matrix`` is filled through ``load_embeddings`` and the result is stored
    under ``bcolz_rootdir``.

    Args:
        matrix: 2-D array to fill on a cache miss; its second dimension is the
            expected vector length.
        bcolz_rootdir: directory of the bcolz cache.
        embeddings_filename: text file with one word and its vector per line.
        line_processing_function: called with the split fields of a line,
            returns ``(word, vector)`` or ``None`` to skip the line.
        word_index: mapping from word to matrix row.

    Returns:
        The bcolz array, either opened from the cache or freshly written.
    """
    try:
        matrix = bcolz.open(rootdir=bcolz_rootdir)
    except FileNotFoundError:
        embeddings_index = _read_embeddings_index(
            embeddings_filename, line_processing_function, matrix.shape[1])
        matrix = load_embeddings(matrix, embeddings_index, word_index)
        matrix = bcolz.carray(matrix, rootdir=bcolz_rootdir)
        matrix.flush()
    return matrix


# fastText matrix: starts as zeros with one row per word index (plus row 0),
# then is filled from the .vec file or replaced by the bcolz cache.
fasttext_embedding_matrix = np.zeros(shape=(word_count + 1, embed_size))
fasttext_embedding_matrix = build_embedding_matrix(
    matrix=fasttext_embedding_matrix,
    bcolz_rootdir="crawl-300d-2M.bcolz",
    embeddings_filename="crawl-300d-2M.vec",
    line_processing_function=process_fasttext_line,
    word_index=word_index,
)


def process_numberbatch_line(key, *arr):
    """Convert the split fields of one Numberbatch line into a ``(word, vector)`` pair.

    Args:
        key: first field of the line; only the part after the last ``/`` is
            kept, with underscores turned into spaces.
        *arr: remaining fields, converted to a float32 NumPy array.

    Returns:
        Tuple ``(word, vector)``.
    """
    last_segment = key.rpartition('/')[2]
    vector = np.asarray(arr, dtype=np.float32)
    return last_segment.replace('_', ' '), vector


# Numberbatch matrix: starts as zeros with one row per word index (plus row 0),
# then is filled from the text file or replaced by the bcolz cache.
numberbatch_embedding_matrix = np.zeros(shape=(word_count + 1, embed_size))
numberbatch_embedding_matrix = build_embedding_matrix(
    matrix=numberbatch_embedding_matrix,
    bcolz_rootdir="numberbatch-17.06.bcolz",
    embeddings_filename="numberbatch-17.06.txt",
    line_processing_function=process_numberbatch_line,
    word_index=word_index,
)

def process_glove_line(*arr):
    """Convert the split fields of one GloVe line into a ``(word, vector)`` pair.

    Normally the first field is the word and the rest are coefficients. Two
    kinds of lines whose word spans several fields are handled specially:

    * the second field is ``.``: the word becomes a run of space-separated
      dots (two to five of them, decided by probing fields 4, 3, 2 in that
      order) and the coefficients start after the run;
    * the second field is one of a few known domain-like tokens: the word is
      the first two fields joined by a space.

    Returns:
        ``(word, float32 array)``; if the coefficients cannot be converted the
        fields are printed and ``None`` is returned.
    """
    first, second = arr[0], arr[1]
    dotted_words = ((4, ". . . . ."), (3, ". . . ."), (2, ". . ."))
    second_field_tokens = ("name@domain.com", "Killerseats.com", "mylot.com", "Amazon.com")

    if second == '.':
        # Probe from the longest run of dots down to the shortest.
        for probe, dotted in dotted_words:
            if arr[probe] == '.':
                word, coefficients = dotted, arr[probe + 1:]
                break
        else:
            word, coefficients = ". .", arr[2:]
    elif second in second_field_tokens:
        word, coefficients = first + ' ' + second, arr[2:]
    else:
        word, coefficients = first, arr[1:]

    try:
        return word, np.asarray(coefficients, dtype='float32')
    except ValueError:
        print(arr)
        return None

# GloVe matrix: starts as zeros with one row per word index (plus row 0),
# then is filled from the text file or replaced by the bcolz cache.
glove_embedding_matrix = np.zeros(shape=(word_count + 1, embed_size))
glove_embedding_matrix = build_embedding_matrix(
    matrix=glove_embedding_matrix,
    bcolz_rootdir="glove.840B.300d.bcolz",
    embeddings_filename="glove.840B.300d.txt",
    line_processing_function=process_glove_line,
    word_index=word_index,
)

CPU times: user 359 ms, sys: 12.2 ms, total: 372 ms
Wall time: 391 ms


In [8]:
from keras.models import Model, Sequential
from keras.layers import Activation, BatchNormalization, Bidirectional, concatenate, Conv1D, CuDNNGRU, Dense, Dropout, Embedding, Flatten, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, K, SpatialDropout1D
from keras.optimizers import Adam, Nadam
# from keras.regularizers import l2
# from keras.constraints import maxnorm

# from qrnn import QRNN
# from attention import AttentionWithContext
from capsnet import Capsule

def build_model(max_sequence_length, word_count, embed_size, embeddings, spatial_dropout=0.28, dropout=0.25,
                rnn_size=128, label_count=6):
    """Build and compile the multi-branch classifier.

    Every embedding branch applies, in order: a frozen ``Embedding`` layer
    initialised from the matching matrix, ``SpatialDropout1D``, a bidirectional
    ``GRU`` returning sequences, and a ``Capsule`` layer (from the local
    ``capsnet`` module). The branch outputs are concatenated, flattened, passed
    through dropout and a sigmoid ``Dense`` layer with one unit per label.

    Args:
        max_sequence_length: length of the padded input sequences.
        word_count: vocabulary size; the embedding layers have
            ``word_count + 1`` rows.
        embed_size: width of the word vectors.
        embeddings: mapping with at least the keys ``"fasttext"`` and
            ``"numberbatch"``, each a matrix with at least ``word_count + 1``
            rows and ``embed_size`` columns.
        spatial_dropout: rate of the dropout applied right after each embedding.
        dropout: rate used inside the GRUs and before the output layer.
        rnn_size: units per GRU direction (128 restored after trying
            ``max_sequence_length`` and 140).
        label_count: number of output labels.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, ``"adam"``
        optimizer, accuracy metric).
    """
    # Branches in use; a third "glove" branch was tried and is currently disabled.
    branch_names = ("fasttext", "numberbatch")
    # Rows of each Embedding layer. The initial weights are cut to exactly this
    # many rows: slicing further (word_count + 2) only matches by accident when
    # the matrix has no extra rows, and a larger cached matrix would be rejected
    # for its shape.
    embedding_rows = word_count + 1

    i = Input(shape=(max_sequence_length, ))

    # Layers are created stage by stage across the branches (all embeddings,
    # then all dropouts, ...) - the same creation order as when each branch was
    # written out by hand.
    branches = [
        Embedding(embedding_rows, embed_size, weights=[embeddings[name][:embedding_rows, ]], trainable=False)(i)
        for name in branch_names
    ]
    branches = [SpatialDropout1D(spatial_dropout)(branch) for branch in branches]
    # Alternatives tried at this stage: CuDNNGRU (a second bidirectional layer
    # didn't help with the training subsample), QRNN with window_size=7, and
    # GRUs with 0.1 / 0.3 dropout or 64 units.
    branches = [
        Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True,
                          dropout=dropout, recurrent_dropout=dropout))(branch)
        for branch in branches
    ]
    # An AttentionWithContext layer was tried here instead of the capsule layer.
    branches = [Capsule()(branch) for branch in branches]

    x = concatenate(branches)
    x = Flatten()(x)
    d = Dropout(dropout)(x)
    out = Dense(label_count, activation="sigmoid")(d)
    model = Model(inputs=i, outputs=out)
    optimizer = "adam"  # Nadam(lr=1e-3), 'nadam' and Nadam(lr=1e-5) were also tried
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# del model
# Pre-trained embedding matrices handed to build_model, keyed by branch name.
embeddings = dict(
    fasttext=fasttext_embedding_matrix,
    numberbatch=numberbatch_embedding_matrix,
    # glove=glove_embedding_matrix,
)
# model = build_model(max_sequence_length, word_count, embed_size, embeddings)
# del models
# models = [fasttext_model, numberbatch_model]

In [9]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_padded, y_train, test_size=validation_fraction)


class RocAucEvaluation(Callback):
    """Callback that scores the validation set with ROC AUC after every epoch.

    Whenever the score beats the best one seen so far, the model weights are
    saved to a file whose name carries the (1-based) epoch, the score and,
    when given, the cross-validation fold.

    Based on https://www.kaggle.com/demesgal/lstm-glove-lr-decrease-bn-cv-lb-0-047/comments
    """

    def __init__(self, validation_data=(), max_epoch=20, cross_validation_fold=None):
        """Store the validation data and reset the best-score tracking.

        Args:
            validation_data: ``(X_val, y_val)`` pair. The empty default cannot
                be unpacked, so a pair has to be supplied in practice.
            max_epoch: initial value of ``stopped_epoch``, kept until an epoch
                improves the score.
            cross_validation_fold: current fold number (integer), or ``None``
                when not cross-validating.
        """
        # Initialise the Callback base class itself. `super(Callback, self)`
        # starts the lookup *after* Callback and therefore skips its __init__.
        super(RocAucEvaluation, self).__init__()

        self.stopped_epoch = max_epoch  # 1-based epoch of the best score so far
        self.best = 0.0  # best validation AUC so far
        self.X_val, self.y_val = validation_data
        self.y_pred = np.zeros(self.y_val.shape)  # predictions of the best epoch
        self.cross_validation_fold = cross_validation_fold  # current fold number (integer)

    def on_epoch_end(self, epoch, logs=None):
        """Compute validation AUC, record it in ``logs`` and save improved weights.

        Args:
            epoch: 0-based epoch index passed by Keras.
            logs: metrics dict of the epoch; ``val_auc`` is added to it. A
                fresh dict is used when none is passed, so no state is shared
                between calls through a mutable default.
        """
        if logs is None:
            logs = {}

        y_pred = self.model.predict(self.X_val, verbose=1)
        current = roc_auc_score(self.y_val, y_pred)
        logs['val_auc'] = current

        if current > self.best:  # save model
            self.best = current
            self.y_pred = y_pred
            self.stopped_epoch = epoch + 1
            if self.cross_validation_fold is not None:
                filename = "weights.{fold:02d}-{epoch:02d}-{val_auc:.4f}.hdf5".format(
                    fold=self.cross_validation_fold, epoch=(epoch + 1), val_auc=current)
            else:
                filename = "weights.{epoch:02d}-{val_auc:.4f}.hdf5".format(epoch=(epoch + 1), val_auc=current)
            print("saving " + filename)
            self.model.save_weights(filename, overwrite=True)

        print("val_auc: {:.4f}".format(current))

In [10]:
# Batch sizes 32 and 128 were also tried; 1024 lowered AUC score even when tried continued
# training with bigger batch size after small batch size, as well as starting with big
# batch size and then continuing with smaller size.
batch_size = 256
epochs = 16

kf = KFold(n_splits=10)
# Per-fold training curves. Only the plain `history` dict is kept: the History
# object itself holds a reference to the model, which is deleted after each fold.
fold_histories = []
fold = 0
for train_index, val_index in kf.split(X_train_padded, y_train):
    # Optional callbacks, created fresh for each fold but currently left out of
    # `callbacks=` below. The epoch is zero-padded ({epoch:02d}) like the names
    # written by RocAucEvaluation; {epoch:2d} pads with a space and would put
    # spaces into the filenames. Note the template has no fold component.
    checkpoint = ModelCheckpoint("weights.{epoch:02d}-{val_loss:.4f}.hdf5",
                                 verbose=1,
                                 # save_best_only=True,
                                 save_weights_only=True)
    lr_reduction = ReduceLROnPlateau(patience=1, verbose=1)

    X_train_split, X_val_split = X_train_padded[train_index], X_train_padded[val_index]
    y_train_split, y_val_split = y_train[train_index], y_train[val_index]

    # A new model per fold, so no weights carry over between folds.
    model = build_model(max_sequence_length, word_count, embed_size, embeddings)
    auc = RocAucEvaluation(validation_data=(X_val_split, y_val_split), cross_validation_fold=fold)
    history = model.fit(X_train_split, y_train_split,
                        batch_size=batch_size,
                        epochs=epochs,
                        # validation_split=0.0,
                        validation_data=(X_val_split, y_val_split),
                        callbacks=[auc,
                                   # checkpoint,
                                   # lr_reduction
                                   ],
                        verbose=1)
    fold_histories.append(history.history)
    # Drop the fold's model and reset the backend session before the next fold.
    del model
    K.clear_session()
    fold += 1

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 143613 samples, validate on 15958 samples
Epoch 1/16
saving weights.00-01-0.9748.hdf5
val_auc: 0.9748
Epoch 2/16
saving weights.00-02-0.9844.hdf5
val_auc: 0.9844
Epoch 3/16
saving weights.00-03-0.9875.hdf5
val_auc: 0.9875
Epoch 4/16
saving weights.00-04-0.9891.hdf5
val_auc: 0.9891
Epoch 5/16
saving weights.00-05-0.9898.hdf5
val_auc: 0.9898
Epoch 6/16
saving weights.00-06-0.9901.hdf5
val_auc: 0.9901
Epoch 7/16
saving weights.00-07-0.9903.hdf5
val_auc: 0.9903
Epoch 8/16
val_auc: 0.9901
Epoch 9/16
saving weights.00-09-0.9903.hdf5
val_auc: 0.9903
Epoch 10/16
val_auc: 0.9903
Epoch 11/16
val_auc: 0.9902
Epoch 12/16
val_auc: 0.9902
Epoch 13/16
saving weights.00-13-0.9904.hdf5
val_auc: 0.9904
Epoch 14/16
val_auc: 0.9903
Epoch 15/16
val_auc: 0.9902
Epoch 16/16
val_auc: 0.9900
Train on 143614 samples, validate on 15957 samples
Epoch 1/16
saving weights.01-01-0.9788.hdf5
val_auc: 0.9788
Epoch 2/16
saving w

saving weights.02-01-0.9795.hdf5
val_auc: 0.9795
Epoch 2/16
saving weights.02-02-0.9858.hdf5
val_auc: 0.9858
Epoch 3/16
saving weights.02-03-0.9898.hdf5
val_auc: 0.9898
Epoch 4/16
saving weights.02-04-0.9907.hdf5
val_auc: 0.9907
Epoch 5/16
saving weights.02-05-0.9911.hdf5
val_auc: 0.9911
Epoch 6/16
val_auc: 0.9909
Epoch 7/16
saving weights.02-07-0.9913.hdf5
val_auc: 0.9913
Epoch 8/16
saving weights.02-08-0.9915.hdf5
val_auc: 0.9915
Epoch 9/16
val_auc: 0.9914
Epoch 10/16
saving weights.02-10-0.9915.hdf5
val_auc: 0.9915
Epoch 11/16
val_auc: 0.9915
Epoch 12/16
saving weights.02-12-0.9915.hdf5
val_auc: 0.9915
Epoch 13/16
saving weights.02-13-0.9916.hdf5
val_auc: 0.9916
Epoch 14/16
val_auc: 0.9915
Epoch 15/16
saving weights.02-15-0.9917.hdf5
val_auc: 0.9917
Epoch 16/16
val_auc: 0.9914
Train on 143614 samples, validate on 15957 samples
Epoch 1/16
saving weights.03-01-0.9665.hdf5
val_auc: 0.9665
Epoch 2/16
saving weights.03-02-0.9825.hdf5
val_auc: 0.9825
Epoch 3/16
saving weights.03-03-0.9857

saving weights.04-03-0.9846.hdf5
val_auc: 0.9846
Epoch 4/16
saving weights.04-04-0.9851.hdf5
val_auc: 0.9851
Epoch 5/16
saving weights.04-05-0.9858.hdf5
val_auc: 0.9858
Epoch 6/16
saving weights.04-06-0.9859.hdf5
val_auc: 0.9859
Epoch 7/16
saving weights.04-07-0.9864.hdf5
val_auc: 0.9864
Epoch 8/16
val_auc: 0.9859
Epoch 9/16
saving weights.04-09-0.9865.hdf5
val_auc: 0.9865
Epoch 10/16
val_auc: 0.9858
Epoch 11/16
val_auc: 0.9861
Epoch 12/16
val_auc: 0.9863
Epoch 13/16
saving weights.04-13-0.9870.hdf5
val_auc: 0.9870
Epoch 14/16
val_auc: 0.9863
Epoch 15/16
val_auc: 0.9863
Epoch 16/16
val_auc: 0.9860
Train on 143614 samples, validate on 15957 samples
Epoch 1/16
saving weights.05-01-0.9761.hdf5
val_auc: 0.9761
Epoch 2/16
saving weights.05-02-0.9843.hdf5
val_auc: 0.9843
Epoch 3/16
saving weights.05-03-0.9866.hdf5
val_auc: 0.9866
Epoch 4/16
saving weights.05-04-0.9874.hdf5
val_auc: 0.9874
Epoch 5/16
saving weights.05-05-0.9881.hdf5
val_auc: 0.9881
Epoch 6/16
saving weights.05-06-0.9890.hdf5


KeyboardInterrupt: 

Train on 151592 samples, validate on 7979 samples
Epoch 1/16
saving weights.01-0.9798.hdf5
val_auc: 0.9798
Epoch 2/16
saving weights.02-0.9856.hdf5
val_auc: 0.9856
Epoch 3/16
saving weights.03-0.9887.hdf5
val_auc: 0.9887
Epoch 4/16
saving weights.04-0.9898.hdf5
val_auc: 0.9898
Epoch 5/16
saving weights.05-0.9904.hdf5
val_auc: 0.9904
Epoch 6/16
val_auc: 0.9901
Epoch 7/16
val_auc: 0.9903
Epoch 8/16
saving weights.08-0.9904.hdf5
val_auc: 0.9904
Epoch 9/16
val_auc: 0.9903
Epoch 10/16
val_auc: 0.9903
Epoch 11/16
val_auc: 0.9904
Epoch 12/16
saving weights.12-0.9905.hdf5
val_auc: 0.9905
Epoch 13/16
saving weights.13-0.9905.hdf5
val_auc: 0.9905
Epoch 14/16
val_auc: 0.9904
Epoch 15/16
val_auc: 0.9902
Epoch 16/16
val_auc: 0.9899


In [18]:
# The cross-validation loop deletes `model` at the end of every fold, so on a
# clean top-to-bottom run no model exists here: rebuild the architecture
# before loading the chosen checkpoint into it.
model = build_model(max_sequence_length, word_count, embed_size, embeddings)

# model.load_weights("weights.15-0.9905.hdf5")  # capsule network baseline
# model.load_weights("weights.10-0.9904.hdf5")  # rnn size increased from 128 to max sequence length
model.load_weights("weights.08-0.9906.hdf5")  # added numberbatch with restored rnn size to 128
# model.load_weights("weights.13-0.9905.hdf5")  # added glove embeddings

In [19]:
%%time 

# Predict the six label scores for the test comments, put them into the
# submission template and write it out.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_pred = model.predict(X_test_padded, batch_size=1024)
submission[label_columns] = y_pred
submission.to_csv('submission.csv', index=False)

CPU times: user 54.3 s, sys: 6.08 s, total: 1min
Wall time: 59.2 s


In [20]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "added numberbatch embeddings branch"

Successfully submitted to Toxic Comment Classification Challenge