In [1]:
import os

# Expose only GPU 0 to CUDA for this process. This runs before Keras/TensorFlow is first
# imported in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import pandas as pd

train = pd.read_csv('train.csv')

# Uncomment to prototype models on a 10% sample of the training set.
# NOTE(review): this comment used to say "157975 original total", but the fit log further down
# reports 151592 train + 7979 validation = 159571 rows — confirm the real row count.
# train = train.sample(frac=0.1)
# Fraction of `train` held out for validation by train_test_split.
validation_fraction = 0.05  # 0.1  # change to 1% for training on complete training set

In [3]:
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Series.fillna returns a NEW Series rather than modifying the frame in place, so its result has
# to be used. The previous bare `train["comment_text"].fillna("fillna")` calls were no-ops and
# any missing comment stayed NaN. The placeholder text "fillna" is kept unchanged.
X_train = train["comment_text"].fillna("fillna").str.lower()
# One binary column per label; a comment may carry several labels at once.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test["comment_text"].fillna("fillna").str.lower()

In [4]:
%%time

from keras.preprocessing import text, sequence

# Vocabulary cap passed to the tokenizer as num_words; None means no cap.
max_features = None  # 30000
embed_size = 300  # should match embedding file

tokenizer = text.Tokenizer(num_words=max_features)
# The vocabulary is fitted on the train AND test comments together.
tokenizer.fit_on_texts(list(X_train) + list(X_test))
# Convert each comment into a list of integer word indices.
X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


CPU times: user 23.9 s, sys: 191 ms, total: 24.1 s
Wall time: 24.1 s


In [5]:
# Fixed model input length, in tokens; the trailing values are earlier settings that were tried.
max_sequence_length = 300  # 150  # 128  # 100, 200, 256 worsened validation AUC score  # 100

# Bring every tokenized comment to exactly max_sequence_length entries (pad_sequences defaults
# decide which side is padded/truncated).
X_train_padded = sequence.pad_sequences(X_train_tokenized, maxlen=max_sequence_length)
X_test_padded = sequence.pad_sequences(X_test_tokenized, maxlen=max_sequence_length)

In [6]:
import numpy as np

# Mapping word -> integer index produced by the fitted tokenizer.
word_index = tokenizer.word_index  # len(word_index) == 394787
# Number of words that get an embedding row: the whole vocabulary, or max_features if a cap is set.
word_count = min(max_features, len(word_index)) if max_features else len(word_index)
# In case you don't want to use pre-trained embeddings (the embedding matrices below are
# allocated with word_count + 1 rows, so a random one needs the same shape):
# embedding_matrix = np.random.uniform(-1.0, 1.0, (word_count + 1, embed_size))

In [7]:
%%time

import numpy as np
import bcolz


def process_fasttext_line(word, *arr):
    """Turn the fields of one fastText ``.vec`` line into ``(word, vector)``.

    Args:
        word: first whitespace-separated field of the line.
        *arr: remaining fields, the vector components as strings.

    Returns:
        Tuple of the word and a float32 numpy array built from ``arr``.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

def load_embeddings(matrix, embeddings_index, word_index):
    """Copy pre-trained vectors into ``matrix`` at the rows given by ``word_index``.

    Args:
        matrix: 2-D array-like with one row per word index; modified in place.
        embeddings_index: mapping word -> embedding vector.
        word_index: mapping word -> integer row index (the tokenizer's vocabulary).

    Returns:
        The same ``matrix`` object. Rows for words without a usable vector are left untouched.

    Reads the module-level ``max_features``: when it is set, indices at or above it are skipped.
    """
    row_shape = tuple(matrix.shape[1:])
    for word, i in word_index.items():
        if max_features and i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        # Skip vectors whose shape differs from a matrix row. Without this check a 1-element
        # vector (e.g. a "<count> <dim>" header line parsed as a word) would be silently
        # broadcast across the whole row, and other wrong lengths would raise mid-load.
        if np.shape(embedding_vector) != row_shape:
            continue
        matrix[i] = embedding_vector
    return matrix

def build_embedding_matrix(matrix, bcolz_rootdir, embeddings_filename, line_processing_function, word_index):
    """Return the embedding matrix for ``word_index``, using an on-disk bcolz cache.

    If a bcolz array already exists at ``bcolz_rootdir`` it is opened and returned as is.
    Otherwise the text embeddings file is parsed, ``matrix`` is filled via ``load_embeddings``
    and the result is written to ``bcolz_rootdir`` for the next run.

    Args:
        matrix: pre-allocated array to fill when no cache exists.
        bcolz_rootdir: directory of the bcolz cache.
        embeddings_filename: whitespace-separated text embeddings file.
        line_processing_function: called with the split fields of each line; returns
            ``(word, vector)``, or ``None`` for a line it could not parse.
        word_index: mapping word -> row index.

    Returns:
        A bcolz carray holding the embedding matrix.

    NOTE(review): the cache is reused without checking that it was built for the current
    ``word_index`` — delete the cache directory after changing the tokenizer or vocabulary.
    """
    try:
        matrix = bcolz.open(rootdir=bcolz_rootdir)
    except FileNotFoundError:
        embeddings_index = {}
        # Explicit UTF-8 instead of the locale default, and a context manager so the handle
        # is closed even if parsing fails.
        with open(embeddings_filename, encoding="utf-8") as embeddings_file:
            for line in embeddings_file:
                fields = line.rstrip().split()
                if not fields:
                    continue  # blank line: nothing to parse
                parsed = line_processing_function(*fields)
                if parsed is None:
                    # The parser rejected the line (process_glove_line prints it and
                    # returns None); skip it instead of crashing the whole load.
                    continue
                word, vector = parsed
                embeddings_index[word] = vector
        matrix = load_embeddings(matrix, embeddings_index, word_index)
        matrix = bcolz.carray(matrix, rootdir=bcolz_rootdir)
        matrix.flush()
    return matrix


# One row per word index plus one extra row (row 0 is never written by load_embeddings when
# indices start at 1 — presumably the padding index; confirm against the tokenizer).
fasttext_embedding_matrix = np.zeros((word_count + 1, embed_size))
# Loads the "crawl-300d-2M.bcolz" cache if it exists, otherwise parses "crawl-300d-2M.vec".
fasttext_embedding_matrix = build_embedding_matrix(fasttext_embedding_matrix, "crawl-300d-2M.bcolz", "crawl-300d-2M.vec", process_fasttext_line, word_index)


def process_numberbatch_line(key, *arr):
    """Turn the fields of one Numberbatch line into ``(word, vector)``.

    The word is whatever follows the last ``/`` of ``key`` (the whole key when it has no
    slash), with underscores replaced by spaces.

    Args:
        key: first field of the line.
        *arr: remaining fields, the vector components as strings.

    Returns:
        Tuple of the cleaned word and a float32 numpy array built from ``arr``.
    """
    last_segment = key.rsplit('/', 1)[-1]
    vector = np.array(arr, dtype=np.float32)
    return last_segment.replace('_', ' '), vector


# Same shape as the fastText matrix so both can be indexed by the same token ids.
numberbatch_embedding_matrix = np.zeros((word_count + 1, embed_size))
# Loads the "numberbatch-17.06.bcolz" cache if it exists, otherwise parses "numberbatch-17.06.txt".
numberbatch_embedding_matrix = build_embedding_matrix(numberbatch_embedding_matrix, "numberbatch-17.06.bcolz", "numberbatch-17.06.txt", process_numberbatch_line, word_index)

def process_glove_line(*arr):
    """Turn the whitespace-split fields of one GloVe line into ``(word, vector)``.

    Most lines are ``word v1 v2 ...``. Two kinds of entries contain spaces in the word itself
    and are handled explicitly:

    * runs of 2 to 5 dots (``". ."`` up to ``". . . . ."``), recognised when the second field
      is a dot;
    * a first field followed by one of a few known ``.com`` tokens, joined with a space.

    Returns:
        ``(word, float32 vector)``. If the remaining fields cannot be converted to floats the
        fields are printed and ``None`` is returned.
    """
    two_token_suffixes = ("name@domain.com", "Killerseats.com", "mylot.com", "Amazon.com")

    if arr[1] == '.':
        # Probe for the longest dot run first, exactly in this order.
        if arr[4] == '.':
            word_fields = 5
        elif arr[3] == '.':
            word_fields = 4
        elif arr[2] == '.':
            word_fields = 3
        else:
            word_fields = 2
        word = " ".join("." * word_fields)
    elif arr[1] in two_token_suffixes:
        word_fields = 2
        word = " ".join(arr[:2])
    else:
        word_fields = 1
        word = arr[0]

    try:
        vector = np.asarray(arr[word_fields:], dtype='float32')
    except ValueError:
        print(arr)
        return None
    return word, vector

# NOTE: this matrix is built here, but the "glove" entry of the `embeddings` dict and the glove
# branch of build_model are both commented out further down, so the model does not use it.
glove_embedding_matrix = np.zeros((word_count + 1, embed_size))
# Loads the "glove.840B.300d.bcolz" cache if it exists, otherwise parses "glove.840B.300d.txt".
glove_embedding_matrix = build_embedding_matrix(glove_embedding_matrix, "glove.840B.300d.bcolz", "glove.840B.300d.txt", process_glove_line, word_index)

CPU times: user 358 ms, sys: 20.3 ms, total: 378 ms
Wall time: 402 ms


In [17]:
from keras.models import Model, Sequential
from keras.engine import Layer
from keras.layers import Activation, BatchNormalization, Bidirectional, concatenate, Conv1D, CuDNNGRU, Dense, Dropout, Embedding, Flatten, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, K, SpatialDropout1D
from keras.optimizers import Adam, Nadam
# from keras.regularizers import l2
# from keras.constraints import maxnorm

# from qrnn import QRNN
# from attention import AttentionWithContext


# https://www.kaggle.com/chongjiujjin/capsule-net-with-gru

def squash(x, axis=-1):
    """Divide ``x`` by its Euclidean norm along ``axis``.

    The norm is computed as ``sqrt(sum(x ** 2) + epsilon)`` with ``keepdims=True``, so the
    result has the same shape as ``x`` and the division never hits an exact zero.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())

class Capsule(Layer):
    """Keras layer that maps a sequence of input vectors to ``num_capsule`` output vectors of
    size ``dim_capsule`` using an iterative routing loop.

    Adapted from the Kaggle kernel linked above this cell.
    """

    def __init__(self, num_capsule=10, dim_capsule=16, routings=5, kernel_size=(9, 1), share_weights=True, activation='default', **kwargs):
        """Store the layer configuration.

        Args:
            num_capsule: number of output capsules.
            dim_capsule: size of each output capsule vector.
            routings: number of routing iterations run in ``call``.
            kernel_size: stored, but not used anywhere else in this class (its only use in
                ``build`` is commented out).
            share_weights: if True one kernel is shared by all input positions, otherwise
                each input position gets its own kernel (see ``build``).
            activation: ``'default'`` selects the module-level ``squash`` function; any other
                value is wrapped in a Keras ``Activation`` layer.
            **kwargs: forwarded to ``Layer.__init__``.
        """
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the trainable kernel ``self.W`` from the last two input dimensions."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # Single kernel of length 1, applied to every input position via K.conv1d.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # One kernel per input position, applied via K.local_conv1d.
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the inputs with ``self.W`` and run ``self.routings`` routing iterations.

        Returns the activated output of the last iteration; per ``compute_output_shape`` its
        shape is ``(batch, num_capsule, dim_capsule)``.

        NOTE(review): ``outputs`` is only assigned inside the loop, so ``routings`` must be
        at least 1.
        """
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        # Split the projected features into one dim_capsule-sized vector per output capsule.
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Move num_capsule to the last axis for the softmax, then restore the layout of
            # both c and b.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # Combine the projected vectors using the coefficients c, then apply the activation.
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # New logits from the current outputs and the projected vectors; skipped on
                # the last iteration because b is not read again.
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return ``(None, num_capsule, dim_capsule)``, independent of ``input_shape``."""
        return (None, self.num_capsule, self.dim_capsule)

def build_model(max_sequence_length, word_count, embed_size, embeddings, spatial_dropout=0.28, dropout=0.25):
    """Build and compile the two-branch (fastText + Numberbatch) GRU/capsule classifier.

    The same token-id input feeds two frozen embedding tables. Each branch applies spatial
    dropout, a bidirectional GRU and a Capsule layer; the branches are then concatenated,
    flattened, passed through dropout and a 6-unit sigmoid output layer.

    Args:
        max_sequence_length: length of every padded input sequence.
        word_count: vocabulary size; the embedding tables have ``word_count + 1`` rows.
        embed_size: width of the embedding vectors.
        embeddings: mapping with the keys ``"fasttext"`` and ``"numberbatch"``, each a matrix
            with at least ``word_count + 1`` rows and ``embed_size`` columns.
        spatial_dropout: rate for the SpatialDropout1D layer after each embedding.
        dropout: rate for the GRU input/recurrent dropout and for the final Dropout layer.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, "adam" optimizer).

    Experiment notes kept from earlier iterations: a second bidirectional layer did not help on
    a training subsample; a third (GloVe) branch, QRNN and AttentionWithContext were also tried.
    """
    rnn_size = 128  # max_sequence_length  # 140
    multiclass_label_count = 6
    # Number of rows of each Embedding table. The weight matrices are cut to exactly this
    # many rows; the previous slice `[:word_count + 2]` was one row too long and only worked
    # because the matrices happened to have exactly word_count + 1 rows.
    vocabulary_rows = word_count + 1

    token_ids = Input(shape=(max_sequence_length, ))

    # Layers are created in the same interleaved order as before, so automatically generated
    # layer names stay the same.
    fasttext = Embedding(vocabulary_rows, embed_size, weights=[embeddings["fasttext"][:vocabulary_rows]], trainable=False)(token_ids)
    numberbatch = Embedding(vocabulary_rows, embed_size, weights=[embeddings["numberbatch"][:vocabulary_rows]], trainable=False)(token_ids)

    fasttext = SpatialDropout1D(spatial_dropout)(fasttext)
    numberbatch = SpatialDropout1D(spatial_dropout)(numberbatch)

    fasttext = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(fasttext)
    numberbatch = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(numberbatch)

    fasttext = Capsule()(fasttext)
    numberbatch = Capsule()(numberbatch)

    x = concatenate([fasttext, numberbatch])
    x = Flatten()(x)
    d = Dropout(dropout)(x)
    # Sigmoid (not softmax): each label is an independent binary decision.
    out = Dense(multiclass_label_count, activation="sigmoid")(d)

    model = Model(inputs=token_ids, outputs=out)
    optimizer = "adam"  # Nadam(lr=1e-3)  # 'nadam'  # Nadam(lr=1e-5)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# del model
# Embedding matrices handed to build_model, keyed by the names it looks up. The glove matrix
# is currently left out.
embeddings = { "fasttext" : fasttext_embedding_matrix, 
              "numberbatch" : numberbatch_embedding_matrix,
              # "glove" : glove_embedding_matrix
             }
# Re-running this cell creates a fresh, untrained model.
model = build_model(max_sequence_length, word_count, embed_size, embeddings)
# del models
# models = [fasttext_model, numberbatch_model]

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# Fixed seed so that the train/validation split — and therefore the val_auc values used to
# compare experiments and to name the saved weight files — is identical on every run.
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_padded, y_train, test_size=validation_fraction, random_state=42)


class RocAucEvaluation(Callback):
    """Compute ROC AUC on a held-out set after every epoch and save weights on improvement.

    Based on https://www.kaggle.com/demesgal/lstm-glove-lr-decrease-bn-cv-lb-0-047/comments

    Attributes set by this class:
        best: highest validation AUC seen so far (starts at 0.0).
        stopped_epoch: 1-based epoch at which ``best`` was reached (``max_epoch`` initially).
        y_pred: validation predictions from the best epoch (zeros until the first epoch ends).
    """

    def __init__(self, validation_data=(), max_epoch=20):
        """
        Args:
            validation_data: ``(X_val, y_val)`` pair. The empty default cannot be unpacked,
                so a pair has to be supplied.
            max_epoch: initial value of ``stopped_epoch``.
        """
        # Initialise the Keras Callback base class. The previous `super(Callback, self)`
        # skipped Callback.__init__ and went straight to object.__init__.
        super(RocAucEvaluation, self).__init__()

        self.stopped_epoch = max_epoch
        self.best = 0.0
        self.X_val, self.y_val = validation_data
        self.y_pred = np.zeros(self.y_val.shape)

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation set, record ``val_auc`` in ``logs`` and save the
        weights whenever the score improves on the best seen so far."""
        # A fresh dict per call instead of a shared mutable default argument, which would
        # accumulate 'val_auc' across calls and instances.
        if logs is None:
            logs = {}

        y_pred = self.model.predict(self.X_val, verbose=1)
        # Important lines: expose the score to Keras (history and other callbacks) as 'val_auc'.
        current = roc_auc_score(self.y_val, y_pred)
        logs['val_auc'] = current

        if current > self.best:  # save model
            self.best = current
            self.y_pred = y_pred
            self.stopped_epoch = epoch + 1
            # `epoch` is 0-based, file names use the 1-based epoch shown in the training log.
            filename = "weights.{epoch:02d}-{val_auc:.4f}.hdf5".format(epoch=(epoch + 1), val_auc=current)
            print("saving " + filename)
            self.model.save_weights(filename, overwrite=True)

        print("val_auc: {:.4f}".format(current))


# Per-epoch validation AUC, with a weight checkpoint whenever it improves.
auc = RocAucEvaluation(validation_data=(X_val, y_val))
# `{epoch:02d}` zero-pads the epoch like the file names written by RocAucEvaluation; the
# previous `{epoch:2d}` padded with a space and produced names such as "weights. 1-0.0450.hdf5".
checkpoint = ModelCheckpoint("weights.{epoch:02d}-{val_loss:.4f}.hdf5", 
                             verbose=1, 
                             # save_best_only=True, 
                             save_weights_only=True)
# Lower the learning rate after one epoch without improvement of the monitored quantity.
lr_reduction = ReduceLROnPlateau(patience=1, verbose=1)

In [16]:
batch_size = 256  # 32  # 128  # 1024 lowered AUC score even when tried continued training with bigger batch size after small batch size, as well as starting with big batch size and then continuing with smaller size
epochs = 16
# Fresh callback instance so its best-score tracking starts again from 0.0 for this run.
auc = RocAucEvaluation(validation_data=(X_val, y_val))
# Only the AUC callback is active; `checkpoint` and `lr_reduction` are defined but disabled.
history = model.fit(X_train_split, y_train_split, 
                    batch_size=batch_size, 
                    epochs=epochs, 
                    # validation_split=0.0,
                    validation_data=(X_val, y_val),
                    callbacks=[auc, 
                               # checkpoint, 
                               # lr_reduction
                              ], 
                    verbose=1)

Train on 151592 samples, validate on 7979 samples
Epoch 1/16
saving weights.01-0.9798.hdf5
val_auc: 0.9798
Epoch 2/16
saving weights.02-0.9856.hdf5
val_auc: 0.9856
Epoch 3/16
saving weights.03-0.9887.hdf5
val_auc: 0.9887
Epoch 4/16
saving weights.04-0.9898.hdf5
val_auc: 0.9898
Epoch 5/16
saving weights.05-0.9904.hdf5
val_auc: 0.9904
Epoch 6/16
val_auc: 0.9901
Epoch 7/16
val_auc: 0.9903
Epoch 8/16
saving weights.08-0.9904.hdf5
val_auc: 0.9904
Epoch 9/16
val_auc: 0.9903
Epoch 10/16
val_auc: 0.9903
Epoch 11/16
val_auc: 0.9904
Epoch 12/16
saving weights.12-0.9905.hdf5
val_auc: 0.9905
Epoch 13/16
saving weights.13-0.9905.hdf5
val_auc: 0.9905
Epoch 14/16
val_auc: 0.9904
Epoch 15/16
val_auc: 0.9902
Epoch 16/16
val_auc: 0.9899


In [18]:
# Restore the best checkpoint written by RocAucEvaluation (file name = epoch and val_auc).
# NOTE(review): the training log shown above reports "saving weights.08-0.9904.hdf5", not
# weights.08-0.9906.hdf5, and the cell numbers are out of order (In [16] was run before
# In [17] rebuilt the model). This file presumably comes from an earlier run — confirm it
# exists, or take the name from the latest training log, before relying on Restart & Run All.
# model.load_weights("weights.15-0.9905.hdf5")  # capsule network baseline
# model.load_weights("weights.10-0.9904.hdf5")  # rnn size increased from 128 to max sequence length
model.load_weights("weights.08-0.9906.hdf5")  # added numberbatch with restored rnn size to 128
# model.load_weights("weights.13-0.9905.hdf5")  # added glove embeddings

In [19]:
%%time 

# Predict the six label probabilities for every test comment.
y_pred = model.predict(X_test_padded, batch_size=1024)
# Column order must match the order used to build y_train.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission.csv', index=False)

CPU times: user 54.3 s, sys: 6.08 s, total: 1min
Wall time: 59.2 s


In [20]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "added numberbatch embeddings branch"

Successfully submitted to Toxic Comment Classification Challenge