In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to '1' for this process.
# NOTE(review): presumably this has to run before the deep-learning backend is
# imported further down, so that only that GPU is visible to it — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import pandas as pd

# Load the training set (comment text plus the label columns read later on).
train = pd.read_csv('train.csv')

# train = train.sample(frac=0.1)  # 157975 original total, so let's prototype models with a fraction of that
# Share of the training data to hold out for validation.
# NOTE(review): within the visible notebook this value is only referenced by a
# commented-out train_test_split call.
validation_fraction = 0.1  # change to 1% for training on complete training set

In [3]:
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Series.fillna returns a NEW Series; calling it without using the result is a
# no-op. Chain it so missing comments really become the "fillna" placeholder
# before lower-casing (the source frames themselves are left untouched).
X_train = train["comment_text"].fillna("fillna").str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test["comment_text"].fillna("fillna").str.lower()

In [4]:
%%time

from keras.preprocessing import text, sequence

# Vocabulary cap handed to the Tokenizer; None means no cap (30000 is the noted alternative).
max_features = None
# Width of the word vectors; should match embedding file.
embed_size = 300

# The tokenizer is fitted on the train and test comments together.
all_comments = [*X_train, *X_test]
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(all_comments)

# Convert both comment sets with the fitted tokenizer (train first, then test).
X_train_tokenized, X_test_tokenized = (
    tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test)
)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


CPU times: user 27.4 s, sys: 286 ms, total: 27.7 s
Wall time: 27.7 s


In [5]:
# Fixed length every tokenized comment is padded/truncated to.
# Earlier notes: 150, 128, 100 were tried; 100, 200, 256 worsened validation AUC score.
max_sequence_length = 300

X_train_padded, X_test_padded = (
    sequence.pad_sequences(token_ids, maxlen=max_sequence_length)
    for token_ids in (X_train_tokenized, X_test_tokenized)
)

In [6]:
import numpy as np

# Token -> integer index mapping learned by the tokenizer (len(word_index) == 394787).
word_index = tokenizer.word_index

# Number of words that get an embedding row: capped by max_features when a cap is set.
if max_features:
    word_count = min(max_features, len(word_index))
else:
    word_count = len(word_index)
# embedding_matrix = np.random.uniform(-1.0, 1.0, (nb_words, embed_size))  # in case you don't want to use pre-trained embeddings

In [7]:
%%time

import numpy as np
import bcolz


def process_fasttext_line(word, *arr):
    """Turn the split fields of one fastText line into a ``(word, vector)`` pair.

    Args:
        word: first field of the line, used unchanged as the key.
        *arr: remaining fields, converted to a float32 numpy array.

    Returns:
        Tuple ``(word, float32 ndarray)``.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

def load_embeddings(matrix, embeddings_index, word_index, vocabulary_limit=None):
    """Copy pre-trained vectors into ``matrix`` at the rows given by ``word_index``.

    Args:
        matrix: 2-D array, modified in place; row ``i`` receives the vector of
            the word whose index is ``i``.
        embeddings_index: mapping ``word -> vector``.
        word_index: mapping ``word -> row index``.
        vocabulary_limit: optional cap; words whose index is ``>=`` the cap are
            skipped. When omitted, the notebook-level ``max_features`` setting
            is used if it exists (a falsy value means "no cap"), which is what
            this function always did.

    Returns:
        The same ``matrix`` object. Rows of words without a usable vector are
        left untouched.
    """
    if vocabulary_limit is None:
        # Backward compatible: fall back to the global the original code read.
        vocabulary_limit = globals().get("max_features")
    expected_shape = matrix.shape[1:]
    for word, i in word_index.items():
        if vocabulary_limit and i >= vocabulary_limit:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if np.shape(embedding_vector) != expected_shape:
            # A vector of the wrong length (e.g. the one-element vector parsed
            # from a "<count> <dim>" header line) would otherwise be broadcast
            # by numpy across the whole row and silently corrupt it.
            continue
        matrix[i] = embedding_vector
    return matrix

def build_embedding_matrix(matrix, bcolz_rootdir, embeddings_filename, line_processing_function, word_index):
    """Return the embedding matrix for ``word_index``, using an on-disk bcolz cache.

    If ``bcolz_rootdir`` can be opened, the cached array is returned and nothing
    else is read. Otherwise the text embeddings file is parsed, ``matrix`` is
    filled via ``load_embeddings`` and the result is persisted to
    ``bcolz_rootdir`` for the next run.

    Args:
        matrix: pre-allocated array to fill when no cache exists.
        bcolz_rootdir: directory of the bcolz cache.
        embeddings_filename: path of the text embeddings file, one entry per line.
        line_processing_function: called with the whitespace-split fields of a
            line; returns ``(word, vector)``, or ``None`` for a line to skip.
        word_index: mapping ``word -> row index``.

    Returns:
        The bcolz array (opened from the cache or freshly written).

    NOTE(review): the cache is reused without checking that it was built for the
    current ``word_index`` — delete the directory after changing the vocabulary.
    """
    try:
        return bcolz.open(rootdir=bcolz_rootdir)
    except FileNotFoundError:
        pass  # no cache yet: build it below

    # Context manager closes the (large) file deterministically; the explicit
    # encoding avoids depending on the platform's locale default.
    with open(embeddings_filename, encoding='utf-8') as embeddings_file:
        parsed_lines = (
            line_processing_function(*line.rstrip().split())
            for line in embeddings_file
        )
        # A processor may return None for a line it could not parse; dict()
        # would fail on that with an opaque TypeError, so drop such entries.
        embeddings_index = dict(entry for entry in parsed_lines if entry is not None)

    matrix = load_embeddings(matrix, embeddings_index, word_index)
    matrix = bcolz.carray(matrix, rootdir=bcolz_rootdir)
    matrix.flush()
    return matrix


# fastText vectors: one row per word index plus one extra row (word_count + 1 rows).
fasttext_embedding_matrix = build_embedding_matrix(
    matrix=np.zeros((word_count + 1, embed_size)),
    bcolz_rootdir="crawl-300d-2M.bcolz",
    embeddings_filename="crawl-300d-2M.vec",
    line_processing_function=process_fasttext_line,
    word_index=word_index,
)


def process_numberbatch_line(key, *arr):
    """Turn the split fields of one Numberbatch line into a ``(word, vector)`` pair.

    The word is the part of ``key`` after its last ``/``, with underscores
    replaced by spaces; the remaining fields become a float32 numpy array.
    """
    last_segment = key.rsplit('/', 1)[-1]
    word = ' '.join(last_segment.split('_'))
    return word, np.array(arr, dtype=np.float32)


# Numberbatch vectors, laid out like the fastText matrix (word_count + 1 rows).
numberbatch_embedding_matrix = build_embedding_matrix(
    matrix=np.zeros((word_count + 1, embed_size)),
    bcolz_rootdir="numberbatch-17.06.bcolz",
    embeddings_filename="numberbatch-17.06.txt",
    line_processing_function=process_numberbatch_line,
    word_index=word_index,
)

def process_glove_line(*arr):
    """Turn the split fields of one GloVe line into a ``(word, vector)`` pair.

    Normally the first field is the word and the rest are the coefficients.
    Two kinds of entries contain spaces and are re-assembled here:

    * runs of dots: when the second field is ``'.'`` the word becomes two to
      five space-separated dots, depending on the right-most of fields 2..4
      that is also ``'.'``;
    * ``"<something> <domain>"`` pairs, when the second field is one of a few
      known domain-like tokens.

    Returns:
        ``(word, float32 ndarray)``; if the coefficients cannot be converted
        the fields are printed and ``None`` is returned implicitly.
    """
    domain_tokens = ("name@domain.com", "Killerseats.com", "mylot.com", "Amazon.com")
    word, coefficients = arr[0], arr[1:]
    second = arr[1]
    if second == '.':
        # Probe from the longest run downwards; index 1 always matches here.
        for last_dot in (4, 3, 2, 1):
            if arr[last_dot] == '.':
                word = ' '.join('.' * (last_dot + 1))
                coefficients = arr[last_dot + 1:]
                break
    elif second in domain_tokens:
        word = ' '.join(arr[:2])
        coefficients = arr[2:]
    try:
        return word, np.asarray(coefficients, dtype='float32')
    except ValueError:
        print(arr)

# GloVe vectors, laid out like the other embedding matrices (word_count + 1 rows).
glove_embedding_matrix = build_embedding_matrix(
    matrix=np.zeros((word_count + 1, embed_size)),
    bcolz_rootdir="glove.840B.300d.bcolz",
    embeddings_filename="glove.840B.300d.txt",
    line_processing_function=process_glove_line,
    word_index=word_index,
)

CPU times: user 378 ms, sys: 12.3 ms, total: 390 ms
Wall time: 408 ms


In [8]:
from keras.models import Model, Sequential
from keras.layers import Activation, BatchNormalization, Bidirectional, concatenate, Conv1D, CuDNNGRU, Dense, Dropout, Embedding, Flatten, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, K, SpatialDropout1D
from keras.optimizers import Adam, Nadam
# from keras.regularizers import l2
# from keras.constraints import maxnorm

# from qrnn import QRNN
# from attention import AttentionWithContext
from capsnet import Capsule

def build_model(max_sequence_length, word_count, embed_size, embeddings, spatial_dropout=0.28, dropout=0.25):
    """Build and compile the two-branch GRU + Capsule classifier.

    One integer-sequence input feeds two parallel branches, one per pre-trained
    embedding (``"fasttext"`` and ``"numberbatch"``). Each branch is a frozen
    Embedding -> SpatialDropout1D -> Bidirectional GRU -> Capsule. The branch
    outputs are concatenated, flattened, passed through Dropout and a 6-unit
    sigmoid Dense layer.

    Args:
        max_sequence_length: length of every input sequence.
        word_count: vocabulary size; the Embedding layers have
            ``word_count + 1`` rows.
        embed_size: width of the embedding vectors.
        embeddings: mapping holding the ``"fasttext"`` and ``"numberbatch"``
            weight matrices; the first ``word_count + 1`` rows of each are used.
        spatial_dropout: rate of the SpatialDropout1D layers.
        dropout: rate used for the GRU input/recurrent dropout and the final
            Dropout layer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, "adam"
        optimizer, accuracy metric).
    """
    # The weight slice must have exactly as many rows as the Embedding layer.
    # The previous ``[:word_count + 2]`` bound only worked because slicing
    # clips to matrices that happen to have word_count + 1 rows; a larger
    # (e.g. stale cached) matrix would yield a shape mismatch.
    embedding_rows = word_count + 1

    # NOTE: layers are created in the same order as before so that layer
    # naming/ordering — and thus previously saved weight files — is unaffected.
    i = Input(shape=(max_sequence_length, ))
    fasttext = Embedding(embedding_rows, embed_size, weights=[embeddings["fasttext"][:embedding_rows, ]], trainable=False)(i)
    numberbatch = Embedding(embedding_rows, embed_size, weights=[embeddings["numberbatch"][:embedding_rows, ]], trainable=False)(i)
    # glove = Embedding(embedding_rows, embed_size, weights=[embeddings["glove"][:embedding_rows, ]], trainable=False)(i)
    fasttext = SpatialDropout1D(spatial_dropout)(fasttext)
    numberbatch = SpatialDropout1D(spatial_dropout)(numberbatch)
    # glove = SpatialDropout1D(spatial_dropout)(glove)
    rnn_size = 128  # max_sequence_length  # 140
    # Earlier experiments (kept as notes): a 2nd bidirectional CuDNNGRU layer
    # didn't help with the training subsample; QRNN and smaller/lower-dropout
    # GRU variants were also tried.
    fasttext = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(fasttext)
    numberbatch = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(numberbatch)
    # glove = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(glove)
    fasttext = Capsule()(fasttext)
    numberbatch = Capsule()(numberbatch)
    # glove = Capsule()(glove)
    # attention = AttentionWithContext()(x)
    x = concatenate([
        fasttext,
        numberbatch,
        # glove
    ])
    x = Flatten()(x)
    d = Dropout(dropout)(x)
    multiclass_label_count = 6  # one sigmoid output per label column
    out = Dense(multiclass_label_count, activation="sigmoid")(d)
    model = Model(inputs=i, outputs=out)
    optimizer = "adam"  # Nadam(lr=1e-3)  # 'nadam'  # Nadam(lr=1e-5)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# del model
# Pre-trained matrices consumed by build_model, keyed by the names it looks up.
embeddings = dict(
    fasttext=fasttext_embedding_matrix,
    numberbatch=numberbatch_embedding_matrix,
    # glove=glove_embedding_matrix,
)
model = build_model(
    max_sequence_length=max_sequence_length,
    word_count=word_count,
    embed_size=embed_size,
    embeddings=embeddings,
)
# del models
# models = [fasttext_model, numberbatch_model]

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_padded, y_train, test_size=validation_fraction)


class RocAucEvaluation(Callback):
    """Keras callback that tracks validation ROC AUC and checkpoints the best weights.

    After every epoch the model predicts on the held-out data, the ROC AUC is
    computed with ``roc_auc_score`` and stored in ``logs['val_auc']``. Whenever
    the score beats the best one seen so far, the predictions are kept in
    ``self.y_pred`` and the weights are saved in the working directory as
    ``weights.[<fold>-]<epoch>-<auc>.hdf5``.

    Adapted from
    https://www.kaggle.com/demesgal/lstm-glove-lr-decrease-bn-cv-lb-0-047/comments
    """

    def __init__(self, validation_data=(), max_epoch=20, cross_validation_fold=None):
        """
        Args:
            validation_data: ``(X_val, y_val)`` pair used for scoring. The empty
                default cannot be unpacked, so a pair must be supplied.
            max_epoch: initial value of ``stopped_epoch``; replaced by the
                1-based number of the best epoch once a score improves.
            cross_validation_fold: current fold number (integer), or ``None``
                when not cross-validating; included in the checkpoint name.
        """
        # Zero-argument super() runs Callback.__init__; the previous
        # ``super(Callback, self)`` started the lookup *after* Callback and
        # therefore skipped the base-class initialisation.
        super().__init__()

        self.stopped_epoch = max_epoch
        self.best = 0.0
        self.X_val, self.y_val = validation_data
        self.y_pred = np.zeros(self.y_val.shape)
        self.cross_validation_fold = cross_validation_fold  # current fold number (integer)

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data; save the weights on improvement.

        Args:
            epoch: zero-based epoch index passed by Keras.
            logs: metrics dict of the epoch; receives the ``'val_auc'`` entry.
        """
        # Avoid a shared mutable default argument.
        if logs is None:
            logs = {}

        y_pred = self.model.predict(self.X_val, verbose=1)
        # Important lines: compute the score and expose it through ``logs``.
        current = roc_auc_score(self.y_val, y_pred)
        logs['val_auc'] = current

        if current > self.best:  # save model
            self.best = current
            self.y_pred = y_pred
            self.stopped_epoch = epoch + 1
            if self.cross_validation_fold is not None:
                filename = "weights.{fold:02d}-{epoch:02d}-{val_auc:.4f}.hdf5".format(fold=self.cross_validation_fold, epoch=(epoch + 1), val_auc=current)
            else:
                filename = "weights.{epoch:02d}-{val_auc:.4f}.hdf5".format(epoch=(epoch + 1), val_auc=current)
            print("saving " + filename)
            self.model.save_weights(filename, overwrite=True)

        print("val_auc: {:.4f}".format(current))

In [24]:
%%time

# Predict the test set once per saved checkpoint; the results are averaged later.
y_pred = []
# "<best epoch>-<validation AUC>" of each fold's checkpoint; the list position is the fold number.
best_epochs_aucs = ["13-0.9904", "09-0.9912", "15-0.9917", "11-0.9891", "13-0.9870", "06-0.9890"]
for fold, epoch_and_auc in enumerate(best_epochs_aucs):
    # Use the same zero-padded "{fold:02d}" pattern the checkpoints are saved
    # with; the former hard-coded "0" prefix produced wrong names for fold >= 10.
    filename = "weights.{fold:02d}-{suffix}.hdf5".format(fold=fold, suffix=epoch_and_auc)
    model.load_weights(filename)
    y_pred.append(model.predict(X_test_padded, batch_size=1024))

CPU times: user 5min 25s, sys: 54.6 s, total: 6min 20s
Wall time: 6min 15s


In [25]:
# Ensemble by plain averaging: element-wise mean over the per-checkpoint
# prediction arrays collected in y_pred, computed in float64.
avg = np.mean(y_pred, axis=0, dtype=np.float64)

In [16]:
# Peek at the first five rows of the averaged predictions.
avg[:5]

array([[9.98136640e-01, 5.54723285e-01, 9.88775641e-01, 2.58751627e-01,
        9.69367057e-01, 7.22821534e-01],
       [3.85550862e-04, 3.13692194e-05, 1.13252223e-04, 4.93097951e-06,
        6.81916908e-05, 3.07749438e-05],
       [2.87464583e-04, 6.40841508e-05, 1.59963225e-04, 9.45015040e-06,
        7.99759846e-05, 2.88790961e-05],
       [1.60294645e-04, 2.77619270e-05, 1.13022357e-04, 6.64743420e-05,
        7.52192796e-05, 1.68690364e-05],
       [4.89587826e-03, 1.10038109e-04, 8.88845309e-04, 9.46112568e-05,
        1.62319113e-04, 1.99636927e-05]])

In [19]:
# NOTE(review): this echoes the whole list of per-checkpoint prediction arrays;
# a slice such as [p[:5] for p in y_pred] would keep the saved output small.
y_pred

[array([[9.9721247e-01, 5.6057703e-01, 9.8593444e-01, 2.6531163e-01,
         9.6396595e-01, 6.5808964e-01],
        [3.9361967e-04, 3.0636242e-05, 6.7308465e-05, 2.7412323e-06,
         9.2376627e-05, 2.8745797e-05],
        [2.4020483e-04, 6.5258209e-05, 1.2697655e-04, 8.2108581e-06,
         6.4704713e-05, 3.8468428e-05],
        ...,
        [4.0285816e-04, 1.9132201e-05, 4.6927566e-04, 9.0065969e-06,
         1.4896499e-04, 2.6274722e-05],
        [1.5903384e-03, 1.3613405e-04, 6.7367509e-04, 2.2098635e-04,
         7.0680992e-04, 6.3180458e-03],
        [9.8556203e-01, 2.1041648e-02, 9.2519379e-01, 2.7988341e-03,
         7.1517563e-01, 4.5923088e-03]], dtype=float32),
 array([[9.98364151e-01, 5.15149117e-01, 9.89853084e-01, 2.63934106e-01,
         9.69017029e-01, 7.06804812e-01],
        [5.54008700e-04, 3.33567841e-05, 1.20732504e-04, 7.68083646e-06,
         8.33393278e-05, 3.39221151e-05],
        [3.33472010e-04, 5.28167693e-05, 2.12035302e-04, 1.32755113e-05,
         9.67

In [26]:
# Write the averaged predictions into the label columns of the sample
# submission frame, then save it without the index column.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submission[label_columns] = avg
submission.to_csv('submission.csv', index=False)

In [None]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "(5 + 8/16) of 10 fold cross validation average ensemble"