In [1]:
import os

# Limit this process to CUDA device 0. The variable is set in the first cell,
# before any GPU-backed library is imported further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import pandas as pd

# Share of the labeled rows held out for validation (0.01 == 1%, the setting
# used when training on the complete training set).
validation_fraction = 0.01

# Labeled training data.
train = pd.read_csv('train.csv')
# For quick prototyping, uncomment to work on a 5% sample instead
# (the original note counts 157975 rows in total):
# train = train.sample(frac=0.05)

In [3]:
# Unlabeled test comments and the submission template that is filled in later.
submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')

# Raw comment strings; missing comments are replaced by the literal "fillna".
X_train = train.loc[:, "comment_text"].fillna("fillna").values
X_test = test.loc[:, "comment_text"].fillna("fillna").values

# Target array: one column per label, in the order listed here.
y_train = train.loc[
    :,
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
].values

In [4]:
from keras.preprocessing import text, sequence

max_features = None  # vocabulary cap handed to the Tokenizer (30000 was the earlier value)
maxlen = 128   # length passed to pad_sequences; doubling to 256 worsened validation AUC (100 was tried before)
embed_size = 300  # should match embedding file

# The vocabulary is fitted on the train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# NOTE: X_train / X_test are rebound from raw text to token-id sequences here,
# so this cell cannot be re-run without first re-running the loading cell.
X_train, X_test = (
    tokenizer.texts_to_sequences(X_train),
    tokenizer.texts_to_sequences(X_test),
)

# Lower-case x_* names hold the padded arrays built from those sequences.
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (X_train, X_test)
)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
import numpy as np

# Path of the pretrained fastText word-vector file.
EMBEDDING_FILE = "crawl-300d-2M.vec"

def get_coefs(word, *arr):
    """Pair a word with its vector.

    Args:
        word: the token (first field of an embedding-file row).
        *arr: the remaining fields, converted to a float32 array.

    Returns:
        ``(word, vector)`` where ``vector`` is a 1-D ``float32`` numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into {word: float32 vector}. The file is opened in a
# `with` block so the handle is closed again, and decoded explicitly as UTF-8.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, vector = get_coefs(*line.rstrip().rsplit(' '))
        # fastText .vec files start with a "<vocab size> <dimension>" header row.
        # Parsed as a word it yields a 1-element vector, which numpy would
        # broadcast across a whole matrix row if that token occurs in the
        # vocabulary. Skip every row whose vector is not embed_size long.
        if vector.shape[0] != embed_size:
            continue
        embeddings_index[word] = vector

word_index = tokenizer.word_index  # len(word_index) == 394787 in the recorded run
# Tokenizer indices start at 1 (0 is the padding value), so the largest index
# equals len(word_index). The matrix, and the Embedding layer sized from
# nb_words, need one extra row for that index to be addressable.
nb_words = len(word_index) + 1
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words without a pretrained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [6]:
from keras.models import Model
from keras.layers import Bidirectional, concatenate, CuDNNGRU, Dense, Embedding, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, SpatialDropout1D
# from keras.regularizers import l2
# from keras.constraints import maxnorm

# from qrnn import QRNN

def build_model():
    """Assemble and compile the comment classifier.

    Architecture: embedding lookup initialised from ``embedding_matrix`` ->
    one bidirectional CuDNN GRU returning the full sequence -> global average
    pooling and global max pooling over that sequence, concatenated -> six
    sigmoid outputs (one per label).

    Reads the notebook-level ``maxlen``, ``nb_words``, ``embed_size`` and
    ``embedding_matrix``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the
        ``nadam`` optimizer and an accuracy metric.
    """
    n_labels = 6
    gru_units = maxlen  # the GRU width is tied to the padded sequence length

    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(nb_words, embed_size, weights=[embedding_matrix])(tokens)
    # Experiments left disabled here: SpatialDropout1D(0.1) after the embedding,
    # a second bidirectional GRU layer (didn't help with the training
    # subsample), and a QRNN layer with window_size=7.
    encoded = Bidirectional(CuDNNGRU(gru_units, return_sequences=True))(embedded)

    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probabilities = Dense(n_labels, activation="sigmoid")(pooled)

    classifier = Model(inputs=tokens, outputs=probabilities)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['accuracy'],
    )
    return classifier


model = build_model()

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# Hold out `validation_fraction` of the padded training data for validation.
# A fixed random_state keeps the split identical between runs, so val_loss and
# ROC-AUC figures from different runs are measured on the same rows.
# NOTE: y_train is rebound to its own training part here, so this statement can
# only run once per kernel session without re-running the data-loading cells.
X_train, X_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=validation_fraction, random_state=42)


class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: evaluate whenever the zero-based epoch index is a multiple
            of this value (1 means every epoch).

    Raises:
        ValueError: if ``validation_data`` does not unpack into exactly two
            items (this includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1):
        # super() must name this class, not Callback; otherwise
        # Callback.__init__ is skipped and the base class is never initialised.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the ROC-AUC of the current model on the held-out data.

        Args:
            epoch: zero-based epoch index supplied by Keras; printed one-based.
            logs: metrics dict supplied by Keras; not used here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))

            
# Callbacks handed to model.fit:
#   RocAuc       - prints the validation ROC-AUC after every epoch.
#   checkpoint   - writes the weights after every epoch, with the epoch number
#                  and val_loss in the file name (save_best_only is left off,
#                  so every epoch is kept).
#   lr_reduction - lowers the learning rate after one epoch without improvement,
#                  using Keras' default monitored quantity and factor.
RocAuc = RocAucEvaluation((X_val, y_val), interval=1)
lr_reduction = ReduceLROnPlateau(verbose=1, patience=1)
checkpoint = ModelCheckpoint(
    filepath="weights.{epoch:02d}-{val_loss:.4f}.hdf5",
    save_weights_only=True,
    verbose=1,
)

In [8]:
epochs = 8
# Small batches on purpose: 1024 lowered the AUC score, both when switching to
# the bigger batch size after starting small and when starting big and then
# continuing with the smaller size.
batch_size = 16

# Train on the split made earlier; the callbacks report ROC-AUC, checkpoint the
# weights and reduce the learning rate.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[RocAuc, checkpoint, lr_reduction],
    verbose=1,
)

  "This may consume a large amount of memory." % num_elements)


Train on 157975 samples, validate on 1596 samples
Epoch 1/8

 ROC-AUC - epoch: 1 - score: 0.987201 


Epoch 00001: saving model to weights.01-0.0484.hdf5
Epoch 2/8

 ROC-AUC - epoch: 2 - score: 0.987098 


Epoch 00002: saving model to weights.02-0.0524.hdf5
Epoch 3/8

 ROC-AUC - epoch: 3 - score: 0.984244 


Epoch 00003: saving model to weights.03-0.0540.hdf5

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 4/8

 ROC-AUC - epoch: 4 - score: 0.984149 


Epoch 00004: saving model to weights.04-0.0575.hdf5

Epoch 00004: ReduceLROnPlateau reducing learning rate to 2.0000000949949027e-05.
Epoch 5/8

 ROC-AUC - epoch: 5 - score: 0.984227 


Epoch 00005: saving model to weights.05-0.0588.hdf5

Epoch 00005: ReduceLROnPlateau reducing learning rate to 2.0000001313746906e-06.
Epoch 6/8

 ROC-AUC - epoch: 6 - score: 0.984221 


Epoch 00006: saving model to weights.06-0.0590.hdf5

Epoch 00006: ReduceLROnPlateau reducing learning rate to 2.000000222324161e-07.

In [9]:
# Restore the checkpoint of the epoch with the lowest validation loss.
# The file name is derived from the losses recorded during training and from
# the checkpoint callback's own name template, instead of being hard-coded:
# a literal name embeds one particular run's val_loss and no longer exists
# after retraining. model.history is used rather than the `history` variable
# because the latter is never assigned when training is interrupted.
val_losses = model.history.history['val_loss']
best_epoch = min(range(len(val_losses)), key=val_losses.__getitem__)
# ModelCheckpoint numbers epochs from 1 when it formats the file name.
best_weights_path = checkpoint.filepath.format(
    epoch=best_epoch + 1, val_loss=val_losses[best_epoch])
model.load_weights(best_weights_path)

In [10]:
# Score the padded test comments with the restored model, then write one
# prediction column per label into the submission template and save it.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_pred = model.predict(x_test, batch_size=1024)
submission[label_columns] = y_pred
submission.to_csv('submission.csv', index=False)

In [11]:
# Shell escape: upload submission.csv to the Jigsaw toxic-comment competition
# through the Kaggle command-line client, with the -m text as the message.
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "initial baseline with max embedding matrix"

Successfully submitted to Toxic Comment Classification Challenge