In [1]:
import os

# Expose only CUDA device 0 to this process. This is set before keras is imported further down,
# so the backend sees the restriction when it initializes.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import pandas as pd

# Labeled training comments.
train = pd.read_csv('train.csv')

# Uncomment to prototype models on a 10% subsample. The full file has 159571 rows
# (157975 for training + 1596 for validation at the 1% split used below).
# train = train.sample(frac=0.1)
# Fraction of the training rows held out for validation; 0.01 (1%) keeps almost the
# complete training set available for fitting.
validation_fraction = 0.01

In [3]:
# Read the test comments and the sample submission file.
test, submission = (
    pd.read_csv(csv_name) for csv_name in ('test.csv', 'sample_submission.csv')
)

# Comment texts as numpy arrays; missing comments are replaced by the literal string "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values for frame in (train, test)
)
# One target column per toxicity label, in the column order used for the submission.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [4]:
from keras.preprocessing import text, sequence

# Vocabulary cap passed to the Tokenizer as num_words; None means no cap (30000 is the noted alternative).
max_features = None
# Width of one embedding vector; should match embedding file.
embed_size = 300

# Fit the vocabulary on the train and test comments together, then turn every
# comment into its list of word indices.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])
X_train_tokenized, X_test_tokenized = (
    tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test)
)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Number of token ids kept per comment. Original note: 100, 200, 256 worsened validation AUC score
# (100 appears to have been the earlier value).
maxlen = 128

# Bring every tokenized comment to exactly `maxlen` ids with pad_sequences' default settings.
X_train_padded, X_test_padded = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (X_train_tokenized, X_test_tokenized)
)

In [6]:
import numpy as np

word_index = tokenizer.word_index  # len(word_index) == 394787
# The Tokenizer numbers words from 1 (index 0 is what pad_sequences pads with), so the largest
# index equals len(word_index). The embedding matrix -- and the Embedding layer's input_dim,
# which reuses nb_words -- therefore needs len(word_index) + 1 rows; without the +1 the last
# vocabulary word would index past the end of the matrix.
nb_words = len(word_index) + 1  # with a vocabulary cap: min(max_features, len(word_index) + 1)
# embedding_matrix = np.random.uniform(-1.0, 1.0, (nb_words, embed_size))  # in case you don't want to use pre-trained embeddings
# Row i holds the vector of the word with index i; rows stay all-zero until filled from the embedding file.
embedding_matrix = np.zeros((nb_words, embed_size))

In [7]:
import numpy as np

# Path of the pre-trained word-vector file (fastText); its vector width has to equal embed_size.
EMBEDDING_FILE = 'crawl-300d-2M.vec'  # fasttext

def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token, i.e. the first field of an embedding-file line.
        *arr: the remaining fields -- the vector components, as numbers or numeric strings.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 numpy array built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into {word: float32 vector}.
# - The file is opened in a `with` block (explicitly as UTF-8) so the handle is closed again.
# - Splitting from the right at most `embed_size` times keeps the token intact even if it
#   contains a space.
# - Lines that do not carry exactly `embed_size` components are skipped. This drops the
#   "<vocab size> <dim>" header line of .vec files, which would otherwise be stored as a word
#   with a 1-element vector and later be broadcast across a whole embedding row.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ', embed_size)
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# Copy the pre-trained vector of every vocabulary word that has one into its row of the
# embedding matrix; words without a pre-trained vector keep their initial row.
for word, i in word_index.items():
    # if i >= max_features:
    #     break
    if i >= embedding_matrix.shape[0]:
        # No row for this index (matrix smaller than the vocabulary, e.g. when it is capped).
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
from keras.models import Model
from keras.layers import Bidirectional, concatenate, CuDNNGRU, Dense, Dropout, Embedding, Flatten, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, SpatialDropout1D
from keras.optimizers import Nadam
# from keras.regularizers import l2
# from keras.constraints import maxnorm

# from qrnn import QRNN
# from attention import AttentionWithContext

def build_model():
    """Assemble and compile the bidirectional-GRU comment classifier.

    Data flow: padded token ids -> Embedding initialised from ``embedding_matrix``
    -> one bidirectional CuDNNGRU returning the full sequence -> global average
    pooling and global max pooling over that sequence, concatenated -> Dense layer
    with 6 sigmoid outputs (one per label).

    Reads the module-level ``maxlen``, ``nb_words``, ``embed_size`` and
    ``embedding_matrix``. The GRU width is set equal to ``maxlen``.

    Experiment notes carried over from earlier iterations:
      * SpatialDropout1D(0.1) after the embedding didn't improve val_auc.
      * A 2nd bidirectional CuDNNGRU layer didn't help with the training subsample.
      * QRNN(rnn_size, window_size=7) was tried in place of / after the GRU.
      * Adding AttentionWithContext or Flatten branches to the concatenation: no benefit.
      * Dropout(0.01) before the output layer didn't improve val_auc.
      * An explicit learning rate (lr=1e-5) was tried for the optimizer.

    Returns:
        The compiled keras ``Model`` (binary cross-entropy loss, 'nadam' optimizer,
        accuracy metric).
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(nb_words, embed_size, weights=[embedding_matrix])(token_ids)

    rnn_size = maxlen
    encoder = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))
    sequence_features = encoder(embedded)

    # Summarise the per-timestep GRU outputs in two ways and join the summaries.
    pooled = concatenate([
        GlobalAveragePooling1D()(sequence_features),
        GlobalMaxPooling1D()(sequence_features),
    ])

    multiclass_label_count = 6
    predictions = Dense(multiclass_label_count, activation="sigmoid")(pooled)

    model = Model(inputs=token_ids, outputs=predictions)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return model

# Drop the model left over from a previous run of this cell (if there is one) before
# building a fresh one. A bare `del model` raises NameError the first time this cell runs
# on a fresh kernel, which breaks Restart & Run All.
globals().pop('model', None)
model = build_model()

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# Hold out `validation_fraction` of the padded training rows for validation.
# The seed is fixed so the (small) validation set is the same on every run; otherwise
# val_auc values from different runs / hyper-parameter settings are not comparable.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train_padded, y_train, test_size=validation_fraction, random_state=42)


class RocAucEvaluation(Callback):
    """Keras callback that computes ROC AUC on a held-out set at the end of every epoch.

    After each epoch the model predicts on ``X_val``. The score is printed, stored in
    ``logs['val_auc']`` and, whenever it beats the best score seen so far, the model
    weights are written to ``weights.<epoch>-<val_auc>.hdf5`` in the working directory.

    Adapted from
    https://www.kaggle.com/demesgal/lstm-glove-lr-decrease-bn-cv-lb-0-047/comments

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring.
        max_epoch: initial value of ``stopped_epoch``; it is overwritten with the
            1-based number of the best epoch as soon as a score improves.

    Attributes set by the callback:
        best: best ROC AUC seen so far (starts at 0.0).
        y_pred: validation predictions of the best epoch (zeros until then).
        stopped_epoch: 1-based number of the best epoch.

    Raises:
        ValueError: if ``validation_data`` is not a 2-item ``(X_val, y_val)`` pair.
    """

    def __init__(self, validation_data=(), max_epoch=20):
        # Name this class here, not Callback: super(Callback, self) starts the lookup
        # *after* Callback in the MRO and therefore never runs Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        if len(validation_data) != 2:
            raise ValueError("validation_data must be an (X_val, y_val) pair")

        self.stopped_epoch = max_epoch
        self.best = 0.0
        self.X_val, self.y_val = validation_data
        self.y_pred = np.zeros(self.y_val.shape)

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation set and save the weights if the score improved.

        Args:
            epoch: 0-based epoch index passed in by Keras.
            logs: metrics dict of the epoch; ``val_auc`` is added to it when it is given.
        """
        y_pred = self.model.predict(self.X_val, verbose=1)
        current = roc_auc_score(self.y_val, y_pred)
        if logs is not None:
            # Written into the caller's dict so that whoever reads `logs` afterwards sees it.
            logs['val_auc'] = current

        if current > self.best:  # save model
            self.best = current
            self.y_pred = y_pred
            self.stopped_epoch = epoch + 1
            filename = "weights.{epoch:02d}-{val_auc:.4f}.hdf5".format(epoch=(epoch + 1), val_auc=current)
            print("saving " + filename)
            self.model.save_weights(filename, overwrite=True)

        print("val_auc: {:.4f}".format(current))


# Callback instances that can be handed to model.fit(callbacks=[...]).
# auc: scores (X_val, y_val) with ROC AUC after every epoch and saves improved weights.
auc = RocAucEvaluation(validation_data=(X_val, y_val))
# checkpoint: saves the weights after every epoch. The epoch is zero-padded ({epoch:02d}) like in
# RocAucEvaluation's file names; the previous {epoch:2d} padded with a space ("weights. 1-...").
checkpoint = ModelCheckpoint("weights.{epoch:02d}-{val_loss:.4f}.hdf5",
                             verbose=1,
                             # save_best_only=True,
                             save_weights_only=True)
# lr_reduction: lowers the learning rate when the monitored metric stops improving for one epoch.
lr_reduction = ReduceLROnPlateau(patience=1, verbose=1)

In [16]:
# Mini-batch size. Experiment note: 1024 lowered the AUC score, both when training was continued
# with the bigger batch size after a small one and when starting with the big batch size and
# then continuing with a smaller one.
batch_size = 64
epochs = 2
# Fit on the training split. (X_val, y_val) is passed for Keras' own validation metrics, and the
# `auc` callback additionally scores it with ROC AUC after every epoch and saves improved weights.
history = model.fit(X_train_split, y_train_split, 
                    batch_size=batch_size, 
                    epochs=epochs, 
                    # validation_split=0.0,
                    validation_data=(X_val, y_val),
                    callbacks=[auc, 
                               # checkpoint, 
                               # lr_reduction
                              ], 
                    verbose=1)

  "This may consume a large amount of memory." % num_elements)


Train on 157975 samples, validate on 1596 samples
Epoch 1/2
val_auc: 0.984832
Epoch 2/2
val_auc: 0.986425


In [17]:
# Restore weights that a training run saved to disk.
# NOTE(review): the file name is hard-coded and does not match the val_auc values printed by the
# fit cell above, so it presumably comes from an earlier run. RocAucEvaluation prints
# "saving weights.<epoch>-<val_auc>.hdf5" when it writes a file -- confirm this name against the
# current run before predicting.
# model.load_weights("weights.01-0.987421.hdf5")  # batch size 16
model.load_weights("weights.01-0.9882.hdf5")  # batch size 32


In [18]:
# Predict all six label outputs for the padded test comments, write them into the matching
# columns of the sample submission frame and save it as submission.csv (without the index).
y_pred = model.predict(X_test_padded, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission.csv', index=False)

In [19]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "increased batch size to 32 from 16"

Successfully submitted to Toxic Comment Classification Challenge

In [16]:
# Continue training the current model without validation data or callbacks. With initial_epoch=1
# and epochs=2 exactly one more epoch runs (logged as "Epoch 2/2").
# NOTE(review): the recorded output of this cell reports 159571 samples, whereas the earlier fit
# on X_train_split reported 157975 -- the output was presumably produced with the full training
# set as input. Confirm which inputs are intended here.
history = model.fit(X_train_split, y_train_split, 
                    batch_size=batch_size, 
                    epochs=2, 
                    validation_split=0.0,
                    # validation_data=(X_val, y_val),
                    # callbacks=[auc, 
                               # checkpoint, 
                               # lr_reduction
                    #           ], 
                    verbose=1,
                    initial_epoch=1
                   )

Epoch 2/2
 13232/159571 [=>............................] - ETA: 24:43 - loss: 0.0361 - acc: 0.9861

KeyboardInterrupt: 

In [None]:
# Same prediction/export steps as after the first training run, repeated for the model as it is
# after the continued training: predict on the padded test comments, fill the six label columns
# of the submission frame and overwrite submission.csv.
y_pred = model.predict(X_test_padded, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission.csv', index=False)

In [None]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "added attention & reduced batch size to 8 from 16, whole training set epoch 2/2"