In [1]:
import os

# Expose only GPU 0 to this process. This cell runs before the Keras import
# in a later cell, so the setting is in place when the backend starts up.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import pandas as pd

train = pd.read_csv('train.csv')

# Prototype on a 5% subsample (the author notes 157975 rows in the full file).
# A fixed random_state makes every Restart & Run All draw the same rows, so
# scores from different runs are comparable.
train = train.sample(frac=0.05, random_state=42)

# Share of the (sub)sampled training data held out for validation.
# Author's note: change to 1% (0.01) when training on the complete training set.
validation_fraction = 0.1

In [3]:
# Test comments to score, and the submission template whose label columns
# get filled in once predictions exist.
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Raw comment text as arrays; missing comments become the literal token "fillna".
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

# One binary column per label: shape (n_samples, 6).
y_train = train[
    [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
].values

In [4]:
from keras.preprocessing import text, sequence

max_features = None  # None = no vocabulary cap (author's alternative: 30000)
maxlen = 128  # tokens kept per comment (author's alternative: 100)
embed_size = 300  # should match embedding file

# Fit a single vocabulary over train + test text so both share word indices.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Tokenise and pad into new names instead of overwriting X_train / X_test.
# Rebinding the raw-text arrays to token-id lists made this cell non-idempotent:
# a second run would fit the tokenizer on lists of integers rather than text.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
import numpy as np

# Pretrained word-vector file (fasttext).
EMBEDDING_FILE = 'crawl-300d-2M.vec'


def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; the remaining positional values are
    converted to a 1-D float32 numpy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into {word: float32 vector}.
#  * `with` closes the file handle once parsing is done.
#  * utf-8 is given explicitly so decoding does not depend on the platform locale.
#  * rsplit(' ', embed_size) splits from the right, so only the last
#    `embed_size` fields are treated as numbers.
#  * Lines without exactly `embed_size` values are skipped, e.g. the
#    "<word count> <dimension>" header at the top of .vec files, which would
#    otherwise be stored as a bogus one-element "vector".
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, vector = get_coefs(*line.rstrip().rsplit(' ', embed_size))
        if vector.size == embed_size:
            embeddings_index[word] = vector

word_index = tokenizer.word_index  # author's note: len(word_index) == 394787

# Keras Tokenizer indices start at 1 (0 is reserved, and is what pad_sequences
# pads with), so the largest index equals len(word_index). The matrix therefore
# needs len(word_index) + 1 rows; with only len(word_index) rows the last word
# is out of range both here and in the Embedding layer sized by nb_words.
nb_words = len(word_index) + 1
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the pretrained file keep their all-zero row.
        embedding_matrix[i] = embedding_vector

In [19]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

def build_model(rnn_size=128, label_count=6):
    """Build and compile the pooled bidirectional-GRU classifier.

    Layers: embedding initialised from the module-level ``embedding_matrix``
    -> bidirectional GRU returning the full sequence -> global average pooling
    and global max pooling over the sequence, concatenated -> dense sigmoid
    output with one unit per label.

    Reads the module-level names ``maxlen`` (input sequence length) and
    ``embedding_matrix`` (initial embedding weights).

    Args:
        rnn_size: number of GRU units per direction (default 128, the value
            that was previously hard-coded).
        label_count: number of output units, one per binary label (default 6,
            the value that was previously hard-coded).

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, 'nadam'
        optimizer, accuracy metric).
    """
    # Size the embedding layer from the weight matrix itself so the layer
    # and its initial weights can never disagree.
    vocab_size, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix])(inp)
    # x = SpatialDropout1D(0.2)(x)  # left disabled, as in the original experiment
    x = Bidirectional(GRU(rnn_size, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    # Sigmoid (not softmax): each label is scored independently.
    outp = Dense(label_count, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])

    return model

# Drop a model left over from an earlier run of this cell before building a
# new one. A bare `del model` raises NameError on a fresh kernel, where no
# model exists yet, so the missing-name case is tolerated.
try:
    del model
except NameError:
    pass
model = build_model()

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# Hold out `validation_fraction` of the padded training sequences. A fixed
# random_state keeps the split identical across runs, so validation scores
# are comparable between experiments.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, test_size=validation_fraction, random_state=42)


class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: an ``(X_val, y_val)`` pair. The empty-tuple default
            cannot be unpacked, so a real pair must always be passed.
        interval: evaluate whenever the zero-based epoch index is a multiple
            of this value (1 = every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class in super(): super(Callback, self) starts the MRO
        # lookup *after* Callback and so skips Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the held-out data and print ROC-AUC for this epoch.

        ``logs`` is accepted for Keras' callback signature but not used; its
        default is None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # `epoch` is zero-based; report it one-based like Keras' own log.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch + 1, score))

            
# Per-epoch ROC-AUC report on the held-out split.
RocAuc = RocAucEvaluation(interval=1, validation_data=(X_val, y_val))

# Weights-only checkpoints, written only when the monitored metric improves
# (save_best_only=True); epoch number and val_loss go into the file name.
checkpoint = ModelCheckpoint(
    "weights.{epoch:02d}-{val_loss:.4f}.hdf5",
    save_weights_only=True,
    save_best_only=True,
    verbose=1,
)

# Lower the learning rate after a single epoch without improvement (patience=1).
lr_reduction = ReduceLROnPlateau(verbose=1, patience=1)

In [20]:
# Training hyper-parameters.
batch_size = 16
epochs = 8

# Fit on the training split and validate on the held-out split every epoch.
# The `checkpoint` callback is deliberately not in the callback list for this run.
history = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc, lr_reduction],
)

Train on 7181 samples, validate on 798 samples
Epoch 1/8

 ROC-AUC - epoch: 1 - score: 0.979808 

Epoch 2/8

 ROC-AUC - epoch: 2 - score: 0.985244 

Epoch 3/8

 ROC-AUC - epoch: 3 - score: 0.989557 

Epoch 4/8

 ROC-AUC - epoch: 4 - score: 0.985796 

Epoch 5/8

 ROC-AUC - epoch: 5 - score: 0.985607 


Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/8

 ROC-AUC - epoch: 6 - score: 0.985155 


Epoch 00006: ReduceLROnPlateau reducing learning rate to 2.0000000949949027e-05.
Epoch 7/8

 ROC-AUC - epoch: 7 - score: 0.985092 


Epoch 00007: ReduceLROnPlateau reducing learning rate to 2.0000001313746906e-06.
Epoch 8/8

 ROC-AUC - epoch: 8 - score: 0.985085 


Epoch 00008: ReduceLROnPlateau reducing learning rate to 2.000000222324161e-07.


In [9]:
# Score the padded test sequences in large batches (inference only).
y_pred = model.predict(x_test, batch_size=1024)

# Write the six per-label probability columns into the submission template,
# in the same order as the training labels, then save without the index.
submission[
    [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
] = y_pred
submission.to_csv('submission.csv', index=False)