In [1]:
import os
import pandas as pd

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
train = pd.read_csv('train.csv')

# train = train.sample(frac=0.1)  # 157975 original total, so let's prototype models with a fraction of that
validation_fraction = 0.1  # change to 1% for training on complete training set

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

train["comment_text"].fillna("fillna")
test["comment_text"].fillna("fillna")

X_train = train["comment_text"].str.lower()
print(len(X_train))
# X_train.to_csv("X_train.csv", index=False)
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
print(len(y_train))

X_test = test["comment_text"].str.lower()
# X_test.to_csv("X_test.csv", index=False)
print(len(X_test))

159571
159571
153164


In [5]:
X_train[:5]

0    explanation\nwhy the edits made under my usern...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    "\nmore\ni can't make any real suggestions on ...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object

In [11]:
data_directory = "data"
training_directory = data_directory + "/train"
test_directory = data_directory + "/test"

! mkdir -p {training_directory}
! mkdir -p {test_directory}

for index, row in train[["id", "comment_text"]].iterrows():
    filename = row["id"] + ".comment.txt"
    with open(training_directory + "/" + filename, "w") as file:
        file.write(row["comment_text"])

In [12]:
for index, row in test[["id", "comment_text"]].iterrows():
    filename = row["id"] + ".comment.txt"
    with open(test_directory + "/" + filename, "w") as file:
        file.write(row["comment_text"])

In [13]:
! rm -f {training_directory}/*.clean
! rm -f {test_directory}/*.clean

In [14]:
%%bash

for file in data/train/*.comment.txt
do
    ./preprocess_text.sh ${file}
done

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
%%bash

for file in data/test/*.comment.txt
do
    ./preprocess_text.sh ${file}
done

In [19]:
! find -type f -wholename './{training_directory}/*.clean' | wc -l 

159571


In [20]:
! find -type f -wholename './{test_directory}/*.clean' | wc -l

153164


In [21]:
train_bpe_preprocessed = []
for index, row in train[["id"]].iterrows():
    filename = row["id"] + ".comment.txt.clean"
    with open(training_directory + "/" + filename, "r") as file:
        comment_text = file.read()
        train_bpe_preprocessed.append({"id" : row["id"], "comment_text" : comment_text})

train_bpe_preprocessed_df = pd.DataFrame.from_records(train_bpe_preprocessed)
train_bpe_preprocessed_df.head()

Unnamed: 0,comment_text,id
0,explanation\nwhy the edits made under my usern...,0000997932d777bf
1,d'aww! he matches this background colour i'm s...,000103f0d9cfb60f
2,"hey man, i'm really not trying to edit war. it...",000113f07ec002fd
3,"""\nmore\ni can't make any real suggestions on ...",0001b41b1c6bb37e
4,"you, sir, are my hero. any chance you remember...",0001d958c54c6e35


In [22]:
len(train_bpe_preprocessed_df)

159571

In [23]:
test_bpe_preprocessed = []
for index, row in test[["id"]].iterrows():
    filename = row["id"] + ".comment.txt.clean"
    with open(test_directory + "/" + filename, "r") as file:
        comment_text = file.read()
        test_bpe_preprocessed.append({"id" : row["id"], "comment_text" : comment_text})

test_bpe_preprocessed_df = pd.DataFrame.from_records(test_bpe_preprocessed)
test_bpe_preprocessed_df.head()
print(len(test_bpe_preprocessed_df))

153164


In [27]:
X_train_bpe_preprocessed = train_bpe_preprocessed_df["comment_text"]
X_train_bpe_preprocessed[:5]

0    explanation\nwhy the edits made under my usern...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    "\nmore\ni can't make any real suggestions on ...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object

In [28]:
X_train_bpe_preprocessed.to_csv("X_train.clean", index=False)

X_test_bpe_preprocessed = test_bpe_preprocessed_df["comment_text"]
X_test_bpe_preprocessed.to_csv("X_test.clean", index=False)

In [6]:
X_train_bpe_preprocessed = pd.read_csv("X_train.clean", header=None, names=["comment_text"])["comment_text"]
X_test_bpe_preprocessed = pd.read_csv("X_test.clean", header=None, names=["comment_text"])["comment_text"]

print(len(X_train_bpe_preprocessed))
print(len(X_test_bpe_preprocessed))

159571
153164


In [7]:
X_train_bpe_preprocessed[:5]

0    explanation\nwhy the edits made under my usern...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    "\nmore\ni can't make any real suggestions on ...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object

In [8]:
%%time

from keras.preprocessing import text, sequence

max_features = None  # 30000
embed_size = 300  # should match embedding file

# tokenizer = text.Tokenizer(num_words=max_features)
# all_comments = list(X_train) + list(X_test)
# tokenizer.fit_on_texts(all_comments)
# X_train_tokenized = tokenizer.texts_to_sequences(X_train)
# X_test_tokenized = tokenizer.texts_to_sequences(X_test)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


CPU times: user 780 ms, sys: 58.9 ms, total: 838 ms
Wall time: 859 ms


In [9]:
from collections import OrderedDict
import sentencepiece as spm


class BPETokenizer(text.Tokenizer):
    """Text tokenization utility class.
    This class allows to vectorize a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...
    # Arguments
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: character or string to use for token splitting.
        char_level: if True, every character will be treated as a token.
        oov_token: if given, it will be added to word_index and used to
            replace out-of-vocabulary words during text_to_sequence calls
    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words maybe include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.
    `0` is a reserved index that won't be assigned to any word.
    """
    def __init__(self, num_words=None,
                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True,
                 split=' ',
                 char_level=False,
                 oov_token=None,
                 **kwargs):
        # Legacy support
        if 'nb_words' in kwargs:
            warnings.warn('The `nb_words` argument in `Tokenizer` '
                          'has been renamed `num_words`.')
            num_words = kwargs.pop('nb_words')
        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
        
        self.word_counts = OrderedDict()
        self.word_docs = {}
        self.filters = filters
        self.split = split
        self.lower = lower
        self.num_words = num_words
        self.document_count = 0
        self.char_level = char_level
        self.oov_token = oov_token
        self.index_docs = {}
        
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load("en.wiki.bpe.op200000.model")
    
    def text_to_word_sequence(self, 
                              text,
                              filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                              lower=True, 
                              split=" "):
        """Converts a text to a sequence of words (or tokens).
        # Arguments
            text: Input text (string).
            filters: Sequence of characters to filter out.
            lower: Whether to convert the input to lowercase. (Unused here since input text should already be lowercased)
            split: Sentence split marker (string).
        # Returns
            A list of words (or tokens).
        """
        return self.sp.EncodeAsPieces(text)
    
    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.
        In the case where texts contains lists, we assume each entry of the lists
        to be a token.
        Required before using `texts_to_sequences` or `texts_to_matrix`.
        # Arguments
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of list of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                seq = text
            else:
                seq = self.text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                if w in self.word_docs:
                    self.word_docs[w] += 1
                else:
                    self.word_docs[w] = 1
        
        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]
        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
        
        if self.oov_token is not None:
            i = self.word_index.get(self.oov_token)
            if i is None:
                self.word_index[self.oov_token] = len(self.word_index) + 1
        
        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c

bpe_tokenizer = BPETokenizer(num_words=max_features)

In [10]:
all_comments_bpe_preprocessed = list(X_train_bpe_preprocessed) + list(X_test_bpe_preprocessed)

In [11]:
bpe_tokenizer.fit_on_texts(all_comments_bpe_preprocessed)
X_train_tokenized_bpe = bpe_tokenizer.texts_to_sequences(X_train_bpe_preprocessed)
X_test_tokenized_bpe = bpe_tokenizer.texts_to_sequences(X_test_bpe_preprocessed)

In [12]:
max_sequence_length = 300  # 150  # 128  # 100, 200, 256 worsened validation AUC score  # 100

X_train_padded_bpe = sequence.pad_sequences(X_train_tokenized_bpe, maxlen=max_sequence_length)
X_test_padded_bpe = sequence.pad_sequences(X_test_tokenized_bpe, maxlen=max_sequence_length)

In [13]:
import numpy as np

bpe_word_index = bpe_tokenizer.word_index  # len(bpe_word_index) == 162739
word_count = min(max_features, len(bpe_word_index)) if max_features else len(bpe_word_index)
# embedding_matrix = np.random.uniform(-1.0, 1.0, (nb_words, embed_size))  # in case you don't want to use pre-trained embeddings

In [14]:
len(bpe_word_index)

162739

In [15]:
%%time

import numpy as np
import bcolz


def process_fasttext_line(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(matrix, embeddings_index, word_index):
    for word, i in word_index.items():
        if max_features and i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            matrix[i] = embedding_vector
    return matrix

def build_embedding_matrix(matrix, bcolz_rootdir, embeddings_filename, line_processing_function, word_index):
    try:
        matrix = bcolz.open(rootdir=bcolz_rootdir)
    except FileNotFoundError:
        embeddings_index = dict(line_processing_function(*line.rstrip().rsplit()) for line in open(embeddings_filename))
        matrix = load_embeddings(matrix, embeddings_index, word_index)
        matrix = bcolz.carray(matrix, rootdir=bcolz_rootdir)
        matrix.flush()
    return matrix

bpe_embedding_matrix = np.zeros((word_count + 1, embed_size))
bpe_embedding_matrix = build_embedding_matrix(bpe_embedding_matrix, "en.wiki.bpe.op200000.d300.w2v.bcolz", "en.wiki.bpe.op200000.d300.w2v.txt", process_fasttext_line, bpe_word_index)

CPU times: user 330 ms, sys: 16 ms, total: 346 ms
Wall time: 368 ms


In [16]:
from keras.models import Model, Sequential
from keras.layers import Activation, BatchNormalization, Bidirectional, concatenate, Conv1D, CuDNNGRU, Dense, Dropout, Embedding, Flatten, Input, GlobalAveragePooling1D, GlobalMaxPooling1D, GRU, K, SpatialDropout1D
from keras.optimizers import Adam, Nadam
# from keras.regularizers import l2
# from keras.constraints import maxnorm

# from qrnn import QRNN
# from attention import AttentionWithContext
from capsnet import Capsule

def build_model(max_sequence_length, word_count, embed_size, embeddings, spatial_dropout=0.28, dropout=0.25):
    i = Input(shape=(max_sequence_length, ))
    # fasttext = Embedding(word_count + 1, embed_size, weights=[embeddings["fasttext"][:word_count + 2, ]], trainable=False)(i)
    # numberbatch = Embedding(word_count + 1, embed_size, weights=[embeddings["numberbatch"][:word_count + 2, ]], trainable=False)(i)
    # glove = Embedding(word_count + 1, embed_size, weights=[embeddings["glove"][:word_count + 2, ]], trainable=False)(i)
    bpe = fasttext = Embedding(word_count + 1, embed_size, weights=[embeddings["bpe"][:word_count + 2, ]], trainable=False)(i)
    # fasttext = SpatialDropout1D(spatial_dropout)(fasttext)
    # numberbatch = SpatialDropout1D(spatial_dropout)(numberbatch)
    # glove = SpatialDropout1D(spatial_dropout)(glove)
    bpe = SpatialDropout1D(spatial_dropout)(bpe)
    rnn_size = 128  # max_sequence_length  # 140
    # x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))(x)  # 2nd bidirectional layer didn't help with training subsample
    # x = QRNN(rnn_size, window_size=7, return_sequences=True)(x)
    # x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))(x)
    # x = Bidirectional(GRU(rnn_size, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    # x = Bidirectional(GRU(64, return_sequences=True,dropout=0.3,recurrent_dropout=0.3))(x)
    # x = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(x)
    # fasttext = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(fasttext)
    # numberbatch = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(numberbatch)
    # glove = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(glove)
    bpe = Bidirectional(GRU(rnn_size, activation='relu', return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(bpe)
    # fasttext = Capsule()(fasttext)
    # numberbatch = Capsule()(numberbatch)
    # glove = Capsule()(glove)
    bpe = Capsule()(bpe)
    # attention = AttentionWithContext()(x)
    # x = concatenate([
    #     fasttext, 
    #     numberbatch, 
        # glove
    # ])
    x = bpe
    x = Flatten()(x)
    d = Dropout(dropout)(x)
    multiclass_label_count = 6
    out = Dense(multiclass_label_count, activation="sigmoid")(d)
    model = Model(inputs=i, outputs=out)
    optimizer = "adam"  # Nadam(lr=1e-3)  # 'nadam'  # Nadam(lr=1e-5)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# del model
embeddings = { # "fasttext" : fasttext_embedding_matrix, 
              # "numberbatch" : numberbatch_embedding_matrix,
              # "glove" : glove_embedding_matrix
              "bpe" : bpe_embedding_matrix
             }
# model = build_model(max_sequence_length, word_count, embed_size, embeddings)
# del models
# models = [fasttext_model, numberbatch_model]

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau


# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_padded, y_train, test_size=validation_fraction)


class RocAucEvaluation(Callback):
    """https://www.kaggle.com/demesgal/lstm-glove-lr-decrease-bn-cv-lb-0-047/comments"""
    
    def __init__(self, validation_data=(), max_epoch=20, cross_validation_fold=None):
        super(Callback, self).__init__()

        self.stopped_epoch = max_epoch
        self.best = 0.0
        self.X_val, self.y_val = validation_data
        self.y_pred = np.zeros(self.y_val.shape)
        self.cross_validation_fold = cross_validation_fold  # integer

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.X_val, verbose=1)
        """Important lines"""
        current = roc_auc_score(self.y_val, y_pred)
        logs['val_auc'] = current

        if current > self.best:  # save model
            self.best = current
            self.y_pred = y_pred
            self.stopped_epoch = epoch + 1
            filename = "bpe.weights.{fold:02d}-{epoch:02d}-{val_auc:.4f}.hdf5".format(fold=self.cross_validation_fold, epoch=(epoch + 1), val_auc=current) if self.cross_validation_fold is not None else "bpe.weights.{epoch:02d}-{val_auc:.4f}.hdf5".format(epoch=(epoch + 1), val_auc=current) 
            print("saving " + filename)
            self.model.save_weights(filename, overwrite=True)

        print("val_auc: {:.4f}".format(current))

In [None]:
batch_size = 256  # 32  # 128  # 1024 lowered AUC score even when tried continued training with bigger batch size after small batch size, as well as starting with big batch size and then continuing with smaller size
epochs = 16

kf = KFold(n_splits=10)
fold = 0
for train_index, val_index in kf.split(X_train_padded_bpe, y_train):
    checkpoint = ModelCheckpoint("weights.{epoch:2d}-{val_loss:.4f}.hdf5", 
                                 verbose=1, 
                                 # save_best_only=True, 
                                 save_weights_only=True)
    lr_reduction = ReduceLROnPlateau(patience=1, verbose=1)
    
    X_train_split, X_val_split = X_train_padded_bpe[train_index], X_train_padded_bpe[val_index]
    y_train_split, y_val_split = y_train[train_index], y_train[val_index]
    
    auc = RocAucEvaluation(validation_data=(X_val_split, y_val_split), cross_validation_fold=fold)
    model = build_model(max_sequence_length, word_count, embed_size, embeddings)
    history = model.fit(X_train_split, y_train_split, 
                    batch_size=batch_size, 
                    epochs=epochs, 
                    # validation_split=0.0,
                    validation_data=(X_val_split, y_val_split),
                    callbacks=[auc, 
                               # checkpoint, 
                               # lr_reduction
                              ], 
                    verbose=1)
    del model
    K.clear_session()
    fold += 1

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Train on 143613 samples, validate on 15958 samples
Epoch 1/16
saving bpe.weights.00-01-0.9478.hdf5
val_auc: 0.9478
Epoch 2/16
saving bpe.weights.00-02-0.9654.hdf5
val_auc: 0.9654
Epoch 3/16
saving bpe.weights.00-03-0.9687.hdf5
val_auc: 0.9687
Epoch 4/16
saving bpe.weights.00-04-0.9693.hdf5
val_auc: 0.9693
Epoch 5/16
saving bpe.weights.00-05-0.9731.hdf5
val_auc: 0.9731
Epoch 6/16
saving bpe.weights.00-06-0.9740.hdf5
val_auc: 0.9740
Epoch 7/16
saving bpe.weights.00-07-0.9750.hdf5
val_auc: 0.9750
Epoch 8/16
saving bpe.weights.00-08-0.9754.hdf5
val_auc: 0.9754
Epoch 9/16
saving bpe.weights.00-09-0.9765.hdf5
val_auc: 0.9765
Epoch 10/16
val_auc: 0.9762
Epoch 11/16
saving bpe.weights.00-11-0.9768.hdf5
val_auc: 0.9768
Epoch 12/16
saving bpe.weights.00-12-0.9768.hdf5
val_auc: 0.9768
Epoch 13/16
saving bpe.weights.00-13-0.9773.hdf5
val_auc: 0.9773
Epoch 14/16
val_auc: 0.9768
Epoch 15/16
saving bpe.weights.00-15-0.

Train on 143614 samples, validate on 15957 samples
Epoch 1/16
saving bpe.weights.02-01-0.9556.hdf5
val_auc: 0.9556
Epoch 2/16
saving bpe.weights.02-02-0.9670.hdf5
val_auc: 0.9670
Epoch 3/16
saving bpe.weights.02-03-0.9721.hdf5
val_auc: 0.9721
Epoch 4/16
saving bpe.weights.02-04-0.9744.hdf5
val_auc: 0.9744
Epoch 5/16
saving bpe.weights.02-05-0.9761.hdf5
val_auc: 0.9761
Epoch 6/16
saving bpe.weights.02-06-0.9771.hdf5
val_auc: 0.9771
Epoch 7/16
saving bpe.weights.02-07-0.9780.hdf5
val_auc: 0.9780
Epoch 8/16
saving bpe.weights.02-08-0.9784.hdf5
val_auc: 0.9784
Epoch 9/16
saving bpe.weights.02-09-0.9784.hdf5
val_auc: 0.9784
Epoch 10/16
val_auc: 0.9781
Epoch 11/16
val_auc: 0.9783
Epoch 12/16
saving bpe.weights.02-12-0.9785.hdf5
val_auc: 0.9785
Epoch 13/16
val_auc: 0.9785
Epoch 14/16
saving bpe.weights.02-14-0.9786.hdf5
val_auc: 0.9786
Epoch 15/16
saving bpe.weights.02-15-0.9787.hdf5
val_auc: 0.9787
Epoch 16/16
val_auc: 0.9787
Train on 143614 samples, validate on 15957 samples
Epoch 1/16

Train on 151592 samples, validate on 7979 samples
Epoch 1/16
saving weights.01-0.9798.hdf5
val_auc: 0.9798
Epoch 2/16
saving weights.02-0.9856.hdf5
val_auc: 0.9856
Epoch 3/16
saving weights.03-0.9887.hdf5
val_auc: 0.9887
Epoch 4/16
saving weights.04-0.9898.hdf5
val_auc: 0.9898
Epoch 5/16
saving weights.05-0.9904.hdf5
val_auc: 0.9904
Epoch 6/16
val_auc: 0.9901
Epoch 7/16
val_auc: 0.9903
Epoch 8/16
saving weights.08-0.9904.hdf5
val_auc: 0.9904
Epoch 9/16
val_auc: 0.9903
Epoch 10/16
val_auc: 0.9903
Epoch 11/16
val_auc: 0.9904
Epoch 12/16
saving weights.12-0.9905.hdf5
val_auc: 0.9905
Epoch 13/16
saving weights.13-0.9905.hdf5
val_auc: 0.9905
Epoch 14/16
val_auc: 0.9904
Epoch 15/16
val_auc: 0.9902
Epoch 16/16
val_auc: 0.9899


In [18]:
# model.load_weights("weights.15-0.9905.hdf5")  # capsule network baseline
# model.load_weights("weights.10-0.9904.hdf5")  # rnn size increased from 128 to max sequence length
model.load_weights("weights.08-0.9906.hdf5")  # added numberbatch with restored rnn size to 128
# model.load_weights("weights.13-0.9905.hdf5")  # added glove embeddings

In [19]:
%%time 

y_pred = model.predict(X_test_padded, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission.csv', index=False)

CPU times: user 54.3 s, sys: 6.08 s, total: 1min
Wall time: 59.2 s


In [20]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "added numberbatch embeddings branch"

Successfully submitted to Toxic Comment Classification Challenge