# Overview
The basic idea of this notebook is to transform the data from a sequence of letters into possible categories using a CNN. We use letters instead of words since we saw in the [mothjer](https://www.kaggle.com/fcostartistican/don-t-mess-with-my-mothjer) notebook that words are often misspelled or written differently, so looking at character-level correlations might work better. We utilize atrous (dilated) convolutions since they can account for larger spacings between relevant words and ideas. For the model we focus on individual letters and n-grams sized 1-10, but the model could easily be expanded to handle larger differences.

In [2]:
#@title Default title text
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Bidirectional, CuDNNGRU, GlobalAveragePooling1D
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Dropout, concatenate
from keras.preprocessing import text as keras_text, sequence as keras_seq
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [11]:
!pip install keras --upgrade

Requirement already up-to-date: keras in /home/stgc/anaconda2/lib/python2.7/site-packages
Collecting pyyaml (from keras)
  Using cached PyYAML-3.12.tar.gz
Collecting six>=1.9.0 (from keras)
  Using cached six-1.11.0-py2.py3-none-any.whl
Requirement already up-to-date: scipy>=0.14 in /home/stgc/anaconda2/lib/python2.7/site-packages (from keras)
Collecting numpy>=1.9.1 (from keras)
  Downloading numpy-1.14.2-cp27-cp27mu-manylinux1_x86_64.whl (12.1MB)
[K    100% |████████████████████████████████| 12.1MB 154kB/s ta 0:00:0101
[?25hBuilding wheels for collected packages: pyyaml
  Running setup.py bdist_wheel for pyyaml ... [?25ldone
[?25h  Stored in directory: /home/stgc/.cache/pip/wheels/2c/f7/79/13f3a12cd723892437c0cfbde1230ab4d82947ff7b3839a4fc
Successfully built pyyaml
Installing collected packages: pyyaml, six, numpy
  Found existing installation: PyYAML 3.11
    Uninstalling PyYAML-3.11:
      Successfully uninstalled PyYAML-3.11
  Found existing installation: six 1.10.0
    Uninst

In [0]:
# define network parameters
# NOTE(review): no character-level cell visible in this notebook reads
# max_features; both names are reassigned by the later word-level pipeline
# cell, so the values in effect depend on which cell ran last.
max_features = 64
# length every character sequence is padded/truncated to by pad_sequences
maxlen = 512

In [0]:
#@title Default title text
!apt-get install graphviz -y

In [0]:
!pip install pydrive

In [0]:
!pip install keras

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

     # 1. Authenticate and create the PyDrive client.
# Authenticate the Colab user, reuse the resulting application-default
# credentials for PyDrive, and expose the client as `drive` for the
# download cells that call drive.CreateFile(...).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Alternative to the Drive downloads: upload files from the local machine.
# `files` is also used by the files.download(...) cell.
from google.colab import files
uploaded = files.upload()


In [0]:
!ls

In [0]:

# NOTE(review): no cell visible in this notebook writes aug_gruconv_meta.csv,
# so this presumably relies on a file produced in an earlier session - confirm.
files.download('aug_gruconv_meta.csv')

In [0]:

# Fetch the Drive file with this ID and store it in the working directory as
# clean_test_third.csv (read by the character-level loading cell).
csv_import = drive.CreateFile({'id':'12s1UHj-QmX6ABYDH7bq4CmhQaPEfvqz9'})

csv_import.GetContentFile('clean_test_third.csv')

In [0]:
# Fetch the Drive file with this ID and store it in the working directory as
# clean_train_ori_third.csv (read by the character-level loading cell).
csv_import = drive.CreateFile({'id':'1jzsPgPBk13gXl4caOlDOTX1M3Eo1S-Nt'})
csv_import.GetContentFile('clean_train_ori_third.csv')

In [0]:

# Fetch the Drive file with this ID and store it in the working directory as
# test_translated_clean.csv (read by the word-level pipeline cell).
csv_import = drive.CreateFile({'id':'1eJQ32N7O1uGZV7vYi8zvISCJ3KkdYoQp'})

csv_import.GetContentFile('test_translated_clean.csv')

In [0]:

# Fetch the Drive file with this ID and store it in the working directory as
# train_translated_clean.csv (read by the word-level pipeline cell).
csv_import = drive.CreateFile({'id':'1q1btNUTkqQ0-Mfr7RiiuNZ7i3sai1a0L'})

csv_import.GetContentFile('train_translated_clean.csv')

In [0]:
!ls


# Load and Preprocessing Steps
Here we load the data and fill in the missing values.

In [0]:
%%time
train = pd.read_csv("clean_train_ori_third.csv")
test = pd.read_csv("clean_test_third.csv")
train = train.sample(frac=1)

list_sentences_train = train["comment_text"].fillna("unknown").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("unknown").values

In [0]:
!nvidia-smi

## Sequence Generation
Here we take the data and generate sequences from the data

In [0]:
# Character-level tokenizer: every distinct character gets an integer id.
# The vocabulary is fitted on the training comments only.
tokenizer = keras_text.Tokenizer(char_level = True)
tokenizer.fit_on_texts(list(list_sentences_train))
# train data: integer sequences padded/truncated to `maxlen`
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = keras_seq.pad_sequences(list_tokenized_train, maxlen=maxlen)
# test data: encoded with the tokenizer fitted on the training comments
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = keras_seq.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [0]:
def build_model(conv_layers = 2, 
                dilation_rates = [0, 2, 4, 8, 16], 
                embed_size = 256):
    """Build and compile the character-level dilated-convolution classifier.

    One convolutional branch is created per entry of `dilation_rates`; every
    branch reads the same (dropout-regularised) character embedding, is
    reduced with global max pooling, and the pooled branches are concatenated
    and fed to a small dense head with six sigmoid outputs.

    :param conv_layers: number of stacked Conv1D layers in each branch; layer
        `i` of a branch has `16 * 2**i` filters.
    :param dilation_rates: one branch per entry. A rate of 0 builds a branch
        of kernel-size-1 convolutions; any positive rate builds kernel-size-3
        convolutions with that dilation. Layer names are derived from the
        rate, so the entries must be unique. The list is only read.
    :param embed_size: dimensionality of the character embedding.
    :return: a compiled Keras `Model` (binary cross-entropy, Adam).

    The vocabulary size is read from the notebook-level `tokenizer`, so the
    tokenizer must be fitted before this function is called.
    """
    inp = Input(shape=(None, ))
    # +1 because the Keras tokenizer ids start at 1 (0 is the padding id).
    x = Embedding(input_dim = len(tokenizer.word_counts)+1, 
                  output_dim = embed_size)(inp)
    prefilt_x = Dropout(0.25)(x)
    out_conv = []
    # dilation rate lets us use ngrams and skip grams to process 
    for dilation_rate in dilation_rates:
        x = prefilt_x
        # Honour `conv_layers` instead of a hard-coded depth of 2.
        for i in range(conv_layers):
            if dilation_rate>0:
                x = Conv1D(16*2**(i), 
                           kernel_size = 3, 
                           dilation_rate = dilation_rate,
                           activation = 'relu',
                           name = 'ngram_{}_cnn_{}'.format(dilation_rate, i)
                           )(x)
            else:
                x = Conv1D(16*2**(i), 
                           kernel_size = 1,
                           activation = 'relu',
                           name = 'word_fcl_{}'.format(i))(x)
        out_conv += [Dropout(0.5)(GlobalMaxPool1D()(x))]
    x = concatenate(out_conv, axis = -1)    
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Instantiate the character-level CNN with its default hyper-parameters and
# print the layer table.
model = build_model()
model.summary()

# Train the Model
Here we train the model and use model checkpointing and early stopping to keep only the best version of the model

## Hold-out
We create a hold-out set that the model is never exposed to during training, so that we can test it on unseen data. We add all of the category labels together as a cheap hack for ensuring the groups are somewhat stratified.

In [0]:
from sklearn.model_selection import train_test_split
# Number of positive labels per comment; used only as the stratification key
# so that train and hold-out contain a similar mix of label counts.
any_category_positive = np.sum(y,1)
print('Distribution of Total Positive Labels (important for validation)')
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(pd.Series(any_category_positive).value_counts())
X_t_train, X_t_test, y_train, y_test = train_test_split(X_t, y,
                                                        test_size = 0.2,
                                                        stratify = any_category_positive,
                                                        random_state = 2017)
print('Training:', X_t_train.shape)
print('Testing:', X_t_test.shape)

In [0]:
batch_size = 128 # large enough that some other labels come in
epochs = 10
# Keep this below `epochs`: with a patience larger than the epoch budget the
# EarlyStopping callback can never trigger.
early_stopping_patience = 3

file_path="best_weights.h5"
# Keep only the weights of the epoch with the lowest validation loss.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=early_stopping_patience)

callbacks_list = [checkpoint, early]
model.fit(X_t_train, y_train, 
          validation_data=(X_t_test, y_test),
          batch_size=batch_size, 
          epochs=epochs, 
          shuffle = True,
          callbacks=callbacks_list)

# Make Predictions
Load the model and make predictions on the test dataset

In [0]:
# Restore the best checkpoint written during training and score the test set.
model.load_weights(file_path)
# The predictions get their own name: storing them in `y_test` would replace
# the hold-out labels that the training cell passes as validation data, so
# re-running that cell afterwards would fail.
test_predictions = model.predict(X_te)
# NOTE(review): every other input is read from the working directory; confirm
# that ../input/sample_submission.csv exists in this environment.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = test_predictions
sample_submission.to_csv("predictions.csv", index=False)

In [0]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip

In [0]:
!unzip glove.840B.300d.zip

In [0]:
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [0]:
!ls

In [0]:
!unzip glove.twitter.27B.zip

In [0]:
!ls

In [3]:
import numpy as np
#np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Conv1D, SpatialDropout1D
from keras.layers import add, Dropout, PReLU, BatchNormalization, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import optimizers
from keras import initializers, regularizers, constraints, callbacks


import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '8'


# Pre-trained word-vector file parsed below; the commented lines are
# alternative files that are currently disabled.
# NOTE(review): the vectors must have `embed_size` components - presumably
# embed_size has to be changed when switching to the 200d file.
#EMBEDDING_FILE = 'glove.twitter.27B.200d.txt'
EMBEDDING_FILE = 'glove.840B.300d.txt'
#EMBEDDING_FILE = 'wiki.simple.vec'

# NOTE(review): this cell rebinds train, test, y_train, tokenizer,
# max_features and maxlen, which the character-level cells also use.
# NOTE(review): test_translated_sp_clean.csv is not fetched by any download
# cell visible in this notebook - confirm where it comes from.
train = pd.read_csv('train_translated_clean.csv')
test = pd.read_csv('test_translated_clean.csv')
submission = pd.read_csv('test_translated_sp_clean.csv')

# Missing comments are replaced by the placeholder text "fillna".
X_train = train["comment_text_english"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text_english"].fillna("fillna").values


# vocabulary cap, padded sequence length, and word-vector dimensionality
max_features = 160000
maxlen = 300
embed_size = 300

# Word-level tokenizer; the vocabulary is fitted on train AND test comments.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
# X_train / X_test are rebound from raw text to lists of word ids ...
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
# ... and the lower-case x_train / x_test hold the padded id matrices.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr):
    """Turn the space-split fields of one embedding-file line into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')


def _load_embeddings_index(path):
    """Parse a text embedding file (one `word v1 v2 ...` entry per line) into a dict.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes.
    """
    with open(path) as embedding_file:
        return dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)


embeddings_index = _load_embeddings_index(EMBEDDING_FILE)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
# The matrix always has `max_features` rows so that it matches the
# Embedding(max_features, embed_size, weights=[embedding_matrix]) layers of
# the model builders, and so that the highest word id stays in range even
# when the vocabulary is smaller than the cap. Rows of words without a
# pre-trained vector (and row 0, the padding id) remain zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a validation set.

    :param validation_data: `(X_val, y_val)` pair; it is unpacked in the
        constructor, so the empty default raises `ValueError`.
    :param interval: the score is computed on epochs whose zero-based index
        is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class in super() so that Callback.__init__ actually runs.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and print the ROC-AUC (epoch shown 1-based)."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


# Heights (in words) of the convolution windows and the filters per window.
filter_sizes = [1,2,3,5]
num_filters = 32

def get_model():
    """Build and compile the multi-window word-level text CNN.

    The padded word ids are embedded with the pre-trained `embedding_matrix`,
    reshaped to a single-channel image of shape (maxlen, embed_size, 1) and
    convolved with one full-width Conv2D per entry of `filter_sizes`. Each
    feature map is max-pooled over all window positions, the pooled maps are
    concatenated and a six-unit sigmoid layer produces the label scores.

    One branch is built per entry of `filter_sizes`, so the list may have
    any length.

    :return: a compiled Keras `Model` (binary cross-entropy, Adam).
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Reshape((maxlen, embed_size, 1))(x)

    # The kernel spans the whole embedding width, so each filter slides over
    # word positions only.
    conv_maps = [
        Conv2D(num_filters, kernel_size=(size, embed_size),
               kernel_initializer='normal', activation='elu')(x)
        for size in filter_sizes
    ]
    # A 'valid' convolution leaves maxlen - size + 1 positions; pooling over
    # all of them keeps one value per filter.
    pooled_maps = [
        MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv_map)
        for size, conv_map in zip(filter_sizes, conv_maps)
    ]

    z = Concatenate(axis=1)(pooled_maps)
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(6, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
  


In [4]:
from keras.layers import K, Activation
from keras.engine import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Bidirectional, GRU, Flatten, SpatialDropout1D, CuDNNLSTM
# Hyper-parameters read by get_cap_model().
# units of the recurrent layer (a CuDNNLSTM, despite the name)
gru_len = 100
# routing iterations, number of output capsules and size of each capsule
Routings = 5
Num_capsule = 10
Dim_capsule = 16
# dropout rate applied to the flattened capsule output
dropout_p = 0.4
# SpatialDropout1D rate applied to the embedded input
rate_drop_dense = 0.35

def squash(x, axis=-1):
    """Rescale `x` to (approximately) unit L2 norm along `axis`.

    The squared norm is offset by `K.epsilon()` before the square root so an
    all-zero vector does not divide by zero. This is a plain normalisation;
    no norm-dependent scaling factor is applied.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())


# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with dynamic routing, written with Keras backend ops only.

    The input is read as `(batch, input_num_capsule, input_dim_capsule)`; the
    output has shape `(batch, num_capsule, dim_capsule)`.

    :param num_capsule: number of output capsules.
    :param dim_capsule: dimensionality of each output capsule.
    :param routings: number of routing iterations.
    :param kernel_size: stored on the layer but not used by `build`/`call`.
    :param share_weights: if True, a single transformation is shared by all
        input capsules; otherwise each input capsule has its own.
    :param activation: `'default'` selects `squash`; any other value is
        wrapped in a Keras `Activation` layer.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        # Keep the raw argument so get_config() can reproduce the layer.
        self.activation_name = activation
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the transformation kernel mapping input to output capsules."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Compute the output capsules by iterative routing over the inputs."""
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Softmax acts on the last axis, so move num_capsule there: each
            # input capsule distributes its weight over the output capsules.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            # The logits are only needed again if another iteration follows.
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return `(None, num_capsule, dim_capsule)`."""
        return (None, self.num_capsule, self.dim_capsule)

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {'num_capsule': self.num_capsule,
                  'dim_capsule': self.dim_capsule,
                  'routings': self.routings,
                  'kernel_size': self.kernel_size,
                  'share_weights': self.share_weights,
                  'activation': self.activation_name}
        base_config = super(Capsule, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


def get_cap_model():
    """Build, compile and summarise the BiLSTM + capsule classifier.

    Pipeline: frozen pre-trained embedding -> spatial dropout ->
    bidirectional CuDNNLSTM (sequence output) -> Capsule layer with shared
    weights -> flatten -> dropout -> six sigmoid outputs. The layer table is
    printed before the model is returned.

    :return: a compiled Keras `Model` (binary cross-entropy, Adam lr=0.001).
    """
    token_ids = Input(shape=(maxlen,))

    # The pre-trained vectors are not fine-tuned (trainable=False).
    embedded = Embedding(max_features,
                         embed_size,
                         input_length=maxlen,
                         weights=[embedding_matrix],
                         trainable=False)(token_ids)
    embedded = SpatialDropout1D(rate_drop_dense)(embedded)

    recurrent = Bidirectional(CuDNNLSTM(gru_len, return_sequences=True))(embedded)

    caps = Capsule(num_capsule=Num_capsule,
                   dim_capsule=Dim_capsule,
                   routings=Routings,
                   share_weights=True)(recurrent)
    caps = Flatten()(caps)
    caps = Dropout(dropout_p)(caps)

    scores = Dense(6, activation='sigmoid')(caps)
    model = Model(inputs=token_ids, outputs=scores)

    adam = optimizers.Adam(lr=0.001)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    model.summary()
    return model

In [0]:
from keras import layers
def PrimaryCap(inputs, dim_capsule, n_channels, kernel_size, strides, padding):
    """
    Primary capsule stage: one Conv2D producing `dim_capsule * n_channels`
    feature maps, regrouped into capsule vectors and normalised with `squash`.
    :param inputs: 4D tensor, shape=[None, width, height, channels]
    :param dim_capsule: the dim of the output vector of capsule
    :param n_channels: the number of types of capsules
    :param kernel_size: kernel size passed to the Conv2D layer
    :param strides: strides passed to the Conv2D layer
    :param padding: padding mode passed to the Conv2D layer
    :return: output tensor, shape=[None, num_capsule, dim_capsule]
    """
    conv_layer = layers.Conv2D(filters=dim_capsule*n_channels,
                               kernel_size=kernel_size,
                               strides=strides,
                               padding=padding,
                               name='primarycap_conv2d')
    # -1 lets Keras infer how many capsules of length dim_capsule result.
    regroup_layer = layers.Reshape(target_shape=[-1, dim_capsule], name='primarycap_reshape')
    squash_layer = layers.Lambda(squash, name='primarycap_squash')
    return squash_layer(regroup_layer(conv_layer(inputs)))


In [0]:
def _dpcnn_conv_block(x, filter_nr, filter_size):
    """Apply two Conv1D('same', linear) -> BatchNormalization -> PReLU stages to `x`."""
    for _ in range(2):
        x = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(x)
        x = BatchNormalization()(x)
        x = PReLU()(x)
    return x


def dpcnn():
    """Build and compile the DPCNN-style classifier on top of a BiGRU.

    Pipeline: pre-trained embedding -> spatial dropout -> bidirectional
    CuDNNGRU (sequence output) -> four residual convolution blocks, the
    first three each followed by max pooling -> global max pooling ->
    dense head with six sigmoid outputs.

    :return: a compiled Keras `Model` (binary cross-entropy, Adam lr=0.0009).
    """
    filter_nr = 64
    filter_size = 3
    max_pool_size = 3
    max_pool_strides = 2
    dense_nr = 256
    spatial_dropout = 0.2
    dense_dropout = 0.5

    comment = Input(shape=(maxlen,))
    # NOTE(review): no `trainable` argument is passed, so the embedding
    # weights are not explicitly frozen here (get_cap_model freezes them) -
    # confirm that fine-tuning the embeddings is intended.
    emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix])(comment)
    emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment)
    emb_comment = Bidirectional(CuDNNGRU(64, return_sequences=True))(emb_comment)

    block1 = _dpcnn_conv_block(emb_comment, filter_nr, filter_size)

    # The recurrent features are passed through a kernel-size-1 convolution
    # with filter_nr filters so that they have the same shape as the block
    # output and can be added to it as the residual branch.
    resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(emb_comment)
    resize_emb = PReLU()(resize_emb)

    block_output = add([block1, resize_emb])
    block_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block_output)

    # Blocks 2 and 3: residual convolution block followed by pooling.
    for _ in range(2):
        block = _dpcnn_conv_block(block_output, filter_nr, filter_size)
        block_output = add([block, block_output])
        block_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block_output)

    # Block 4: its residual sum is reduced by global max pooling instead.
    block4 = _dpcnn_conv_block(block_output, filter_nr, filter_size)

    output = add([block4, block_output])
    output = GlobalMaxPooling1D()(output)
    output = Dense(dense_nr, activation='linear')(output)
    output = BatchNormalization()(output)
    output = PReLU()(output)
    output = Dropout(dense_dropout)(output)
    output = Dense(6, activation='sigmoid')(output)
    adam = optimizers.Adam(lr=0.0009)

    model = Model(comment, output)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [0]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer


def dot_product(x, kernel):
    """
    Backend-independent dot product of `x` with `kernel` (Theano and
    Tensorflow).
    Args:
        x (): input
        kernel (): weights
    Returns:
        `K.dot(x, kernel)`; on Tensorflow the kernel is first given a trailing
        axis and that axis is squeezed out of the product again.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    # todo: check that this is correct
    product = K.dot(x, K.expand_dims(kernel))
    return K.squeeze(product, axis=-1)


class Attention(Layer):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 1.x
    Example:

        # 1
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
        # next add a Dense layer (for classification/regression) or whatever...
        # 2 - Get the attention scores
        hidden = LSTM(64, return_sequences=True)(words)
        sentence, word_scores = Attention(return_attention=True)(hidden)
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        :param W_regularizer: regularizer for the attention weight vector
        :param b_regularizer: regularizer for the per-step bias
        :param W_constraint: constraint for the attention weight vector
        :param b_constraint: constraint for the per-step bias
        :param bias: whether to add a per-step bias to the scores
        :param return_attention: if True, `call` also returns the weights
        :param kwargs: forwarded to the base `Layer`
        """
        # Initialise the base Layer first: its constructor assigns its own
        # default to `supports_masking`, so the flag is set afterwards.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create one weight per feature and, optionally, one bias per step."""
        assert len(input_shape) == 3

        # The shape is passed by keyword so the call does not depend on the
        # positional order of add_weight's parameters.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # One bias per time step, so the step count must be known.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the step axis."""
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        """`(samples, features)`, plus `(samples, steps)` when weights are returned."""
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

In [22]:
from keras.layers import Permute
def get_gru_model():    
    """Build and compile the BiGRU -> Conv1D -> pooled-features -> GRU classifier.

    Pipeline: pre-trained embedding -> spatial dropout -> bidirectional
    CuDNNGRU (sequence output) -> Conv1D + PReLU + dropout -> global average
    and global max pooling -> the two pooled vectors are concatenated and
    regrouped into a rank-3 tensor -> CuDNNGRU (last state only) -> dropout
    -> six sigmoid outputs.

    NOTE(review): the concatenated pooling output is rank 2, while Permute
    and CuDNNGRU need rank-3 input and the labels need a rank-2 output. The
    Reshape and the final-state-only GRU below are the smallest change that
    makes the graph valid; confirm this matches the intended architecture.

    :return: a compiled Keras `Model` (binary cross-entropy, Nadam lr=0.0002).
    """
    conv_filters = 60

    inp = Input(shape=(maxlen, ))
    embed = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(embed)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Conv1D(conv_filters, kernel_size=3, padding='same', activation='linear')(x)
    x = PReLU()(x)
    x = Dropout(0.2)(x)
    mx = GlobalMaxPool1D()(x)
    ax = GlobalAveragePooling1D()(x)

    conc = concatenate([ax,mx])
    # (batch, 2 * filters) -> (batch, 2, filters): row 0 is the average
    # pooling, row 1 the max pooling.
    conc = Reshape((2, conv_filters))(conc)
    # -> (batch, filters, 2): one step per filter with its (avg, max) pair.
    conc = Permute((2, 1))(conc)
    # Only the final state is kept so the output below is (batch, 6).
    x = CuDNNGRU(64)(conc)
    x = Dropout(0.2)(x)
    outp = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=outp)
    adam = optimizers.Nadam(lr=0.0002)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model
  

In [9]:
def get_mlp_model():    
    """Build and compile a plain MLP over the flattened word embeddings.

    Pipeline: pre-trained embedding -> spatial dropout -> flatten -> three
    ReLU dense layers (128, 128, 64; dropout 0.4 after the first two) -> six
    sigmoid outputs.

    :return: a compiled Keras `Model` (binary cross-entropy, Adam lr=0.0009).
    """
    inp = Input(shape=(maxlen, ))
    hidden = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    hidden = SpatialDropout1D(0.2)(hidden)
    hidden = Flatten()(hidden)

    # (units, dropout rate or None when the layer has no dropout)
    for units, drop_rate in ((128, 0.4), (128, 0.4), (64, None)):
        hidden = Dense(units)(hidden)
        hidden = Activation('relu')(hidden)
        if drop_rate is not None:
            hidden = Dropout(drop_rate)(hidden)

    outp = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=inp, outputs=outp)
    adam = optimizers.Adam(lr=0.0009)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model
  

In [5]:
def train_folds(X, y, fold_count, model_list, model_name):
    """Train one model per contiguous fold and collect out-of-fold predictions.

    `X`/`y` are cut into `fold_count` contiguous slices of `len(X) //
    fold_count` rows (the last slice also takes the remainder). For fold
    `k`, `model_list[k]` is trained on all other slices via `_train_model`
    and then predicts slice `k`.

    Side effects: creates the `models` directory if needed, lets
    ModelCheckpoint write `models/<model_name><k>_model.h5`, and saves the
    final weights of fold `k` to `models/model<k>_weights.npy`.

    :param X: model inputs, sliceable along the first axis.
    :param y: labels aligned with `X`.
    :param fold_count: number of folds; `model_list` needs that many models.
    :param model_list: compiled models, one per fold.
    :param model_name: prefix for the checkpoint file names.
    :return: `(models, total_meta, auc_list)` - the trained models, the
        out-of-fold predictions stacked in fold order, and the AUC value
        reported by `_train_model` for each fold.
    """
    # The checkpoint and np.save paths below all live in this directory.
    if not os.path.isdir('models'):
        os.makedirs('models')

    fold_size = len(X) // fold_count
    models = []
    fold_predictions = []
    auc_list = []
    for fold_id in range(0, fold_count):
        print("FOLD {}".format(fold_id))
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        # The last fold absorbs the rows left over by the integer division.
        if fold_id == fold_count - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        val_y = y[fold_start:fold_end]

        save_path = os.path.join('models' , '%s_model.h5' % (model_name + str(fold_id)))
        callbacks = [
            ModelCheckpoint(
                save_path, save_best_only=True, verbose=False)
        ]

        model, best_auc = _train_model(model_list[fold_id], train_x, train_y, val_x, val_y,callbacks)
        # Collected per fold and stacked once after the loop.
        fold_predictions.append(model.predict(val_x, batch_size=128))

        model_path = os.path.join('models', "model{0}_weights.npy".format(fold_id))
        # NOTE(review): get_weights() is a list of arrays with different
        # shapes; confirm np.save accepts it on the NumPy version in use.
        np.save(model_path, model.get_weights())
        models.append(model)
        auc_list.append(best_auc)

    total_meta = np.concatenate(fold_predictions, axis=0) if fold_predictions else []
    return models, total_meta, auc_list

def _train_model(model, train_x, train_y, val_x, val_y, callbacks):
    """Train `model` one epoch at a time with manual early stopping.

    After every epoch the mean per-label log loss and AUC are computed on
    the validation data. The weights of the epoch with the lowest mean log
    loss are remembered; training stops once three epochs have passed since
    that best epoch, and the best weights are restored.

    Learning-rate schedule: starting from a local value of 0.001, the rate
    is multiplied by `0.9 ** epoch` at epoch 9 and by `0.7 ** epoch` at
    epochs 12, 14 and every later epoch divisible by 3, and written to the
    optimizer each time. Until the first of these epochs the rate chosen
    when the model was compiled stays in effect.

    NOTE(review): `fit` is called once per epoch, so the supplied callbacks
    are restarted for every call - confirm that a save_best_only checkpoint
    still behaves as intended.

    :param model: compiled Keras model with six outputs.
    :param train_x: training inputs.
    :param train_y: training labels.
    :param val_x: validation inputs.
    :param val_y: validation labels, indexable as `val_y[:, j]`.
    :param callbacks: Keras callbacks passed to every `fit` call.
    :return: `(model, best_auc)` - the model with its best weights restored
        and the mean AUC measured in that best epoch.
    """
    # Imported here because the notebook has no top-level import of log_loss.
    from sklearn.metrics import log_loss

    batch_size = 256
    best_loss = -1  # -1 means no epoch has been evaluated yet
    best_weights = None
    best_epoch = 0
    best_auc = -1
    current_epoch = 0
    # The char-CNN/LSTM variant reshaped train_x / val_x with a trailing
    # channel axis at this point; that step is disabled.
    learning_rate = 0.001
    while True:
        decay_base = None
        if current_epoch == 9:
            decay_base = 0.9
        elif current_epoch in (12, 14) or (current_epoch > 14 and current_epoch % 3 == 0):
            decay_base = 0.7
        if decay_base is not None:
            learning_rate = learning_rate * (decay_base**current_epoch)
            K.set_value(model.optimizer.lr, learning_rate)

        model.fit(
            train_x,
            train_y,
            batch_size=batch_size,
            epochs=1,
            validation_data=(val_x, val_y),
            callbacks=callbacks,
            verbose=2)

        y_pred = model.predict(val_x, batch_size=batch_size)

        total_loss = 0
        total_auc = 0
        for j in range(6):
            # Explicit labels keep log_loss defined for a fold in which a
            # label column contains a single class.
            loss = log_loss(val_y[:, j], y_pred[:, j], labels=[0, 1])
            auc = compute_auc(val_y[:, j], y_pred[:, j])
            total_auc += auc
            total_loss += loss

        total_loss /= 6.
        total_auc /= 6.

        print("Epoch {0} logloss {1} best_logloss {2}, ROC_AUC {3}".format(current_epoch, total_loss, best_loss, total_auc))

        current_epoch += 1
        if total_loss < best_loss or best_loss == -1:
            best_loss = total_loss
            best_auc = total_auc
            best_weights = model.get_weights()
            best_epoch = current_epoch
        elif current_epoch - best_epoch == 3:
            break

    model.set_weights(best_weights)
    return model, best_auc

In [6]:
def compute_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` scored against ``y_true``.

    The score comes from ``metrics.roc_auc_score``. If that call raises
    ``ValueError`` (scikit-learn does so, for instance, when ``y_true``
    contains a single class), ``np.nan`` is returned instead of letting
    the exception propagate.
    """
    try:
        score = metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        score = np.nan
    return score

In [6]:
!mkdir models

In [7]:
# Build one fresh model per cross-validation fold up front; the resulting
# list is handed to train_folds in a later cell.
folds = 10
list_models = []
for fold in range(folds):
    model = get_cap_model()
    list_models.append(model)
    print(fold)  # progress marker: index of the fold whose model was just built


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 300, 300)          48000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300, 200)          321600    
_________________________________________________________________
capsule_1 (Capsule)          (None, 10, 16)            32000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
___________________________________________________________

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 300, 300)          48000000  
_________________________________________________________________
spatial_dropout1d_7 (Spatial (None, 300, 300)          0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 300, 200)          321600    
_________________________________________________________________
capsule_7 (Capsule)          (None, 10, 16)            32000     
_________________________________________________________________
flatten_7 (Flatten)          (None, 160)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 160)               0         
__________

In [8]:
from sklearn.metrics import log_loss
from sklearn.metrics import auc
from sklearn import metrics
from keras import backend as K



# Name tag for this run; forwarded to train_folds as its last argument.
model_name = 'capsule_2'
# Run the fold-wise training (train_folds is defined elsewhere in the notebook)
# over the `folds` models prepared in list_models. The three results are used
# by the following cell: `models` is iterated to predict on x_test,
# `total_meta` is written out as per-class meta columns, and `auc_list` is
# saved to a CSV.
models, total_meta, auc_list = train_folds(x_train, y_train, folds, list_models, model_name)

FOLD 0
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 74s - loss: 0.0799 - acc: 0.9764 - val_loss: 0.0495 - val_acc: 0.9819
Epoch 0 logloss 0.0495378013668 best_logloss -1, ROC_AUC 0.959728137038
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 73s - loss: 0.0508 - acc: 0.9813 - val_loss: 0.0446 - val_acc: 0.9832
Epoch 1 logloss 0.044607783214 best_logloss 0.0495378013668, ROC_AUC 0.980213413029
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0474 - acc: 0.9823 - val_loss: 0.0425 - val_acc: 0.9836
Epoch 2 logloss 0.0425415083447 best_logloss 0.044607783214, ROC_AUC 0.982417590278
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 70s - loss: 0.0451 - acc: 0.9831 - val_loss: 0.0425 - val_acc: 0.9835
Epoch 3 logloss 0.0425309616387 best_logloss 0.0425415083447, ROC_AUC 0.984296917247
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 72s - loss: 0.0434 - acc: 0.9835 - val_loss: 0.0404 - val_acc: 0.984

Epoch 6 logloss 0.0394066876439 best_logloss 0.0397723282781, ROC_AUC 0.989879967923
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0400 - acc: 0.9844 - val_loss: 0.0388 - val_acc: 0.9843
Epoch 7 logloss 0.038814331639 best_logloss 0.0394066876439, ROC_AUC 0.990150902867
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 73s - loss: 0.0390 - acc: 0.9847 - val_loss: 0.0390 - val_acc: 0.9845
Epoch 8 logloss 0.0389842737298 best_logloss 0.038814331639, ROC_AUC 0.990467128994
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 72s - loss: 0.0370 - acc: 0.9854 - val_loss: 0.0381 - val_acc: 0.9846
Epoch 9 logloss 0.0381065866304 best_logloss 0.038814331639, ROC_AUC 0.990292525538
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 72s - loss: 0.0364 - acc: 0.9855 - val_loss: 0.0384 - val_acc: 0.9847
Epoch 10 logloss 0.0383831184886 best_logloss 0.0381065866304, ROC_AUC 0.99037975392
Train on 143614 samples, validate on 15957

 - 73s - loss: 0.0788 - acc: 0.9764 - val_loss: 0.0477 - val_acc: 0.9828
Epoch 0 logloss 0.0476729884476 best_logloss -1, ROC_AUC 0.963104462711
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0510 - acc: 0.9814 - val_loss: 0.0443 - val_acc: 0.9837
Epoch 1 logloss 0.044293622651 best_logloss 0.0476729884476, ROC_AUC 0.977690071137
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 72s - loss: 0.0475 - acc: 0.9823 - val_loss: 0.0419 - val_acc: 0.9841
Epoch 2 logloss 0.0419059875459 best_logloss 0.044293622651, ROC_AUC 0.982377570088
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 74s - loss: 0.0449 - acc: 0.9831 - val_loss: 0.0405 - val_acc: 0.9846
Epoch 3 logloss 0.0405127221348 best_logloss 0.0419059875459, ROC_AUC 0.984234832206
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0433 - acc: 0.9833 - val_loss: 0.0399 - val_acc: 0.9848
Epoch 4 logloss 0.0398745293879 best_logloss 0.0405127221348, ROC_

 - 72s - loss: 0.0360 - acc: 0.9858 - val_loss: 0.0410 - val_acc: 0.9839
Epoch 10 logloss 0.0410038437409 best_logloss 0.0409931199978, ROC_AUC 0.987487488535
FOLD 8
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 72s - loss: 0.0817 - acc: 0.9761 - val_loss: 0.0534 - val_acc: 0.9804
Epoch 0 logloss 0.0534341054068 best_logloss -1, ROC_AUC 0.975144872342
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0507 - acc: 0.9814 - val_loss: 0.0440 - val_acc: 0.9832
Epoch 1 logloss 0.043992087065 best_logloss 0.0534341054068, ROC_AUC 0.981100222563
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0473 - acc: 0.9823 - val_loss: 0.0439 - val_acc: 0.9833
Epoch 2 logloss 0.0438561182961 best_logloss 0.043992087065, ROC_AUC 0.984390167501
Train on 143614 samples, validate on 15957 samples
Epoch 1/1
 - 71s - loss: 0.0452 - acc: 0.9829 - val_loss: 0.0410 - val_acc: 0.9842
Epoch 3 logloss 0.0410261343047 best_logloss 0.04385611829

In [9]:
        
print('Model trained!')
print("Predicting results...")

# Per fold: dump the model weights, predict on the test set, and keep the
# predictions both in memory and on disk under models/.
test_predicts_list = []
for fold_id, model in enumerate(models):
    model_path = os.path.join('models', "model{0}_weights.npy".format(fold_id))
    test_predicts_path = os.path.join('models', "test_predicts{0}.npy".format(fold_id))

    np.save(model_path, model.get_weights())

    test_predicts = model.predict(x_test, batch_size=256)
    np.save(test_predicts_path, test_predicts)
    test_predicts_list.append(test_predicts)

# Blend the folds with a geometric mean: multiply all fold predictions
# element-wise, then take the n-th root (n = number of folds).
test_predicts = np.ones(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts = test_predicts * fold_predict
test_predicts = test_predicts ** (1. / len(test_predicts_list))

# Submission frame: "id" column first, then one column per class.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
test_ids = test["id"].values.reshape(-1, 1)
test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]
test_predicts.to_csv('ori_pred_cap_lstm_glove.csv', index=False)
print('predicted !')

# Meta file: ids read back from the training CSV, side by side with the
# `total_meta` array returned by training (presumably the out-of-fold
# predictions -- confirm against train_folds).
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
subm = pd.read_csv('train_translated_sp_clean.csv')
submid = pd.DataFrame({'id': subm["id"]})
meta_columns = pd.DataFrame(total_meta, columns=label_cols)
total_meta_data = pd.concat([submid, meta_columns], axis=1)
total_meta_data.to_csv('ori_cap_lstm_glove_meta.csv', index=False)

# Per-fold AUC values, one row per entry of auc_list.
auc_folds = pd.DataFrame(data=auc_list)
auc_folds.to_csv('auc_ori_cap_lstm_glove.csv', index=False)
print('Meta predicted !')

Model trained!
Predicting results...
predicted !
Meta predicted !


In [0]:
# List the files in the current working directory.
!ls

In [0]:
# Single hold-out run: build a model, train it for a few epochs while
# reporting through the RocAucEvaluation callback, then write a submission.
model = get_model()

batch_size = 256
epochs = 3

# train_size=0.95 keeps 95% of the rows for training; the rest is the
# validation split. random_state pins the shuffle.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2)

# Predict on the test set and fill the six class columns of `submission`.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission.csv', index=False)