In [1]:
import string
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Embedding, Convolution1D, MaxPooling1D
from keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPool1D
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import LSTM, Bidirectional
from keras.regularizers import l2
from keras.utils import to_categorical
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, Callback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_data = pd.read_csv("./train.csv", sep=",")
test_data  = pd.read_csv("./test.csv", sep=",")

In [3]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
labels_ = train_data.iloc[:,2:8].columns.values

In [6]:
num_classes = len(labels_)

In [7]:
x_train, y_train = train_data.comment_text.values, train_data.iloc[:,2:8].values
x_test = test_data.comment_text.values

In [8]:
tokenizer = Tokenizer(lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ")
tokenizer.fit_on_texts(list(x_train) + list(x_test))

In [9]:
x_train = tokenizer.texts_to_sequences(x_train)
x_test  = tokenizer.texts_to_sequences(x_test)
vocab_size = 300000
print("{} tokens".format(len(tokenizer.word_index)))

394787 tokens


In [10]:
comment_len = list(map(lambda x: len(x), x_train))
len_counts = Counter(comment_len)
max_seq_len = int(np.percentile(comment_len, 90))

In [11]:
max_seq_len

154

In [12]:
EMBEDDING_DIM = 300
x_train  = pad_sequences(x_train, maxlen=max_seq_len, padding='post', truncating='post', value=0.0)
x_test   = pad_sequences(x_test,  maxlen=max_seq_len, padding='post', truncating='post', value=0.0)

In [13]:
x_train = np.fliplr(x_train)
x_test = np.fliplr(x_test)

print('Shape of data tensor:', x_train.shape)
print('Shape of label tensor:', y_train.shape)

Shape of data tensor: (159571, 154)
Shape of label tensor: (159571, 6)


### Using sigmoid for activation output layer not softmax because softmax when increasing score for one label, all others are lowered which is not suitable for multiple labels.


In [14]:
class roc_callback(Callback):
    def __init__(self,training_data,validation_data):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [15]:
model = Sequential()
model.add(Embedding(vocab_size+1,
                    EMBEDDING_DIM,
                    input_length=max_seq_len,
                    trainable=True,
                    name="embedding", embeddings_regularizer=l2(1e-5)))

model.add(Bidirectional(LSTM(512, return_sequences=True, kernel_regularizer=l2(1e-5))))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dense(512, activation="tanh"))
model.add(Dropout(0.5))
model.add(Dense(512, activation="tanh"))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='sigmoid'))

In [16]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [17]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 154, 300)          90000300  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 154, 1024)         3330048   
_________________________________________________________________
dropout_1 (Dropout)          (None, 154, 1024)         0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
__________

In [18]:
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)
checkpoint = ModelCheckpoint('temp.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=10, verbose=1, mode='auto')

In [19]:
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.05, random_state=233)

In [20]:
model.fit(X_train, 
          Y_train, 
          batch_size=512, 
          epochs=20, 
          verbose=1, 
          validation_data=(X_val,Y_val),
          callbacks = [tensorboard,checkpoint,early_stop,roc_callback(training_data=(X_train,Y_train),validation_data=(X_val, Y_val))],
          shuffle=True)

Train on 151592 samples, validate on 7979 samples
Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.98032, saving model to temp.h5
roc-auc: 0.9801 - roc-auc_val: 0.9763                                                                                                    
Epoch 2/20
Epoch 00002: val_acc improved from 0.98032 to 0.98212, saving model to temp.h5
roc-auc: 0.9872 - roc-auc_val: 0.98                                                                                                    
Epoch 3/20
Epoch 00003: val_acc did not improve
roc-auc: 0.9919 - roc-auc_val: 0.9814                                                                                                    
Epoch 4/20
Epoch 00004: val_acc did not improve
roc-auc: 0.9932 - roc-auc_val: 0.9841                                                                                                    
Epoch 5/20
Epoch 00005: val_acc did not improve
roc-auc: 0.974 - roc-auc_val: 0.9651                                           

<keras.callbacks.History at 0x7f7aec95ef60>

In [21]:
submission = pd.DataFrame(test_data['id'])
predictions = model.predict(x_test,batch_size=512)
for i in range(len(labels_)):
    submission[labels_[i]] = predictions[:,i]

In [24]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.993966,0.124771,0.971878,0.020371,0.855828,0.040631
1,0000247867823ef7,0.000337,0.000177,0.000225,4.6e-05,0.000272,0.000118
2,00013b17ad220c46,0.00139,0.000495,0.001367,5.7e-05,0.000614,4.6e-05
3,00017563c3f7919a,0.002176,0.000967,0.001209,0.001058,0.000998,8.7e-05
4,00017695ad8997eb,0.00337,0.000851,0.001061,0.000156,0.000417,2.4e-05
