In [1]:
# ref: https://www.kaggle.com/jacklinggu/lstm-with-glove-embedding-public-lb-score-0-049

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from models_def import cnn2d,cnn_v1,cnn_v2,cudnn_gru,lstm_v1,cnn_gru,gru_v1
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import text, sequence

model_defines = [
#     [cnn2d,'cnn2d'],
#     [cnn_v1,'cnn_v1'],
#     [cnn_v2,'cnn_v2'],
    [cudnn_gru,'cudnn_gru'],
    [lstm_v1,'lstm_v1'],
    [cnn_gru,'cnn_gru'],
    [gru_v1,'gru_v1']
]

max_features = 40000
maxlen = 150

def clean_text( text ):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    #
    return text

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

list_sentences_train = train["comment_text"].fillna("CVxTz").apply(clean_text).values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").apply(clean_text).values
print(y.shape)

Using TensorFlow backend.


(159571, 6)


In [2]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(X_train.shape,X_test.shape)
EMBEDDING_DIM = 300
nb_words = max_features

(159571, 150) (153164, 150)


In [3]:
from sklearn.metrics import log_loss,accuracy_score
from sklearn.model_selection import KFold
from keras import backend as K
import pickle
from keras.models import load_model
import gc

def eval_val(y,train_x):
    res = 0
    acc_res = 0
    for i in range(6):
        curr_loss = log_loss(y[:,i],train_x[:,i])
        acc = accuracy_score(y[:,i],train_x[:,i].round())
        print(i,curr_loss,acc)
        res += curr_loss
        acc_res += acc
    print('final',res/6, acc_res/6)

def kf_train(model_func,fold_cnt=3,rnd=1,epo=10,batch=64):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(X_train):
        # x,y
        curr_x,curr_y = X_train[train_index],y[train_index]
        hold_out_x,hold_out_y = X_train[test_index],y[test_index]
        
        # model
        model = model_func(maxlen,nb_words,EMBEDDING_DIM,embedding_matrix=None,trainable_flag=True,comp=True)
        batch_size = batch
        epochs = epo
        file_path="best_model.h5"
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint] 
        
        # train and pred
        model.fit(curr_x, curr_y, batch_size=batch_size, epochs=epochs, 
                  validation_data=(hold_out_x,hold_out_y), callbacks=callbacks_list)
        model = load_model(file_path)
        y_test = model.predict(X_test)
        test_pred += y_test
        hold_out_pred = model.predict(hold_out_x)
        train_pred[test_index] = hold_out_pred
        
        # clear
        del model
        gc.collect()
        K.clear_session()
        
    test_pred = test_pred / fold_cnt
    print('-------------------------------')
    eval_val(y,train_pred)
    return train_pred, test_pred

rnd_init = 1
for model_func,model_name in model_defines:
    for fold_k in [3,4]:
        print('==================================')
        if 'cnn' in model_name:
            epo = 5
            batch = 64
        else:
            epo = 6
            batch = 256
        train_pred,test_pred = kf_train(model_func,fold_k,rnd_init,epo,batch)
        output_feat = '../features/no_pretrained_{}_{}_feat.pkl'.format(model_name,fold_k)
        with open(output_feat,'wb') as fout:
            pickle.dump([train_pred,test_pred],fout)
        print(output_feat,'done')
        rnd_init += 1
        




Train on 106380 samples, validate on 53191 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 106381 samples, validate on 53190 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 106381 samples, validate on 53190 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.102385479284 0.962355315189
1 0.0229093525351 0.990392991208
2 0.0511613450993 0.980804782824
3 0.0139166059692 0.99700446823
4 0.0678365859536 0.972363399365
5 0.0306125790283 0.991195141974
final 0.0481369913117 0.982352683132
../features/no_pretrained_cudnn_gru_3_feat.pkl done
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6


Epoch 5/6
Epoch 6/6
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119679 samples, validate on 39892 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.102574031529 0.962919327447
1 0.0228500144971 0.990505793659
2 0.0494022846221 0.981556799168
3 0.01392865333 0.99700446823
4 0.0663289321898 0.973071548088
5 0.0304034844566 0.991195141974
final 0.0475812334374 0.982708846428
../features/no_pretrained_cudnn_gru_4_feat.pkl done
Train on 106380 samples, validate on 53191 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 106381 samples, validate on 53190 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 106381 samples, validate on 53190 samples
Epoch 1/6


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.101539365582 0.962844125812
1 0.0231423332503 0.990292722362
2 0.0503876554008 0.980560377512
3 0.0141010938138 0.99700446823
4 0.0671565382176 0.97198112439
5 0.0315238226833 0.991195141974
final 0.0479751348247 0.98231299338
../features/no_pretrained_lstm_v1_3_feat.pkl done
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119679 samples, validate on 39892 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6


Epoch 5/6
Epoch 6/6
-------------------------------
0 0.100630069845 0.96358360855
1 0.023183395901 0.990167386305
2 0.0506195456627 0.980403707441
3 0.0141796950442 0.99700446823
4 0.0669499699278 0.972012458404
5 0.0312726303127 0.991195141974
final 0.047805884449 0.982394461817
../features/no_pretrained_lstm_v1_4_feat.pkl done
Train on 106380 samples, validate on 53191 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 106381 samples, validate on 53190 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 106381 samples, validate on 53190 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
-------------------------------
0 0.104522601412 0.961534364014
1 0.0229163695108 0.990512060462
2 0.0509606443892 0.980961452896
3 0.0119681801337 0.997035802245
4 0.0654148907397 0.973898766067
5 0.026652195888 0.991514748921
final 0.0470724803456 0.982576199101
../features/no_pretrained_cnn_gru_3_feat.pkl done
Train on 119678 samples, validate on 39893 samples

Epoch 5/5
Train on 119678 samples, validate on 39893 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 119679 samples, validate on 39892 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
-------------------------------
0 0.103274603952 0.962154777497
1 0.0224676550855 0.990524594068
2 0.0496841468694 0.981149456982
3 0.0119107802836 0.997023268639
4 0.0648627990668 0.973660627558
5 0.027013414237 0.991608750964
final 0.0465355665823 0.982686912618
../features/no_pretrained_cnn_gru_4_feat.pkl done
Train on 106380 samples, validate on 53191 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 106381 samples, validate on 53190 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 106381 samples, validate on 53190 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6


Epoch 6/6
-------------------------------
0 0.103449126208 0.961910372185
1 0.0234551035594 0.990098451473
2 0.0516653076708 0.979996365254
3 0.0139751198343 0.99700446823
4 0.0673542519272 0.972225529702
5 0.0311113939982 0.991195141974
final 0.0485017171997 0.98207172147
../features/no_pretrained_gru_v1_3_feat.pkl done
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119678 samples, validate on 39893 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 119679 samples, validate on 39892 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.103410007806 0.961966773411
1 0.0231076128803 0.990286455559
2 0.0503003617418 0.98068571357
3 0.0140662208251 0.99700446823
4 0.067009976939 0.972488735422
5 0.0310803986402 0.991195141974
final 0