In [1]:
# ref: https://www.kaggle.com/jacklinggu/lstm-with-glove-embedding-public-lb-score-0-049

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from keras.models import Model, load_model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,GlobalAveragePooling1D,Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_features = 40000
maxlen = 150

def clean_text( text ):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    #
    return text

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

list_sentences_train = train["comment_text"].fillna("CVxTz").apply(clean_text).values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").apply(clean_text).values
print(y.shape)

Using TensorFlow backend.


(159571, 6)


In [2]:
list_sentences_train[:5]

array([ 'explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now 89 205 38 27',
       'd aww ! he matches this background colour i am seemingly stuck with thanks talk 21 51 january 11 2016 utc ',
       'hey man i am really not trying to edit war it just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info ',
       ' more i cannot make any real suggestions on improvement - i wondered if the section statistics should be later on or a subsection of types of accidents - i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no - one else does first - if you have any preferences for formatting styl

In [3]:
print('test len',len(test))

test len 153164


In [4]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(X_train.shape,X_test.shape)

(159571, 150) (153164, 150)


In [5]:
# check word_index
tmp_cnt = 0
for k in tokenizer.word_index:
    print(k,tokenizer.word_index[k])
    tmp_cnt += 1
    if tmp_cnt >5:
        break
word_idx = tokenizer.word_index

anyhows 146576
sdl 131088
teutones 130857
9575 147566
disaffiliating 90938
substatiantion 170646


In [7]:
# read word2vec
# 
word_vec_dict = {}
with open('../glove.840B.300d.txt') as f:
    for line in f:
        v_list = line.strip().split(' ')
        k = str(v_list[0])
        v = np.array([float(x) for x in v_list[1:]])
        word_vec_dict[k] = v
print(len(word_vec_dict))
# print(word_vec_dict['is'])
# print(word_vec_dict['are'])

print('Preparing embedding matrix')
EMBEDDING_DIM = 300
nb_words = min(max_features,len(word_idx))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word,i in word_idx.items():
    if i >= max_features:
        continue
    else:
        if word in word_vec_dict:
            embedding_matrix[i] = word_vec_dict[word]
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
del word_vec_dict

2196007
Preparing embedding matrix
Null word embeddings: 3388


In [8]:
from sklearn.metrics import log_loss,accuracy_score
from keras.layers import Bidirectional, Dropout, CuDNNGRU

def eval_val(y,train_x):
    res = 0
    acc_res = 0
    for i in range(6):
        curr_loss = log_loss(y[:,i],train_x[:,i])
        acc = accuracy_score(y[:,i],train_x[:,i].round())
        print(i,curr_loss,acc)
        res += curr_loss
        acc_res += acc
    print('final',res/6, acc_res/6)

# https://github.com/PavelOstyakov/toxic/blob/master/toxic/model.py
def get_model(comp=True):
    inp = Input(shape=(maxlen, ))
    x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Dropout(0.1)(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=False))(x)
    x = Dense(32, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    if comp:
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    return model


print('def model done')

def model done


In [9]:
tmp_m = get_model(False)
tmp_m.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          12000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 256)          330240    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 256)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               296448    
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
__________

In [10]:
from sklearn.utils import shuffle

def del_data_ratio(x,y,ratio=0.8):
    print(x.shape)
    pos_index = np.where(y[:,0]==1)[0]
    neg_index = np.where(y[:,0]==0)[0]
    print(pos_index)
    data_cnt = len(pos_index)
    add_cnt = int(data_cnt*ratio)
    add_index = pos_index[:add_cnt]
    add_x = np.concatenate([x[add_index],x[neg_index]])
    add_y = np.concatenate([y[add_index],y[neg_index]])
    print(add_x.shape,data_cnt)
    add_x,add_y = shuffle(add_x,add_y,random_state=666)
    return add_x,add_y

from sklearn.model_selection import KFold
def kf_train(fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(X_train):
        # x,y
        curr_x,curr_y = X_train[train_index],y[train_index]
        hold_out_x,hold_out_y = X_train[test_index],y[test_index]
        
        # change pos ratio
        new_curr_x,new_curr_y = del_data_ratio(curr_x,curr_y)
        new_hold_x,new_hold_y = del_data_ratio(hold_out_x,hold_out_y)
        
        # model
        model = get_model()
        batch_size = 256
        epochs = 6
        file_path="weights_base.best.h5"
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint] 
        
        # train and pred
        model.fit(new_curr_x, new_curr_y, 
                  batch_size=batch_size, epochs=epochs, 
                  validation_data=(new_hold_x,new_hold_y), 
                  callbacks=callbacks_list)
        
        model = load_model(file_path)
        y_test = model.predict(X_test)
        test_pred += y_test
        hold_out_pred = model.predict(hold_out_x)
        train_pred[test_index] = hold_out_pred
    test_pred = test_pred / fold_cnt
    print('-------------------------------')
    print('all eval',eval_val(y,train_pred))
    return train_pred, test_pred


train_pred,test_pred = kf_train(fold_cnt=5)

(127656, 150)
[    10     13     23 ..., 127626 127631 127639]
(125221, 150) 12171
(31915, 150)
[    6    12    16 ..., 31893 31901 31906]
(31290, 150) 3123
Train on 125221 samples, validate on 31290 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
(127657, 150)
[     6     12     16 ..., 127627 127632 127640]
(125211, 150) 12229
(31914, 150)
[   10    13    23 ..., 31894 31907 31908]
(31301, 150) 3065
Train on 125211 samples, validate on 31301 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
(127657, 150)
[     6     12     16 ..., 127627 127632 127640]
(125203, 150) 12267
(31914, 150)
[    7    17    26 ..., 31903 31904 31912]
(31308, 150) 3027
Train on 125203 samples, validate on 31308 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
(127657, 150)
[     6     12     16 ..., 127627 127632 127640]
(125206, 150) 12252
(31914, 150)
[   10    23    58 ..., 31886 31906 31911]
(31305, 150) 3042
Train on 125206 samples, validate on 31

Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.0905073830327 0.965833390779
1 0.0219169597443 0.990668730534
2 0.0461252664947 0.98210827782
3 0.00896275124672 0.997280207557
4 0.0597781160487 0.975296263105
5 0.0202313809194 0.992736775479
final 0.0412536429144 0.983987274212
all eval None


In [11]:
print(train_pred.shape,test_pred.shape)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/cudnn_gru_glove_1_csv_sample.gz", index=False, compression='gzip')
import pickle
with open('../features/cudnn_gru_glove_1_sample_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
sample_submission.head()


(159571, 6) (153164, 6)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.994163,0.3585157,0.943492,0.113758,0.886089,0.353726
1,0000247867823ef7,0.000202,1.994826e-07,3.1e-05,2e-06,1.4e-05,6e-06
2,00013b17ad220c46,0.000315,8.70925e-07,0.00013,7e-06,3.6e-05,1.2e-05
3,00017563c3f7919a,0.000297,3.256011e-07,3.4e-05,5e-06,2.7e-05,5e-06
4,00017695ad8997eb,0.004468,4.849345e-06,0.000465,3.5e-05,0.000293,4e-05


In [12]:
sample_submission[list_classes] = test_pred/1.2
sample_submission.to_csv("../results/cudnn_gru_glove_1_csv_sample_div2.gz", index=False, compression='gzip')
sample_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.828469,0.2987631,0.786243,0.094798,0.738407,0.294772
1,0000247867823ef7,0.000168,1.662355e-07,2.6e-05,2e-06,1.2e-05,5e-06
2,00013b17ad220c46,0.000262,7.257708e-07,0.000109,6e-06,3e-05,1e-05
3,00017563c3f7919a,0.000247,2.713343e-07,2.8e-05,4e-06,2.3e-05,4e-06
4,00017695ad8997eb,0.003723,4.041121e-06,0.000388,2.9e-05,0.000244,3.3e-05
