In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from keras.models import Model, load_model
from keras.layers import Dense, Embedding, Input, GRU
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,GlobalAveragePooling1D,Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from models_def import Attention

max_features = 100000
maxlen = 150

def clean_text( text ):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    #
    return text

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

list_sentences_train = train["comment_text"].fillna("CVxTz").apply(clean_text).values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").apply(clean_text).values
print(y.shape)


tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(X_train.shape,X_test.shape)

# check word_index
tmp_cnt = 0
for k in tokenizer.word_index:
    print(k,tokenizer.word_index[k])
    tmp_cnt += 1
    if tmp_cnt >5:
        break
word_idx = tokenizer.word_index

# read word2vec
# 
word_vec_dict = {}
with open('../crawl-300d-2M.vec') as f:
    first_line_flag = True
    for line in f:
        if first_line_flag:
            first_line_flag= False
            continue
        v_list = line.rstrip().split(' ')
        k = str(v_list[0])
        v = np.array([float(x) for x in v_list[1:]])
        word_vec_dict[k] = v
print(len(word_vec_dict))
print('Preparing embedding matrix')


EMBEDDING_DIM = 300
nb_words = min(max_features,len(word_idx))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word,i in word_idx.items():
    if i >= max_features:
        continue
    else:
        if word in word_vec_dict:
            embedding_matrix[i] = word_vec_dict[word]
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
del word_vec_dict

Using TensorFlow backend.


(159571, 6)
(159571, 150) (153164, 150)
hatre 88860
schwartzchild 85345
shoulod 94919
repulsive 18402
niggling 44622
a1dcrwtutq 58358
2000000
Preparing embedding matrix
Null word embeddings: 29069


In [2]:
from sklearn.metrics import log_loss,accuracy_score
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNLSTM
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

def eval_val(y,train_x):
    res = 0
    acc_res = 0
    for i in range(6):
        curr_loss = log_loss(y[:,i],train_x[:,i])
        acc = accuracy_score(y[:,i],train_x[:,i].round())
        print(i,curr_loss,acc)
        res += curr_loss
        acc_res += acc
    print('final',res/6, acc_res/6)

def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
    x = Dropout(0.2)(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    att = Attention(maxlen)(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([att,avg_pool, max_pool])
    x = Dense(256, activation="relu")(conc)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
print('def model done')

def model done


In [3]:
from sklearn.model_selection import KFold
def kf_train(fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(X_train):
        # x,y
        curr_x,curr_y = X_train[train_index],y[train_index]
        hold_out_x,hold_out_y = X_train[test_index],y[test_index]
        
        # model
        model = get_model()
        batch_size = 64
        epochs = 10
        file_path="weights_base.best.h5"
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint] 
        
        # train and pred
        model.fit(curr_x, curr_y, 
                  batch_size=batch_size, epochs=epochs, 
                  validation_data=(hold_out_x,hold_out_y), 
                  callbacks=callbacks_list)
        
        model.load_weights(file_path)
        y_test = model.predict(X_test)
        test_pred += y_test
        hold_out_pred = model.predict(hold_out_x)
        train_pred[test_index] = hold_out_pred
    test_pred = test_pred / fold_cnt
    print('-------------------------------')
    print('all eval')
    eval_val(y,train_pred)
    return train_pred, test_pred


print('def done')

def done


In [4]:
import pickle
sample_submission = pd.read_csv("../input/sample_submission.csv")

train_pred,test_pred = kf_train(fold_cnt=4,rnd=4)
print(train_pred.shape,test_pred.shape)    

# 40000,150,lstm + global max_pool
# final 0.0407274256871 0.984048897774

# 100000,150 lstm + attention, glove embedding
# final 0.0404159162853 0.984188856371, pub 9849
# 3996, 4093

# 100000,150 lstm + attention, use spacial dropout,spacial 0.2, last dropout 0.5, fasttext embedding
# 1st epo 4016, 2nd epo 4117, not better compare to glove res

# 100000,150,test arch
#     x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
#     x = Dropout(0.2)(x)
#     x = Bidirectional(LSTM(64, return_sequences=True))(x)
#     x = Attention(maxlen)(x)
#     x = Dense(6, activation="sigmoid")(x)
# 1st epo 4116, not good

# 100000,150,test arch
#     x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
#     x = Dropout(0.2)(x)
#     x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
#     att = Attention(maxlen)(x)
#     avg_pool = GlobalAveragePooling1D()(x)
#     max_pool = GlobalMaxPooling1D()(x)
#     conc = concatenate([att,avg_pool, max_pool])
#     x = Dense(256, activation="relu")(conc)
#     x = Dense(6, activation="sigmoid")(x)
# 1st epo , old LSTM 3945
# to save time ,change to CuDNNLSTM
# 1st epo , 3928, 4 fold: final 0.0393455938053 0.984445795289
# 10 fold: final 0.0391567844913 0.984588887287 PUB 9857


Train on 119678 samples, validate on 39893 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 119678 samples, validate on 39893 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 119678 samples, validate on 39893 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10
Train on 119679 samples, validate on 39892 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
-------------------------------
all eval
0 0.0860980906211 0.966779678012
1 0.021548503393 0.99085673462
2 0.0433285088743 0.982948029404
3 0.00767947049149 0.997386743205
4 0.0580409550316 0.975709872095
5 0.0193780344201 0.992993714397
final 0.0393455938053 0.984445795289
(159571, 6) (153164, 6)


In [5]:
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/lstm_attention_fasttext_sample_4.gz", index=False, compression='gzip')
with open('../features/lstm_attention_fasttext_4_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
print(sample_submission.head())
print('===================================')

                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.991859  3.417225e-01  0.934774  0.123595  0.883666   
1  0000247867823ef7  0.000491  3.515752e-06  0.000112  0.000004  0.000050   
2  00013b17ad220c46  0.000944  9.291653e-06  0.000206  0.000021  0.000140   
3  00017563c3f7919a  0.000116  8.790947e-07  0.000033  0.000004  0.000021   
4  00017695ad8997eb  0.009057  1.704790e-05  0.000801  0.000089  0.000331   

   identity_hate  
0       0.383749  
1       0.000008  
2       0.000072  
3       0.000002  
4       0.000049  


In [6]:
train_pred,test_pred = kf_train(fold_cnt=10,rnd=4)
print(train_pred.shape,test_pred.shape) 
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/lstm_attention_fasttext_sample_10.gz", index=False, compression='gzip')
with open('../features/lstm_attention_fasttext_10_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
print(sample_submission.head())
print('===================================')

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 143614 samples, validate on 15957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
-------------------------------
all eval
0 0.0864134873758 0.966804745223
1 0.0214379711889 0.990537127674
2 0.043058395495 0.983204968321
3 0.00752564715107 0.997468211642
4 0.0575143864776 0.976298951564
5 0.0189908192596 0.9932193193
final 0.0391567844913 0.984588887287
(159571, 6) (153164, 6)
                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.992837  4.137306e-01  0.936927  0.164999  0.895366   
1  0000247867823ef7  0.000249  1.016843e-06  0.000046  0.000001  0.000027   
2  00013b17ad220c46  0.000740  1.048361e-05  0.000199  0.000013  0.000102   
3  00017563c3f7919a  0.000172  7.982660e-07  0.000044  0.000009  0.000040   
4  00017695ad8997eb  0.004180  1.450363e-05  0.000314  0.000043  0.000222   

   identity_hate  
0       0.557