In [1]:
import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
from keras.models import Model, load_model
from keras.layers import Dense, Embedding, Input, GRU
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,GlobalAveragePooling1D,Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from models_def import Attention

# prepare other feat
# Paths of the pickled feature files, one per upstream feature set / model.
# The order is significant: the loader below stacks the arrays column-wise
# in exactly this order.
fl = [
    '../features/%s.pkl' % feat_name
    for feat_name in (
        'other_feat',
        'lgb1_feat',
        'rf1_feat',
        'gbrt1_feat',
        'lr_feat1',
        'lr_feat2',
        'ridge_feat1',
        'ridge_feat2',
        'mnb_feat1',
        'mnb_feat2',
        'wordbatch_feat',
        'tilli_lr_feat',
    )
]
def get_feat(f):
    """Load one pickled feature file.

    f: path of a pickle file whose payload unpacks into exactly two items.

    Returns the two items as a tuple, in the order they were stored.
    """
    with open(f, mode='rb') as handle:
        first, second = pickle.load(handle)
    return first, second

# load feats
# Every file in `fl` holds a (train, test) pair of 2-D arrays; collect them
# and stack them column-wise into one dense feature matrix per split.
train_x, test_x = [], []
for feat in fl:
    print('file path', feat)
    # Go through get_feat() so the file handle is closed by its `with` block
    # instead of being left open until garbage collection.
    a, b = get_feat(feat)
    print(a.shape, b.shape)
    train_x.append(a)
    test_x.append(b)
# nan_to_num replaces NaN/inf with finite numbers before any scaling happens.
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)
print(train_x[0])

from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
print(train_x[0])

# Passed to the Tokenizer as num_words (vocabulary cap) further down.
max_features = 160000
# Length every token sequence is padded/truncated to by pad_sequences.
maxlen = 250

# Contraction replacement patterns
# Ordered (regex, replacement) byte-string pairs; new_clean() applies them one
# after another, so earlier entries see the text before later ones do.
# Raw literals are used wherever a backslash appears so that regex escapes
# (\w, \d, \s, \g<1>, ...) do not depend on Python passing unknown string
# escapes through unchanged, which is deprecated and warns on modern versions.
cont_patterns = [
    (rb"(W|w)on't", b'will not'),
    (rb"(C|c)an't", b'can not'),
    (rb"(I|i)'m", b'i am'),
    (rb"(A|a)in't", b'is not'),
    (rb"(\w+)'ll", rb'\g<1> will'),
    (rb"(\w+)n't", rb'\g<1> not'),
    (rb"(\w+)'ve", rb'\g<1> have'),
    (rb"(\w+)'s", rb'\g<1> is'),
    (rb"(\w+)'re", rb'\g<1> are'),
    (rb"(\w+)'d", rb'\g<1> would'),
    (rb'&lt;3', b' heart '),
    # ':dd' has to run before ':d'; otherwise ':d' consumes the prefix first
    # and ':dd' turns into ' smile d'.
    (rb':dd', b' smile '),
    (rb':d', b' smile '),
    (rb':p', b' smile '),
    (rb'8\)', b' smile '),
    (rb':-\)', b' smile '),
    (rb':\)', b' smile '),
    (rb';\)', b' smile '),
    (rb'\(-:', b' smile '),
    (rb'\(:', b' smile '),
    (rb'yay!', b' good '),
    (rb'yay', b' good '),
    (rb'yaay', b' good '),
    (rb':/', b' worry '),
    (rb':&gt;', b' angry '),
    (rb":'\)", b' sad '),
    (rb':-\(', b' sad '),
    (rb':\(', b' sad '),
    (rb':s', b' sad '),
    (rb':-s', b' sad '),
    # IP-like groups of digits. NOTE(review): the separators are unescaped
    # '.', i.e. "any byte"; left unchanged because text arriving through
    # clean_text() has already had its dots replaced by spaces.
    (rb'\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}', b' '),
    # Greedy: removes everything from the first '[' to the last ']'.
    (rb'(\[[\s\S]*\])', b' '),
    (rb'[\s]*?(www.[\S]*)', b' ')
]
# Compile once up front; new_clean() reuses these for every comment.
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]



# Regexes used by new_clean(), compiled once at import time instead of being
# rebuilt for every comment.
_REPEATED_LETTER_RE = re.compile(rb"([a-z])\1{2,}")
_PUNCTUATION_RE = re.compile(b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
_DIGITS_RE = re.compile(rb"\d+")
_WHITESPACE_RE = re.compile(rb"\s+")


def new_clean(text):
    """Normalise one comment and wrap every remaining token in '#' signs.

    The text is lower-cased and processed as UTF-8 bytes. Steps, in order:
    runs of three or more identical ASCII letters collapse to one letter,
    newline/tab/backspace/carriage-return become spaces, the module-level
    `patterns` substitutions are applied, ASCII punctuation and digits are
    removed, and whitespace is collapsed and trimmed on both sides.

    text: the comment as a str.

    Returns a str such as "#my# #name# #is# #bond#"; input with nothing left
    after cleaning yields "##".
    """
    # 1. Go to lower case (only good for english); work on bytes throughout.
    clean = text.lower().encode("utf-8")

    # Replace words like hhhhhhhhhhhhhhi with hi. One backreference regex does
    # the job of 26 per-letter passes: collapsing a run never joins two runs
    # of another letter, so the result is the same.
    clean = _REPEATED_LETTER_RE.sub(rb"\1", clean)

    # 2. Drop \n, \t, \b (backspace, 0x08) and \r.
    for control_char in (b"\n", b"\t", b"\b", b"\r"):
        clean = clean.replace(control_char, b" ")

    # 3. Replace english contractions / emoticons (ordered, see cont_patterns).
    for (pattern, repl) in patterns:
        clean = pattern.sub(repl, clean)

    # 4. Drop punctuation. Done on the whole buffer; the whitespace
    # normalisation in step 6 makes a per-token split/join unnecessary.
    clean = _PUNCTUATION_RE.sub(b'', clean)

    # 5. Drop numbers.
    clean = _DIGITS_RE.sub(b" ", clean)

    # 6. Collapse whitespace and trim BOTH ends. Trimming only the tail left a
    # leading space when a comment started with digits or punctuation, which
    # step 7 then turned into an empty "##" token.
    clean = _WHITESPACE_RE.sub(b" ", clean).strip()

    # 7. Surround every word with '#', e.g. "my name" -> "#my# #name#".
    clean = b"#" + clean.replace(b" ", b"# #") + b"#"

    return str(clean, 'utf-8')

def clean_text( text ):
    """Apply the first-stage regex clean-up, then hand over to new_clean().

    The comment is lower-cased, its whitespace is squeezed to single spaces,
    and an ordered table of regex rewrites is applied. The result is passed
    to new_clean(), whose return value is returned unchanged.
    """
    # Ordered rewrites: each one sees the output of the previous one, so the
    # sequence must not be rearranged.
    rewrites = (
        (r"[^A-Za-z0-9^,!.\/'+\-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # NOTE(review): in regex syntax \0 is the NUL character, so this
        # matches NUL followed by 's', not the text "0s" - confirm intent.
        (r"\0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
    )

    cleaned = " ".join(text.lower().split())
    for regex, replacement in rewrites:
        cleaned = re.sub(regex, replacement, cleaned)
    return new_clean(cleaned)

# Load the raw train / test tables.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Missing comments are replaced by the placeholder "CVxTz" before cleaning.
list_sentences_train = train["comment_text"].fillna("CVxTz").apply(clean_text).values
# Label columns, in the order used for the model output and the submission.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").apply(clean_text).values
print(y.shape)


# The vocabulary is fitted on the training comments only; test comments are
# mapped with that same vocabulary.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
# Bring every sequence to exactly `maxlen` entries.
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(X_train.shape,X_test.shape)

# check word_index
# Print the first six (word, index) entries as a sanity check.
tmp_cnt = 0
for k in tokenizer.word_index:
    print(k,tokenizer.word_index[k])
    tmp_cnt += 1
    if tmp_cnt >5:
        break
# word -> integer index mapping, used below to build the embedding matrix.
word_idx = tokenizer.word_index

# read word2vec
# Text vector file: one "<token> <v1> <v2> ..." record per line, separated by
# single spaces. The first line is skipped (presumably the "count dim" header
# of the .vec format - confirm against the file).
word_vec_dict = {}
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform
# default, which fails or garbles non-ASCII tokens on non-UTF-8 locales.
with open('../crawl-300d-2M.vec', encoding='utf-8') as f:
    next(f, None)  # drop the first line; no error if the file is empty
    for line in f:
        v_list = line.rstrip().split(' ')
        k = v_list[0]
        v = np.array([float(x) for x in v_list[1:]])
        word_vec_dict[k] = v
print(len(word_vec_dict))
print('Preparing embedding matrix')

EMBEDDING_DIM = 300
# The Tokenizer's word_index starts at 1 (index 0 is reserved and is also the
# padding value), so a vocabulary of N words needs N + 1 rows. Without the +1,
# a vocabulary smaller than max_features made the largest index fall outside
# the matrix (IndexError). When the vocabulary is larger than max_features the
# size is max_features, exactly as before.
nb_words = min(max_features, len(word_idx) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_idx.items():
    # Skip words whose index has no row in the matrix.
    if i >= nb_words:
        continue
    vec = word_vec_dict.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
# Counts all-zero rows: words without a pretrained vector plus the reserved
# row 0.
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
# The full vector table is large and no longer needed once the matrix exists.
del word_vec_dict

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


file path ../features/other_feat.pkl
(159571, 37) (153164, 37)
file path ../features/lgb1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/rf1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/gbrt1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/ridge_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/ridge_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/wordbatch_feat.pkl
(159571, 6) (153164, 6)
file path ../features/tilli_lr_feat.pkl
(159571, 6) (153164, 6)
(159571, 103)
[ 5.00000000e+01  4.60000000e+01  2.64000000e+02  0.00000000e+00
  2.00000000e+01  0.00000000e+00  3.00000000e+00  1.20000000e+01
  4.24000000e+00  0.00000000e+00  6.43939394e-02  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000

In [2]:
from sklearn.metrics import log_loss,accuracy_score
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNLSTM
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

def eval_val(y,train_x):
    """Print per-label log loss and accuracy, then their unweighted means.

    y: 2-D array of true labels, one column per label.
    train_x: 2-D array of predicted probabilities with the same column order
        as `y` (despite the name, this holds predictions, not features).

    Returns a (mean_log_loss, mean_accuracy) tuple so callers can log or
    compare the summary instead of receiving None.
    """
    # Take the label count from the data rather than hard-coding 6.
    n_labels = y.shape[1]
    res = 0
    acc_res = 0
    for i in range(n_labels):
        curr_loss = log_loss(y[:, i], train_x[:, i])
        # Accuracy is measured on probabilities rounded to 0/1.
        acc = accuracy_score(y[:, i], train_x[:, i].round())
        print(i, curr_loss, acc)
        res += curr_loss
        acc_res += acc
    mean_loss = res / n_labels
    mean_acc = acc_res / n_labels
    print('final', mean_loss, mean_acc)
    return mean_loss, mean_acc

def get_model(comp):
    """Build the two-input classifier; compile it when `comp` is truthy.

    Input 1 is a padded token sequence of length `maxlen`; input 2 ("other")
    is one row of the stacked feature matrix `train_x`. The sequence passes
    through the frozen pretrained embedding, spatial dropout and a
    bidirectional CuDNNLSTM. Its attention, average-pool and max-pool
    summaries are concatenated with input 2 and fed to a 256-unit ReLU layer
    and a 6-unit sigmoid output.

    Returns the Keras Model (uncompiled when `comp` is falsy).
    """
    tokens_in = Input(shape=(maxlen, ))
    feats_in = Input(shape=[train_x.shape[1]], name="other")

    # Pretrained vectors stay fixed during training (trainable=False).
    embedded = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(tokens_in)
    embedded = SpatialDropout1D(0.4)(embedded)
    encoded = Bidirectional(CuDNNLSTM(128, return_sequences=True))(embedded)

    # Three summaries of the encoded sequence. Attention comes from the
    # project's models_def module.
    pooled_avg = GlobalAveragePooling1D()(encoded)
    pooled_max = GlobalMaxPooling1D()(encoded)
    attended = Attention(maxlen)(encoded)

    merged = concatenate([attended, pooled_avg, pooled_max, feats_in])
    hidden = Dense(256, activation="relu")(merged)
    probabilities = Dense(6, activation="sigmoid")(hidden)

    model = Model(inputs=[tokens_in, feats_in], outputs=probabilities)
    if not comp:
        return model

    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    return model
print('def model done')

def model done


In [7]:
from sklearn.model_selection import KFold
import gc
from keras import backend as K

def kf_train(fold_cnt=3,rnd=1):
    """Train one model per fold; return out-of-fold and averaged test predictions.

    fold_cnt: number of contiguous, unshuffled K-fold splits.
    rnd: kept for backward compatibility; currently unused (see note below).

    For each fold a fresh model is trained on the other folds, with the
    held-out fold as validation data. The weights of the epoch with the
    lowest val_loss are reloaded before predicting, so the held-out fold
    drives both checkpoint selection and the out-of-fold predictions.

    Returns (train_pred, test_pred): out-of-fold predictions for every
    training row, and test predictions averaged over the folds.
    """
    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn versions raise ValueError when it is passed with
    # shuffle=False, so it is no longer forwarded. The splits are exactly the
    # unshuffled ones the original call produced.
    # NOTE(review): if seeded shuffling was intended, use
    # KFold(n_splits=fold_cnt, shuffle=True, random_state=233*rnd) - this
    # changes fold membership and therefore the logged scores.
    kf = KFold(n_splits=fold_cnt, shuffle=False)

    # Size the buffers from the data instead of hard-coded row/label counts.
    n_labels = y.shape[1]
    train_pred = np.zeros((X_train.shape[0], n_labels))
    test_pred = np.zeros((X_test.shape[0], n_labels))

    batch_size = 64
    epochs = 6
    # Reused by every fold; each fold overwrites it with its own best weights.
    file_path="weights_base.best.h5"

    for train_index, test_index in kf.split(X_train):
        # x,y
        curr_x,curr_y = X_train[train_index],y[train_index]
        curr_other_x = train_x[train_index]
        hold_out_x,hold_out_y = X_train[test_index],y[test_index]
        hold_out_other_x = train_x[test_index]

        # model
        model = get_model(True)
        # New callback per fold so its "best so far" state starts fresh.
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint]

        # train and pred
        model.fit([curr_x,curr_other_x], curr_y,
                  batch_size=batch_size, epochs=epochs,
                  validation_data=([hold_out_x,hold_out_other_x],hold_out_y),
                  callbacks=callbacks_list)

        # Predict with the best checkpoint, not the last epoch.
        model.load_weights(file_path)
        y_test = model.predict([X_test,test_x])
        test_pred += y_test
        hold_out_pred = model.predict([hold_out_x,hold_out_other_x])
        train_pred[test_index] = hold_out_pred

        # clear
        del model
        gc.collect()
        K.clear_session()
    test_pred = test_pred / fold_cnt
    print('-------------------------------')
    # eval_val prints its own report; echo its return value only when it
    # provides one, instead of printing "all eval None".
    summary = eval_val(y, train_pred)
    if summary is not None:
        print('all eval', summary)
    return train_pred, test_pred


print('def done')

def done


In [8]:
import pickle
# Submission template; its label columns are overwritten with predictions in
# a later cell.
sample_submission = pd.read_csv("../input/sample_submission.csv")

# 5-fold run. rnd is forwarded to kf_train.
train_pred,test_pred = kf_train(fold_cnt=5,rnd=42)
print(train_pred.shape,test_pred.shape)    

# 40000,150,lstm + global max_pool
# final 0.0407274256871 0.984048897774

# 100000,150 lstm + attention, glove embedding
# final 0.0404159162853 0.984188856371, pub 9849
# 3996, 4093

# 100000,150 lstm + attention, use spatial dropout, spatial 0.2, last dropout 0.5, fasttext embedding
# 1st epo 4016, 2nd epo 4117, no better than the glove result

# 100000,150,test arch
#     x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
#     x = Dropout(0.2)(x)
#     x = Bidirectional(LSTM(64, return_sequences=True))(x)
#     x = Attention(maxlen)(x)
#     x = Dense(6, activation="sigmoid")(x)
# 1st epo 4116, not good

# 100000,150,test arch
#     x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
#     x = Dropout(0.2)(x)
#     x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
#     att = Attention(maxlen)(x)
#     avg_pool = GlobalAveragePooling1D()(x)
#     max_pool = GlobalMaxPooling1D()(x)
#     conc = concatenate([att,avg_pool, max_pool])
#     x = Dense(256, activation="relu")(conc)
#     x = Dense(6, activation="sigmoid")(x)
# 1st epo , old LSTM 3945
# to save time ,change to CuDNNLSTM
# 1st epo , 3928, 4 fold: final 0.0393455938053 0.984445795289
# 10 fold: final 0.0391567844913 0.984588887287 PUB 9857

# new adj
# 5 fold: final 0.037041229455724294 0.985068297706559 PUB 9862

Train on 127656 samples, validate on 31915 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 127657 samples, validate on 31914 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 127657 samples, validate on 31914 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 127657 samples, validate on 31914 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 127657 samples, validate on 31914 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6


Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.07931138260284377 0.9689605254087522
1 0.020891926770841427 0.9909507366626769
2 0.040113176926091586 0.9834932412531099
3 0.0076026471764150585 0.9973115415708368
4 0.055595746264591975 0.9766812265386567
5 0.018732496993561963 0.9930125148053218
final 0.037041229455724294 0.985068297706559
all eval None
(159571, 6) (153164, 6)


In [9]:
# Write the 5-fold test predictions into the submission template and save it
# as a gzip-compressed CSV.
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/lstm_attention_fasttext_sample_5.gz", index=False, compression='gzip')
# Also store [out-of-fold train predictions, test predictions] as a pickle -
# a two-item payload like the ones the feature loader at the top unpacks.
with open('../features/lstm_attention_fasttext_5_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
print(sample_submission.head())
print('===================================')

                 id     toxic  severe_toxic   obscene        threat    insult  \
0  00001cee341fdb12  0.999125  3.895919e-01  0.971504  2.315079e-01  0.932501   
1  0000247867823ef7  0.000086  4.912789e-08  0.000006  1.349892e-06  0.000005   
2  00013b17ad220c46  0.000011  3.925341e-09  0.000002  2.898996e-07  0.000001   
3  00017563c3f7919a  0.000061  1.205650e-07  0.000006  1.818519e-05  0.000012   
4  00017695ad8997eb  0.000678  1.680068e-07  0.000024  6.937993e-06  0.000020   

   identity_hate  
0   5.447136e-01  
1   5.466332e-07  
2   4.879677e-07  
3   1.390482e-06  
4   2.878596e-06  


In [10]:
# Same pipeline as the 5-fold run, with 10 folds. This rebinds train_pred and
# test_pred and overwrites the label columns of sample_submission.
train_pred,test_pred = kf_train(fold_cnt=10,rnd=42)
print(train_pred.shape,test_pred.shape) 
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/lstm_attention_fasttext_sample_10.gz", index=False, compression='gzip')
# Store [out-of-fold train predictions, test predictions] as a pickle.
with open('../features/lstm_attention_fasttext_10_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
print(sample_submission.head())
print('===================================')

# final 0.03672656107307943 0.9850442749622426 PUB 9863

Train on 143613 samples, validate on 15958 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6


Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Train on 143614 samples, validate on 15957 samples
Epoch 1/6


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
-------------------------------
0 0.07845973429999673 0.9688414561543137
1 0.02069356194870345 0.9907376653652606
2 0.04001948253566198 0.9833804388015366
3 0.007756079238468699 0.9972739407536457
4 0.054908620361319854 0.9767689617787694
5 0.018521888054325884 0.9932631869199291
final 0.03672656107307943 0.9850442749622426
all eval None
(159571, 6) (153164, 6)
                 id     toxic  severe_toxic   obscene        threat    insult  \
0  00001cee341fdb12  0.998661  3.786862e-01  0.957138  1.786500e-01  0.912598   
1  0000247867823ef7  0.000191  1.732031e-08  0.000011  1.745301e-06  0.000009   
2  00013b17ad220c46  0.000019  3.861570e-10  0.000008  1.205197e-07  0.000001   
3  00017563c3f7919a  0.000094  1.363162e-08  0.000008  1.649089e-05  0.000011   
4  00017695ad8997eb  0.000731  3.728374e-08  0.000039  9.583951e-06  0.000029   

   identity_hate  
0   4.857816e-01  
1   2.278547e-06  
2   2.891617e-07  
3   2.257141e-06  
4   