In [1]:
# ref: https://www.kaggle.com/jacklinggu/lstm-with-glove-embedding-public-lb-score-0-049

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from keras.models import Model, load_model
from keras.layers import Dense, Embedding, Input, Flatten, MaxPool1D
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,GlobalAveragePooling1D,Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_features = 40000
maxlen = 150

def clean_text( text ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    #text = BeautifulSoup(review,'html.parser').get_text()
    #
    # 2. Remove non-letters
    text = re.sub("[^A-za-z0-9^,?!.\/'+-=]"," ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    #
    return text.lower()

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

list_sentences_train = train["comment_text"].fillna("CVxTz").apply(clean_text).values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").apply(clean_text).values
print(y.shape)

Using TensorFlow backend.


(95851, 6)


In [2]:
list_sentences_train[:5]

array([ 'nonsense?  kiss off, geek. what i said is true.  i will  have your account terminated.',
       '    please do not vandalize pages, as you did with this edit to w. s. merwin. if you continue to do so, you will be blocked from editing.     ',
       '      points of interest     i removed the   points of interest   section you added because it seemed kind of spammy. i know you probably did not  mean to disobey the rules, but generally, a point of interest tends to be rather touristy, and quite irrelevant to an area culture. that  just my opinion, though.  if you want to reply, just put your reply here and add   talkback jamiegraham08   on my talkpage.    ',
       'asking some his nationality is a racial offence. wow was not  aware of it.  blocking me has shown your support towards your community. thanku for that',
       'the reader here is not going by my say so for ethereal vocal style and dark lyrical content. the cited sources in the external links are saying those things.

In [3]:
print('test len',len(test))

test len 226998


In [4]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(X_train.shape,X_test.shape)

(95851, 150) (226998, 150)


In [5]:
# check word_index
tmp_cnt = 0
for k in tokenizer.word_index:
    print(k,tokenizer.word_index[k])
    tmp_cnt += 1
    if tmp_cnt >5:
        break
word_idx = tokenizer.word_index

s9 138179
vse2kwkwh0yc 134170
'pox 78037
asing 135072
atropatne 120328
fractured 27636


In [6]:
# read word2vec
# https://github.com/facebookresearch/MUSE
word_vec_dict = {}
with open('../wiki.multi.en.vec') as f:
    first_line_flag = True
    for line in f:
        if first_line_flag:
            first_line_flag= False
            continue
        v_list = line.split(' ')
        k = str(v_list[0])
        v = np.array([float(x) for x in v_list[1:]])
        word_vec_dict[k] = v
print(len(word_vec_dict))
print(word_vec_dict['is'])
print(word_vec_dict['are'])

200000
[ 0.0137334   0.0554924   0.0455881   0.0301357  -0.0182521   0.0385928
  0.0347148  -0.0847695  -0.036347   -0.00864285  0.00292431  0.0167314
  0.0195147  -0.0372235  -0.014314    0.0173197   0.00033499 -0.0477708
  0.0113374   0.0266912  -0.0615091   0.0665893  -0.125759   -0.069915
 -0.0024989   0.022528    0.00747391  0.0752322  -0.0552592   0.0327767
 -0.0275065   0.144234   -0.130117    0.0105687   0.00473044 -0.0610046
 -0.0559855   0.0619029   0.0353677  -0.0334999  -0.0226966  -0.00395828
 -0.0283532   0.0217597  -0.0418534   0.109104    0.0736382   0.00400721
 -0.0209592  -0.0116593   0.0260413  -0.0188025   0.0445063  -0.0389139
  0.0402938   0.0368409   0.116023    0.0378068   0.0615779   0.0601903
  0.0328234  -0.0483939   0.0331058  -0.00478472 -0.0229684   0.0221889
 -0.0747123   0.0113791   0.0517195  -0.0209997  -0.0373122   0.0159027
  0.0738867  -0.0272447  -0.15535    -0.0287317   0.0143244   0.093508
  0.0261212  -0.0315336  -0.0178931   0.049338    0.03252

In [7]:
print('Preparing embedding matrix')
EMBEDDING_DIM = 300
nb_words = min(max_features,len(word_idx))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word,i in word_idx.items():
    if i >= max_features:
        continue
    else:
        if word in word_vec_dict:
            embedding_matrix[i] = word_vec_dict[word]
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 5293


In [8]:
from sklearn.metrics import log_loss,accuracy_score

def eval_val(y,train_x):
    res = 0
    acc_res = 0
    for i in range(6):
        curr_loss = log_loss(y[:,i],train_x[:,i])
        acc = accuracy_score(y[:,i],train_x[:,i].round())
        print(i,curr_loss,acc)
        res += curr_loss
        acc_res += acc
    print('final',res/6, acc_res/6)

def get_cnn_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
    x = Conv1D(384,
             5,
             padding='valid',
             activation='relu',
             strides=1)(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
print('def model done')


def model done


In [9]:
tmp_m=get_cnn_model()
tmp_m.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          12000000  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 146, 384)          576384    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 384)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 384)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               98560     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

In [10]:
from sklearn.model_selection import KFold
def kf_train(fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((95851,6)),np.zeros((226998,6))
    for train_index, test_index in kf.split(X_train):
        # x,y
        curr_x,curr_y = X_train[train_index],y[train_index]
        hold_out_x,hold_out_y = X_train[test_index],y[test_index]
        
        # model
        model = get_cnn_model()
        batch_size = 64
        epochs = 10
        file_path="weights_base.best.h5"
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint] 
        
        # train and pred
        model.fit(curr_x, curr_y, batch_size=batch_size, epochs=epochs, 
                  validation_data=(hold_out_x,hold_out_y), callbacks=callbacks_list)
        model = load_model(file_path)
        y_test = model.predict(X_test)
        test_pred += y_test
        hold_out_pred = model.predict(hold_out_x)
        train_pred[test_index] = hold_out_pred
    test_pred = test_pred / fold_cnt
    print('-------------------------------')
    print('all eval',eval_val(y,train_pred))
    return train_pred, test_pred


train_pred,test_pred = kf_train()
print(train_pred.shape,test_pred.shape)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/cnn_muse_adj_2_csv.gz", index=False, compression='gzip')
import pickle
with open('../features/cnn_muse_adj_2_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
print('done')

Train on 63900 samples, validate on 31951 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 63901 samples, validate on 31950 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 63901 samples, validate on 31950 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10
-------------------------------
0 0.104356182777 0.961753137682
1 0.0236904011323 0.990568695162
2 0.0557392148131 0.979520297128
3 0.0126944686347 0.996828410763
4 0.0682361828897 0.97227989275
5 0.0241961483835 0.992342281249
final 0.0481520997718 0.982215452456
all eval None
(95851, 6) (226998, 6)
done


In [11]:
train_pred[:10].round(3)

array([[ 0.36 ,  0.   ,  0.018,  0.   ,  0.02 ,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.019,  0.   ,  0.002,  0.   ,  0.002,  0.001],
       [ 0.002,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.314,  0.005,  0.084,  0.006,  0.07 ,  0.007],
       [ 0.005,  0.   ,  0.001,  0.   ,  0.001,  0.   ],
       [ 0.028,  0.   ,  0.004,  0.   ,  0.005,  0.   ],
       [ 0.057,  0.   ,  0.007,  0.   ,  0.01 ,  0.001],
       [ 0.007,  0.   ,  0.002,  0.   ,  0.002,  0.   ]])

In [12]:
y[:10]

array([[1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])