In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from keras.models import Model, load_model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout,GlobalAveragePooling1D,Conv1D,Conv2D,Reshape
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_features = 40000
maxlen = 150

def clean_text( text ):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    #
    return text

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

list_sentences_train = train["comment_text"].fillna("CVxTz").apply(clean_text).values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").apply(clean_text).values
print(y.shape)

Using TensorFlow backend.


(159571, 6)


In [2]:
list_sentences_train[:5]

array([ 'explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now 89 205 38 27',
       'd aww ! he matches this background colour i am seemingly stuck with thanks talk 21 51 january 11 2016 utc ',
       'hey man i am really not trying to edit war it just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info ',
       ' more i cannot make any real suggestions on improvement - i wondered if the section statistics should be later on or a subsection of types of accidents - i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no - one else does first - if you have any preferences for formatting styl

In [3]:
print('test len',len(test))

test len 153164


In [4]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

print(X_train.shape,X_test.shape)

(159571, 150) (153164, 150)


In [5]:
# check word_index
tmp_cnt = 0
for k in tokenizer.word_index:
    print(k,tokenizer.word_index[k])
    tmp_cnt += 1
    if tmp_cnt >5:
        break
word_idx = tokenizer.word_index

jaszag 135029
ocbvious 142057
slabs 44626
205m 142987
981e 88058
diaoyutai 117101


In [6]:
# read word2vec
# https://github.com/facebookresearch/MUSE
word_vec_dict = {}
with open('../wiki.multi.en.vec') as f:
    first_line_flag = True
    for line in f:
        if first_line_flag:
            first_line_flag= False
            continue
        v_list = line.split(' ')
        k = str(v_list[0])
        v = np.array([float(x) for x in v_list[1:]])
        word_vec_dict[k] = v
print(len(word_vec_dict))


200000


In [7]:
print('Preparing embedding matrix')
EMBEDDING_DIM = 300
nb_words = min(max_features,len(word_idx))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word,i in word_idx.items():
    if i >= max_features:
        continue
    else:
        if word in word_vec_dict:
            embedding_matrix[i] = word_vec_dict[word]
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 4273


In [8]:
from sklearn.metrics import log_loss,accuracy_score
from keras.layers import MaxPool2D,concatenate,Flatten

def eval_val(y,train_x):
    res = 0
    acc_res = 0
    for i in range(6):
        curr_loss = log_loss(y[:,i],train_x[:,i])
        acc = accuracy_score(y[:,i],train_x[:,i].round())
        print(i,curr_loss,acc)
        res += curr_loss
        acc_res += acc
    print('final',res/6, acc_res/6)

def get_cnn_model(comp=True):
    # https://github.com/bhaveshoswal/CNN-text-classification-keras/blob/master/model.py
    inp = Input(shape=(maxlen, ))
    x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix],trainable=False)(inp)
    x = Reshape((maxlen,EMBEDDING_DIM,1))(x)
    x = Dropout(0.2)(x)
   
    x1 = Conv2D(128,kernel_size=(3,EMBEDDING_DIM),activation='relu')(x)
    x1 = MaxPool2D(pool_size=(maxlen - 3 + 1, 1), strides=(1,1), padding='valid')(x1)
    
    x2 = Conv2D(128,kernel_size=(5,EMBEDDING_DIM),activation='relu')(x)
    x2 = MaxPool2D(pool_size=(maxlen - 5 + 1, 1), strides=(1,1), padding='valid')(x2)
    
    x3 = Conv2D(128,kernel_size=(7,EMBEDDING_DIM),activation='relu')(x)
    x3 = MaxPool2D(pool_size=(maxlen - 7 + 1, 1), strides=(1,1), padding='valid')(x3)
    
    x = concatenate([x1,x2,x3])
    x = Flatten()(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    if comp:
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    return model
print('def model done')

tmp_m = get_cnn_model()
tmp_m.summary()

def model done
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     12000000    input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 150, 300, 1)  0           embedding_1[0][0]                
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 150, 300, 1)  0           reshape_1[0][0]                  
______________________________________________________________________________________________

In [9]:
def data_gen(x_data,y_data,batch_size=64):
    data_cnt = len(y_data)
    curr_idx = 0
    while True:
        if curr_idx+batch_size>=data_cnt:
            start_idx,end_idx = data_cnt-batch_size,data_cnt
            curr_idx = 0
        else:
            start_idx,end_idx = curr_idx,curr_idx+batch_size
            curr_idx += batch_size
            
        curr_x = x_data[start_idx:end_idx]
        curr_y = y_data[start_idx:end_idx]
        yield curr_x,curr_y

In [11]:
from sklearn.model_selection import KFold
def kf_train(fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(X_train):
        # x,y
        curr_x,curr_y = X_train[train_index],y[train_index]
        hold_out_x,hold_out_y = X_train[test_index],y[test_index]
        train_gen = data_gen(curr_x,curr_y)
        
        # model
        model = get_cnn_model()
        batch_size = 64
        epochs = 20
        file_path="weights_base.best.h5"
        checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        callbacks_list = [checkpoint] 
        
        # train and pred
        model.fit_generator(train_gen, 
                            steps_per_epoch=400, 
                            epochs=epochs, 
                            validation_data=(hold_out_x,hold_out_y), 
                            callbacks=callbacks_list)
        model = load_model(file_path)
        y_test = model.predict(X_test)
        test_pred += y_test
        hold_out_pred = model.predict(hold_out_x)
        train_pred[test_index] = hold_out_pred
    test_pred = test_pred / fold_cnt
    print('-------------------------------')
    print('all eval',eval_val(y,train_pred))
    return train_pred, test_pred


train_pred,test_pred = kf_train()
print(train_pred.shape,test_pred.shape)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = test_pred
sample_submission.to_csv("../results/cnn2d_muse_1_csv.gz", index=False, compression='gzip')
import pickle
with open('../features/cnn2d_muse_1_feat.pkl','wb') as fout:
    pickle.dump([train_pred,test_pred],fout)
print('done')

# pre cnn 4551, 4565, 4717
# cnn2d, with data gen
# 4685, 4647,4865

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


Epoch 19/20
Epoch 20/20
-------------------------------
0 0.100683158889 0.962493184852
1 0.0229545795711 0.99072513176
2 0.0515338859212 0.980554110709
3 0.010184966569 0.997242606739
4 0.064816459702 0.973973967701
5 0.0230279028474 0.992204097236
final 0.04553349225 0.982865516499
all eval None
(159571, 6) (153164, 6)
done


In [12]:
train_pred[:10].round(3)

array([[ 0.007,  0.   ,  0.001,  0.   ,  0.   ,  0.   ],
       [ 0.002,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.044,  0.   ,  0.005,  0.   ,  0.002,  0.   ],
       [ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.131,  0.001,  0.019,  0.002,  0.028,  0.002],
       [ 0.002,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.873,  0.017,  0.608,  0.008,  0.345,  0.012],
       [ 0.018,  0.   ,  0.003,  0.   ,  0.001,  0.   ],
       [ 0.007,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ],
       [ 0.007,  0.   ,  0.001,  0.   ,  0.001,  0.   ]])

In [13]:
y[:10]

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])