In [1]:
import pandas as pd
import numpy as np
# Load the train and test CSVs; the paths are relative to the notebook's working directory.
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

In [2]:
# add cnn feat
from keras.layers import Embedding, CuDNNLSTM, Dense, Flatten, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import gc
print('import keras done')

Using TensorFlow backend.


import keras done


In [3]:
def find_p(s):
    """Locate the first punctuation mark in ``s``.

    The marks searched for are ``.``, ``;``, ``"``, ``!`` and ``?``.

    Returns the zero-based index of the first match, or ``None`` when ``s``
    contains none of them (including when ``s`` is empty).
    """
    marks = ('.', ';', '"', '!', '?')
    # Scan left to right and stop at the first hit; fall back to None.
    return next((pos for pos, char in enumerate(s) if char in marks), None)

def text_aug(x,y,aug_cnt=5):
    """Augment a text dataset with progressively shortened suffixes of each sample.

    Every original (text, label) pair is kept. Then, up to ``aug_cnt`` times,
    the text is cut just past its first punctuation mark (as located by
    ``find_p``; the mark and the one character following it are dropped) and
    the remaining suffix is added with the same label, as long as it still has
    at least 20 space-separated tokens.

    Parameters
    ----------
    x : iterable of str
        Input texts.
    y : iterable
        Labels aligned with ``x``.
    aug_cnt : int, default 5
        Maximum number of suffixes generated per input text.

    Returns
    -------
    (list of str, numpy.ndarray)
        Augmented texts and the matching labels (``np.array`` of the labels).
    """
    new_x,new_y = [],[]
    for text,label in zip(x,y):
        # The untouched sample always comes first, followed by its suffixes.
        new_x.append(text)
        new_y.append(label)

        rest = text
        for _ in range(aug_cnt):
            cut = find_p(rest)
            # Stop when no punctuation is left. The previous code tested the
            # text itself against None, so this case fell through to
            # ``None + 2`` and raised TypeError.
            if cut is None:
                break
            # +2 skips the punctuation mark and the character right after it.
            rest = rest[cut+2:]
            # Suffixes shorter than 20 tokens are discarded, and so is
            # everything after them (they can only get shorter).
            if len(rest.split(' ')) < 20:
                break
            new_x.append(rest)
            new_y.append(label)
    return new_x,np.array(new_y)

# Raw training texts and author labels taken from train_df as NumPy arrays;
# the get_*_feats functions read these two module-level names.
org_X, org_Y = train_df['text'].values,train_df['author'].values



In [4]:
def get_cnn_feats(rnd=1):
    """Build stacking features from a small 1-D CNN text classifier using K-fold CV.

    Reads the module-level ``org_X`` / ``org_Y`` (training texts and labels)
    and ``test_df['text']``. For each of ``FEAT_CNT`` folds a fresh model is
    trained on the augmented training part (see ``text_aug``) and validated on
    the un-augmented held-out part.

    Parameters
    ----------
    rnd : int, default 1
        Multiplier for the KFold ``random_state`` (``233 * rnd``).

    Returns
    -------
    tuple of four numpy.ndarray
        ``(train_pred, test_pred, best_val_train_pred, best_val_test_pred)``:
        out-of-fold class probabilities for the training rows and
        fold-averaged probabilities for the test rows, first from the model as
        it stands after the last epoch, then from the checkpoint with the
        lowest ``val_loss``.
    """
    FEAT_CNT = 3            # number of CV folds; also the divisor for averaging test predictions
    NUM_WORDS = 30000       # tokenizer vocabulary cap and embedding input size
    N = 10                  # embedding dimension
    MAX_LEN = 100           # sequences are padded/truncated to this length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'   # checkpoint file, rewritten whenever val_loss improves

    tmp_X = org_X
    tmp_Y = org_Y
    tmp_X_test = test_df['text']

    # Size the outputs from the data instead of hard-coding the row counts,
    # so the function also works on a subsample or a different dataset.
    n_train, n_test = len(tmp_X), len(tmp_X_test)
    train_pred, test_pred = np.zeros((n_train,NUM_CLASSES)),np.zeros((n_test,NUM_CLASSES))
    best_val_train_pred, best_val_test_pred = np.zeros((n_train,NUM_CLASSES)),np.zeros((n_test,NUM_CLASSES))

    # One-hot encode the author labels.
    lb = preprocessing.LabelBinarizer()
    tmp_Y = lb.fit_transform(tmp_Y)

    # The vocabulary is fitted once on all training texts and reused for every fold.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    # test
    final_test_x = tokenizer.texts_to_sequences(tmp_X_test)
    final_test_x = pad_sequences(final_test_x, maxlen=MAX_LEN)

    # train + valid
    all_train_x = tokenizer.texts_to_sequences(tmp_X)
    all_train_x = pad_sequences(all_train_x, maxlen=MAX_LEN)

    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # Augment only the training part; the validation part stays untouched.
        aug_X, aug_Y = text_aug(tmp_X[train_index],tmp_Y[train_index])
        val_X, val_Y = all_train_x[test_index],tmp_Y[test_index]

        curr_train_x = tokenizer.texts_to_sequences(aug_X)
        curr_train_x = pad_sequences(curr_train_x, maxlen=MAX_LEN)

        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(Conv1D(16,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(16, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Keep the weights of the epoch with the lowest validation loss on disk.
        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        # shuffle=False: batches are fed in the order text_aug produced the samples.
        model.fit(curr_train_x, aug_Y,
                  validation_data=(val_X,val_Y),
                  batch_size=32, epochs=10,
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )

        # Features from the model as it stands after the last epoch.
        train_pred[test_index] = model.predict(val_X)
        test_pred += model.predict(final_test_x)/FEAT_CNT

        # Features from the best-val_loss checkpoint of this fold.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(val_X)
        best_val_test_pred += model.predict(final_test_x)/FEAT_CNT

        # release
        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def cnn done')

# Run the CNN feature builder with rnd=1. The *1 arrays come from the model after
# its last epoch, the *2 arrays from the best-val_loss checkpoint.
cnn_train1,cnn_test1,cnn_train2,cnn_test2 = get_cnn_feats(1)

def cnn done
Train on 16230 samples, validate on 6527 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.87968, saving model to /tmp/nn_model.h5
 - 2s - loss: 1.0309 - acc: 0.4682 - val_loss: 0.8797 - val_acc: 0.5761
Epoch 2/10
Epoch 00002: val_loss improved from 0.87968 to 0.57381, saving model to /tmp/nn_model.h5
 - 2s - loss: 0.6544 - acc: 0.7134 - val_loss: 0.5738 - val_acc: 0.7763
Epoch 3/10
Epoch 00003: val_loss improved from 0.57381 to 0.46719, saving model to /tmp/nn_model.h5
 - 2s - loss: 0.3622 - acc: 0.8697 - val_loss: 0.4672 - val_acc: 0.8209
Epoch 4/10
Epoch 00004: val_loss improved from 0.46719 to 0.45567, saving model to /tmp/nn_model.h5
 - 2s - loss: 0.2265 - acc: 0.9225 - val_loss: 0.4557 - val_acc: 0.8275
Epoch 5/10
Epoch 00005: val_loss did not improve
 - 2s - loss: 0.1584 - acc: 0.9463 - val_loss: 0.4784 - val_acc: 0.8263
Epoch 6/10
Epoch 00006: val_loss did not improve
 - 2s - loss: 0.1133 - acc: 0.9640 - val_loss: 0.5207 - val_acc: 0.8279
Epoch 7/10
E

In [5]:
# add lstm feat
def get_lstm_feats(rnd=1):
    """Build stacking features from a small LSTM text classifier using K-fold CV.

    Reads the module-level ``org_X`` / ``org_Y`` (training texts and labels)
    and ``test_df['text']``. For each of ``FEAT_CNT`` folds a fresh model is
    trained on the augmented training part (see ``text_aug``) and validated on
    the un-augmented held-out part.

    Parameters
    ----------
    rnd : int, default 1
        Multiplier for the KFold ``random_state`` (``2333 * rnd``).

    Returns
    -------
    tuple of four numpy.ndarray
        ``(train_pred, test_pred, best_val_train_pred, best_val_test_pred)``:
        out-of-fold class probabilities for the training rows and
        fold-averaged probabilities for the test rows, first from the model as
        it stands after the last epoch, then from the checkpoint with the
        lowest ``val_loss``.
    """
    FEAT_CNT = 3            # number of CV folds; also the divisor for averaging test predictions
    NUM_WORDS = 16000       # tokenizer vocabulary cap and embedding input size
    N = 12                  # embedding dimension, also used as the LSTM unit count
    MAX_LEN = 300           # sequences are padded/truncated to this length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'   # checkpoint file, rewritten whenever val_loss improves

    tmp_X = org_X
    tmp_Y = org_Y
    tmp_X_test = test_df['text']

    # Size the outputs from the data instead of hard-coding the row counts,
    # so the function also works on a subsample or a different dataset.
    n_train, n_test = len(tmp_X), len(tmp_X_test)
    train_pred, test_pred = np.zeros((n_train,NUM_CLASSES)),np.zeros((n_test,NUM_CLASSES))
    best_val_train_pred, best_val_test_pred = np.zeros((n_train,NUM_CLASSES)),np.zeros((n_test,NUM_CLASSES))

    # One-hot encode the author labels.
    lb = preprocessing.LabelBinarizer()
    tmp_Y = lb.fit_transform(tmp_Y)

    # The vocabulary is fitted once on all training texts and reused for every fold.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    # test
    final_test_x = tokenizer.texts_to_sequences(tmp_X_test)
    final_test_x = pad_sequences(final_test_x, maxlen=MAX_LEN)

    # train + valid
    all_train_x = tokenizer.texts_to_sequences(tmp_X)
    all_train_x = pad_sequences(all_train_x, maxlen=MAX_LEN)

    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # Augment only the training part; the validation part stays untouched.
        aug_X, aug_Y = text_aug(tmp_X[train_index],tmp_Y[train_index])
        val_X, val_Y = all_train_x[test_index],tmp_Y[test_index]

        curr_train_x = tokenizer.texts_to_sequences(aug_X)
        curr_train_x = pad_sequences(curr_train_x, maxlen=MAX_LEN)

        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        # return_sequences=True keeps the per-timestep outputs, which Flatten then concatenates.
        model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Keep the weights of the epoch with the lowest validation loss on disk.
        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        # shuffle=False: batches are fed in the order text_aug produced the samples.
        model.fit(curr_train_x, aug_Y,
                  validation_data=(val_X,val_Y),
                  batch_size=128, epochs=10,
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )

        # Features from the model as it stands after the last epoch.
        train_pred[test_index] = model.predict(val_X)
        test_pred += model.predict(final_test_x)/FEAT_CNT

        # Features from the best-val_loss checkpoint of this fold.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(val_X)
        best_val_test_pred += model.predict(final_test_x)/FEAT_CNT

        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def lstm done')
# Run the LSTM feature builder with rnd=1. The *1 arrays come from the model after
# its last epoch, the *2 arrays from the best-val_loss checkpoint.
lstm_train1,lstm_test1,lstm_train2,lstm_test2 = get_lstm_feats(1)

def lstm done
Train on 16145 samples, validate on 6527 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.81542, saving model to /tmp/nn_model.h5
 - 49s - loss: 1.0117 - acc: 0.4718 - val_loss: 0.8154 - val_acc: 0.6545
Epoch 2/10
Epoch 00002: val_loss improved from 0.81542 to 0.49112, saving model to /tmp/nn_model.h5
 - 48s - loss: 0.5651 - acc: 0.7664 - val_loss: 0.4911 - val_acc: 0.8065
Epoch 3/10
Epoch 00003: val_loss improved from 0.49112 to 0.45854, saving model to /tmp/nn_model.h5
 - 48s - loss: 0.2856 - acc: 0.8920 - val_loss: 0.4585 - val_acc: 0.8249
Epoch 4/10
Epoch 00004: val_loss did not improve
 - 47s - loss: 0.1796 - acc: 0.9356 - val_loss: 0.4856 - val_acc: 0.8266
Epoch 5/10
Epoch 00005: val_loss did not improve
 - 46s - loss: 0.1196 - acc: 0.9579 - val_loss: 0.5575 - val_acc: 0.8197
Epoch 6/10
Epoch 00006: val_loss did not improve
 - 46s - loss: 0.0917 - acc: 0.9694 - val_loss: 0.5956 - val_acc: 0.8149
Epoch 7/10
Epoch 00007: val_loss did not improve
 - 48s 

In [8]:
def get_nn_feats(rnd=1):
    """Build stacking features from an averaged-embedding classifier using K-fold CV.

    The model is Embedding -> GlobalAveragePooling1D -> Dense -> softmax.
    Reads the module-level ``org_X`` / ``org_Y`` (training texts and labels)
    and ``test_df['text']``. For each of ``FEAT_CNT`` folds a fresh model is
    trained on the augmented training part (see ``text_aug``) and validated on
    the un-augmented held-out part.

    Parameters
    ----------
    rnd : int, default 1
        Multiplier for the KFold ``random_state`` (``233 * rnd``).

    Returns
    -------
    tuple of four numpy.ndarray
        ``(train_pred, test_pred, best_val_train_pred, best_val_test_pred)``:
        out-of-fold class probabilities for the training rows and
        fold-averaged probabilities for the test rows, first from the model as
        it stands after the last epoch, then from the checkpoint with the
        lowest ``val_loss``.
    """
    FEAT_CNT = 3            # number of CV folds; also the divisor for averaging test predictions
    NUM_WORDS = 30000       # tokenizer vocabulary cap and embedding input size
    N = 10                  # embedding dimension
    MAX_LEN = 100           # sequences are padded/truncated to this length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'   # checkpoint file, rewritten whenever val_loss improves

    tmp_X = org_X
    tmp_Y = org_Y
    tmp_X_test = test_df['text']

    # Size the outputs from the data instead of hard-coding the row counts,
    # so the function also works on a subsample or a different dataset.
    n_train, n_test = len(tmp_X), len(tmp_X_test)
    train_pred, test_pred = np.zeros((n_train,NUM_CLASSES)),np.zeros((n_test,NUM_CLASSES))
    best_val_train_pred, best_val_test_pred = np.zeros((n_train,NUM_CLASSES)),np.zeros((n_test,NUM_CLASSES))

    # One-hot encode the author labels.
    lb = preprocessing.LabelBinarizer()
    tmp_Y = lb.fit_transform(tmp_Y)

    # The vocabulary is fitted once on all training texts and reused for every fold.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    # test
    final_test_x = tokenizer.texts_to_sequences(tmp_X_test)
    final_test_x = pad_sequences(final_test_x, maxlen=MAX_LEN)

    # train + valid
    all_train_x = tokenizer.texts_to_sequences(tmp_X)
    all_train_x = pad_sequences(all_train_x, maxlen=MAX_LEN)

    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # Augment only the training part; the validation part stays untouched.
        aug_X, aug_Y = text_aug(tmp_X[train_index],tmp_Y[train_index])
        val_X, val_Y = all_train_x[test_index],tmp_Y[test_index]

        curr_train_x = tokenizer.texts_to_sequences(aug_X)
        curr_train_x = pad_sequences(curr_train_x, maxlen=MAX_LEN)

        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Keep the weights of the epoch with the lowest validation loss on disk.
        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        # shuffle=False: batches are fed in the order text_aug produced the samples.
        model.fit(curr_train_x, aug_Y,
                  validation_data=(val_X,val_Y),
                  batch_size=64, epochs=15,
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )

        # Features from the model as it stands after the last epoch.
        train_pred[test_index] = model.predict(val_X)
        test_pred += model.predict(final_test_x)/FEAT_CNT

        # Features from the best-val_loss checkpoint of this fold.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(val_X)
        best_val_test_pred += model.predict(final_test_x)/FEAT_CNT

        # release
        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

# NOTE(review): the message says 'cnn' although this cell defines get_nn_feats;
# looks like a copy-paste leftover. Confirm before renaming, the recorded output shows it.
print('def cnn done')

# rnd=4 gives this model a different KFold seed (233*4) than the CNN run.
nn_train1,nn_test1,nn_train2,nn_test2 = get_nn_feats(4)



# Concatenate the per-model probability blocks column-wise, in the order
# LSTM, CNN, NN. Within each model the last-epoch block (*1) comes before the
# best-checkpoint block (*2). Train and test use the same column order.
all_nn_train = np.hstack([lstm_train1, lstm_train2, 
                          cnn_train1, cnn_train2,
                          nn_train1,nn_train2
                         ])
all_nn_test = np.hstack([lstm_test1, lstm_test2, 
                         cnn_test1, cnn_test2,
                         nn_test1,nn_test2
])

def cnn done
Train on 16191 samples, validate on 6527 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 1.05355, saving model to /tmp/nn_model.h5
 - 2s - loss: 1.0805 - acc: 0.4075 - val_loss: 1.0536 - val_acc: 0.4521
Epoch 2/15
Epoch 00002: val_loss improved from 1.05355 to 0.87596, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.9563 - acc: 0.5659 - val_loss: 0.8760 - val_acc: 0.5848
Epoch 3/15
Epoch 00003: val_loss improved from 0.87596 to 0.68718, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.7226 - acc: 0.6995 - val_loss: 0.6872 - val_acc: 0.7371
Epoch 4/15
Epoch 00004: val_loss improved from 0.68718 to 0.55316, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.5114 - acc: 0.8282 - val_loss: 0.5532 - val_acc: 0.7929
Epoch 5/15
Epoch 00005: val_loss improved from 0.55316 to 0.48710, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.3731 - acc: 0.8812 - val_loss: 0.4871 - val_acc: 0.8114
Epoch 6/15
Epoch 00006: val_loss improved from 0.48710 to 0.44878, saving model to

In [9]:
import pickle
# Persist the stacked features as a two-element list [train, test] in
# nn_feat.pkl, written to the current working directory.
with open('nn_feat.pkl','wb') as fout:
    pickle.dump([all_nn_train,all_nn_test],fout)