In [5]:
import pandas as pd
import numpy as np
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# trans text

In [6]:
def trans_text(s):
    s = s.replace("' ", " ' ")
    p = ['. ', '; ', '" ', '! ', '? ', ', ',': ']
    for tmp_p in p:
        s=s.replace(tmp_p,' {}'.format(tmp_p))
    s = s.lower()
    words = s.split(' ')
    word_cnt = len(words)
    new_words = []
    for i in range(word_cnt-1):
        new_words.append('{}--{}'.format(words[i],words[i+1]))
    return ' '.join(new_words)

print(train_df['text'][10])
train_df['text'] = train_df['text'].apply(trans_text)
test_df['text'] = test_df['text'].apply(trans_text)
print(train_df['text'][10])

He shall find that I can feel my injuries; he shall learn to dread my revenge" A few days after he arrived.
he--shall shall--find find--that that--i i--can can--feel feel--my my--injuries injuries--; ;--he he--shall shall--learn learn--to to--dread dread--my my--revenge revenge--" "--a a--few few--days days--after after--he he--arrived.


In [7]:
# add cnn feat
from keras.layers import Embedding, CuDNNLSTM, Dense, Flatten, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import gc
print('import keras done')

Using TensorFlow backend.


import keras done


In [8]:
def get_cnn_feats(rnd=1):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    best_val_train_pred, best_val_test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    FEAT_CNT = 5
    NUM_WORDS = 30000
    N = 10
    MAX_LEN = 100
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'
    
    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']
    
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)
    
    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)
    
    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # prepare aug train, val
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(Conv1D(16,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(16, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        #model.summary()

        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        model.fit(ttrain_x[train_index], ttrain_y[train_index], 
                  validation_split=0.1,
                  batch_size=32, epochs=10, 
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )
 
        # save feat
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/FEAT_CNT
        
        # best val model
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/FEAT_CNT
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def cnn done')

cnn_train1,cnn_test1,cnn_train2,cnn_test2 = get_cnn_feats(1)

def cnn done
Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.80816, saving model to /tmp/nn_model.h5
 - 2s - loss: 1.0086 - acc: 0.4860 - val_loss: 0.8082 - val_acc: 0.6784
Epoch 2/10
Epoch 00002: val_loss improved from 0.80816 to 0.51656, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.6132 - acc: 0.7602 - val_loss: 0.5166 - val_acc: 0.7996
Epoch 3/10
Epoch 00003: val_loss improved from 0.51656 to 0.44420, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.3716 - acc: 0.8734 - val_loss: 0.4442 - val_acc: 0.8168
Epoch 4/10
Epoch 00004: val_loss did not improve
 - 2s - loss: 0.2524 - acc: 0.9166 - val_loss: 0.4457 - val_acc: 0.8226
Epoch 5/10
Epoch 00005: val_loss did not improve
 - 1s - loss: 0.1791 - acc: 0.9437 - val_loss: 0.4683 - val_acc: 0.8232
Epoch 6/10
Epoch 00006: val_loss did not improve
 - 1s - loss: 0.1333 - acc: 0.9595 - val_loss: 0.5136 - val_acc: 0.8232
Epoch 7/10
Epoch 00007: val_loss did not improve
 - 1s - loss: 

In [9]:
# add lstm feat
def get_lstm_feats(rnd=1):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    best_val_train_pred, best_val_test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    FEAT_CNT = 5
    NUM_WORDS = 16000
    N = 12
    MAX_LEN = 300
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'
    
    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']
    
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)
    
    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)
    
    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # prepare aug train, val
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        #model.summary()

        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        model.fit(ttrain_x[train_index], ttrain_y[train_index], 
                  validation_split=0.1,
                  batch_size=128, epochs=10, 
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )
 
        # save feat
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/FEAT_CNT
        
        # best val model
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/FEAT_CNT
        
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def lstm done')
lstm_train1,lstm_test1,lstm_train2,lstm_test2 = get_lstm_feats(1)

def lstm done
Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.73654, saving model to /tmp/nn_model.h5
 - 40s - loss: 0.9829 - acc: 0.5041 - val_loss: 0.7365 - val_acc: 0.6911
Epoch 2/10
Epoch 00002: val_loss improved from 0.73654 to 0.48577, saving model to /tmp/nn_model.h5
 - 39s - loss: 0.5336 - acc: 0.7875 - val_loss: 0.4858 - val_acc: 0.8015
Epoch 3/10
Epoch 00003: val_loss improved from 0.48577 to 0.47234, saving model to /tmp/nn_model.h5
 - 39s - loss: 0.2979 - acc: 0.8903 - val_loss: 0.4723 - val_acc: 0.8105
Epoch 4/10
Epoch 00004: val_loss did not improve
 - 39s - loss: 0.1934 - acc: 0.9321 - val_loss: 0.5211 - val_acc: 0.8060
Epoch 5/10
Epoch 00005: val_loss did not improve
 - 39s - loss: 0.1380 - acc: 0.9513 - val_loss: 0.5457 - val_acc: 0.8168
Epoch 6/10
Epoch 00006: val_loss did not improve
 - 40s - loss: 0.1115 - acc: 0.9609 - val_loss: 0.6049 - val_acc: 0.8105
Epoch 7/10
Epoch 00007: val_loss did not improve
 - 40s 

In [10]:
def get_nn_feats(rnd=1):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    best_val_train_pred, best_val_test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    FEAT_CNT = 5
    NUM_WORDS = 30000
    N = 10
    MAX_LEN = 100
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'
    
    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']
    
    lb = preprocessing.LabelBinarizer()
    tmp_Y = lb.fit_transform(tmp_Y)
    
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)
    
    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)
    
    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)
    
    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=2331*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # prepare aug train, val      
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(50, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        #model.summary()

        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        model.fit(ttrain_x[train_index], ttrain_y[train_index], 
                  validation_split=0.1,
                  batch_size=64, epochs=15, 
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )
 
        # save feat
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/FEAT_CNT
        
        # best val model
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/FEAT_CNT
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def cnn done')

nn_train1,nn_test1,nn_train2,nn_test2 = get_nn_feats(4)



all_nn_train = np.hstack([lstm_train1, lstm_train2, 
                          cnn_train1, cnn_train2,
                          nn_train1,nn_train2
                         ])
all_nn_test = np.hstack([lstm_test1, lstm_test2, 
                         cnn_test1, cnn_test2,
                         nn_test1,nn_test2
])

def cnn done
Train on 14096 samples, validate on 1567 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 0.99401, saving model to /tmp/nn_model.h5
 - 3s - loss: 1.0648 - acc: 0.4279 - val_loss: 0.9940 - val_acc: 0.5488
Epoch 2/15
Epoch 00002: val_loss improved from 0.99401 to 0.72789, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.8420 - acc: 0.6427 - val_loss: 0.7279 - val_acc: 0.7052
Epoch 3/15
Epoch 00003: val_loss improved from 0.72789 to 0.55303, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.5654 - acc: 0.8021 - val_loss: 0.5530 - val_acc: 0.7843
Epoch 4/15
Epoch 00004: val_loss improved from 0.55303 to 0.47290, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.3903 - acc: 0.8707 - val_loss: 0.4729 - val_acc: 0.8149
Epoch 5/15
Epoch 00005: val_loss improved from 0.47290 to 0.43808, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2901 - acc: 0.9054 - val_loss: 0.4381 - val_acc: 0.8302
Epoch 6/15
Epoch 00006: val_loss improved from 0.43808 to 0.42626, saving model to

Epoch 12/15
Epoch 00012: val_loss did not improve
 - 1s - loss: 0.0656 - acc: 0.9833 - val_loss: 0.5375 - val_acc: 0.8092
Epoch 13/15
Epoch 00013: val_loss did not improve
 - 1s - loss: 0.0557 - acc: 0.9860 - val_loss: 0.5603 - val_acc: 0.8086
Epoch 14/15
Epoch 00014: val_loss did not improve
 - 1s - loss: 0.0487 - acc: 0.9874 - val_loss: 0.5859 - val_acc: 0.8079
Epoch 15/15
Epoch 00015: val_loss did not improve
 - 1s - loss: 0.0406 - acc: 0.9903 - val_loss: 0.6099 - val_acc: 0.8054
------------------
Train on 14097 samples, validate on 1567 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 1.00504, saving model to /tmp/nn_model.h5
 - 3s - loss: 1.0676 - acc: 0.4219 - val_loss: 1.0050 - val_acc: 0.5482
Epoch 2/15
Epoch 00002: val_loss improved from 1.00504 to 0.68632, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.8187 - acc: 0.6811 - val_loss: 0.6863 - val_acc: 0.7422
Epoch 3/15
Epoch 00003: val_loss improved from 0.68632 to 0.53039, saving model to /tmp/nn_model.h5
 - 1

In [11]:
import pickle
with open('nn_2gram_feat.pkl','wb') as fout:
    pickle.dump([all_nn_train,all_nn_test],fout)