In [1]:
import pandas as pd
import numpy as np
# Load the train and test splits from the local input directory
# (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)

# Transform text

In [2]:
def trans_text(s):
    """Return ``s`` lower-cased, with selected punctuation split off by a space.

    An apostrophe that is followed by a space is surrounded by spaces first.
    Then each of ``. ; " ! ? , :`` that is directly followed by a space gets
    one extra space inserted in front of it.  The marks are processed one
    after another in that fixed order, so a space inserted for one mark can
    be matched by a mark processed later.  Punctuation that is not followed
    by a space (e.g. a period at the very end of the string) stays attached
    to the preceding word.
    """
    spaced_marks = ('. ', '; ', '" ', '! ', '? ', ', ', ': ')
    padded = s.replace("' ", " ' ")
    for mark in spaced_marks:
        padded = padded.replace(mark, ' ' + mark)
    return padded.lower()

# Print one sample row before and after the clean-up as a quick sanity check,
# normalising the text column of both frames in between.
print(train_df.loc[10, 'text'])
train_df['text'] = train_df['text'].map(trans_text)
test_df['text'] = test_df['text'].map(trans_text)
print(train_df.loc[10, 'text'])

He shall find that I can feel my injuries; he shall learn to dread my revenge" A few days after he arrived.
he shall find that i can feel my injuries ; he shall learn to dread my revenge " a few days after he arrived.


In [3]:
# add cnn feat
from keras.layers import Embedding, CuDNNLSTM, Dense, Flatten, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
import gc
print('import keras done')

Using TensorFlow backend.


import keras done


In [4]:
def get_cnn_feats(rnd=1):
    """Build cross-validated class-probability features with a small 1-D CNN.

    Reads the module-level ``train_df`` / ``test_df`` frames (``text`` and
    ``author`` columns), tokenises and pads the text, and trains one fresh
    model per fold of a shuffled ``FEAT_CNT``-fold split.  Each fold produces
    predictions from two models: the model as it stands after the last epoch,
    and the checkpoint with the lowest validation loss reloaded from
    ``MODEL_P``.

    Args:
        rnd: Multiplier for the ``KFold`` seed (``random_state=233*rnd``).

    Returns:
        Tuple ``(train_pred, test_pred, best_val_train_pred,
        best_val_test_pred)``.  The train arrays have shape
        ``(len(train_df), NUM_CLASSES)`` and hold, for each row, the
        prediction made by the fold in which that row was held out.  The test
        arrays have shape ``(len(test_df), NUM_CLASSES)`` and are the mean of
        the per-fold test predictions.  The first pair comes from the
        last-epoch models, the second from the best-``val_loss`` checkpoints.
    """
    FEAT_CNT = 5                    # number of CV folds
    NUM_WORDS = 30000               # tokenizer vocabulary cap / embedding input size
    N = 10                          # embedding dimension
    MAX_LEN = 100                   # padded sequence length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'    # checkpoint file, overwritten on every fold

    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']

    # Size the output buffers from the data rather than from hard-coded row
    # counts, so the function also works on a subsample or a different split.
    n_train, n_test = len(tmp_X), len(tmp_X_test)
    train_pred = np.zeros((n_train, NUM_CLASSES))
    test_pred = np.zeros((n_test, NUM_CLASSES))
    best_val_train_pred = np.zeros((n_train, NUM_CLASSES))
    best_val_test_pred = np.zeros((n_test, NUM_CLASSES))

    # The tokenizer is fitted on the training text only.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)

    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)

    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # Fresh model per fold: embedding -> conv -> average pool -> dense -> softmax.
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(Conv1D(16,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(16, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Keep only the epoch with the lowest validation loss on disk.
        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        # validation_split holds out 10% of this fold's training rows for
        # monitoring val_loss.
        model.fit(ttrain_x[train_index], ttrain_y[train_index],
                  validation_split=0.1,
                  batch_size=32, epochs=10,
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )

        # Predictions from the model as it stands after the last epoch.
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/FEAT_CNT

        # Predictions from the best-val_loss checkpoint of this fold.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/FEAT_CNT

        # release
        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def cnn done')

# Run the CNN feature builder with rnd=1.  Per the function's return order:
# cnn_train1 / cnn_test1 come from the last-epoch models,
# cnn_train2 / cnn_test2 from the best-val_loss checkpoints.
cnn_train1,cnn_test1,cnn_train2,cnn_test2 = get_cnn_feats(1)

def cnn done
Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.94816, saving model to /tmp/nn_model.h5
 - 2s - loss: 1.0577 - acc: 0.4384 - val_loss: 0.9482 - val_acc: 0.5552
Epoch 2/10
Epoch 00002: val_loss improved from 0.94816 to 0.69040, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.7869 - acc: 0.6284 - val_loss: 0.6904 - val_acc: 0.6886
Epoch 3/10
Epoch 00003: val_loss improved from 0.69040 to 0.54352, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.5316 - acc: 0.7812 - val_loss: 0.5435 - val_acc: 0.7728
Epoch 4/10
Epoch 00004: val_loss improved from 0.54352 to 0.49323, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.3550 - acc: 0.8710 - val_loss: 0.4932 - val_acc: 0.8130
Epoch 5/10
Epoch 00005: val_loss improved from 0.49323 to 0.49017, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2503 - acc: 0.9183 - val_loss: 0.4902 - val_acc: 0.8194
Epoch 6/10
Epoch 00006: val_loss did not improve
 - 1s - loss: 0.1858 - acc: 0.938

In [5]:
# add lstm feat
def get_lstm_feats(rnd=1):
    """Build cross-validated class-probability features with a small LSTM.

    Reads the module-level ``train_df`` / ``test_df`` frames (``text`` and
    ``author`` columns), tokenises and pads the text, and trains one fresh
    model per fold of a shuffled ``FEAT_CNT``-fold split.  Each fold produces
    predictions from two models: the model as it stands after the last epoch,
    and the checkpoint with the lowest validation loss reloaded from
    ``MODEL_P``.

    Args:
        rnd: Multiplier for the ``KFold`` seed (``random_state=233*rnd``).

    Returns:
        Tuple ``(train_pred, test_pred, best_val_train_pred,
        best_val_test_pred)``.  The train arrays have shape
        ``(len(train_df), NUM_CLASSES)`` and hold, for each row, the
        prediction made by the fold in which that row was held out.  The test
        arrays have shape ``(len(test_df), NUM_CLASSES)`` and are the mean of
        the per-fold test predictions.  The first pair comes from the
        last-epoch models, the second from the best-``val_loss`` checkpoints.
    """
    FEAT_CNT = 5                    # number of CV folds
    NUM_WORDS = 16000               # tokenizer vocabulary cap / embedding input size
    N = 12                          # embedding dimension and LSTM unit count
    MAX_LEN = 300                   # padded sequence length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'    # checkpoint file, overwritten on every fold

    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']

    # Size the output buffers from the data rather than from hard-coded row
    # counts, so the function also works on a subsample or a different split.
    n_train, n_test = len(tmp_X), len(tmp_X_test)
    train_pred = np.zeros((n_train, NUM_CLASSES))
    test_pred = np.zeros((n_test, NUM_CLASSES))
    best_val_train_pred = np.zeros((n_train, NUM_CLASSES))
    best_val_test_pred = np.zeros((n_test, NUM_CLASSES))

    # The tokenizer is fitted on the training text only.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)

    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)

    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # Fresh model per fold: embedding -> LSTM (all timesteps) -> flatten
        # -> dense -> softmax.
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Keep only the epoch with the lowest validation loss on disk.
        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        # validation_split holds out 10% of this fold's training rows for
        # monitoring val_loss.
        model.fit(ttrain_x[train_index], ttrain_y[train_index],
                  validation_split=0.1,
                  batch_size=128, epochs=10,
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )

        # Predictions from the model as it stands after the last epoch.
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/FEAT_CNT

        # Predictions from the best-val_loss checkpoint of this fold.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/FEAT_CNT

        # release
        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def lstm done')
# Run the LSTM feature builder with rnd=1.  Per the function's return order:
# lstm_train1 / lstm_test1 come from the last-epoch models,
# lstm_train2 / lstm_test2 from the best-val_loss checkpoints.
lstm_train1,lstm_test1,lstm_train2,lstm_test2 = get_lstm_feats(1)

def lstm done
Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.84670, saving model to /tmp/nn_model.h5
 - 40s - loss: 1.0256 - acc: 0.4708 - val_loss: 0.8467 - val_acc: 0.5852
Epoch 2/10
Epoch 00002: val_loss improved from 0.84670 to 0.52045, saving model to /tmp/nn_model.h5
 - 38s - loss: 0.6458 - acc: 0.7157 - val_loss: 0.5204 - val_acc: 0.7837
Epoch 3/10
Epoch 00003: val_loss improved from 0.52045 to 0.46248, saving model to /tmp/nn_model.h5
 - 38s - loss: 0.3423 - acc: 0.8727 - val_loss: 0.4625 - val_acc: 0.8124
Epoch 4/10
Epoch 00004: val_loss did not improve
 - 38s - loss: 0.2119 - acc: 0.9256 - val_loss: 0.4761 - val_acc: 0.8309
Epoch 5/10
Epoch 00005: val_loss did not improve
 - 38s - loss: 0.1467 - acc: 0.9492 - val_loss: 0.5089 - val_acc: 0.8290
Epoch 6/10
Epoch 00006: val_loss did not improve
 - 38s - loss: 0.1080 - acc: 0.9642 - val_loss: 0.5837 - val_acc: 0.8271
Epoch 7/10
Epoch 00007: val_loss did not improve
 - 38s 

In [11]:
def get_nn_feats(rnd=1):
    """Build cross-validated class-probability features with an averaged-embedding net.

    Reads the module-level ``train_df`` / ``test_df`` frames (``text`` and
    ``author`` columns), tokenises and pads the text, and trains one fresh
    model per fold of a shuffled ``FEAT_CNT``-fold split.  Each fold produces
    predictions from two models: the model as it stands after the last epoch,
    and the checkpoint with the lowest validation loss reloaded from
    ``MODEL_P``.

    Args:
        rnd: Multiplier for the ``KFold`` seed (``random_state=2331*rnd``).

    Returns:
        Tuple ``(train_pred, test_pred, best_val_train_pred,
        best_val_test_pred)``.  The train arrays have shape
        ``(len(train_df), NUM_CLASSES)`` and hold, for each row, the
        prediction made by the fold in which that row was held out.  The test
        arrays have shape ``(len(test_df), NUM_CLASSES)`` and are the mean of
        the per-fold test predictions.  The first pair comes from the
        last-epoch models, the second from the best-``val_loss`` checkpoints.
    """
    FEAT_CNT = 5                    # number of CV folds
    NUM_WORDS = 30000               # tokenizer vocabulary cap / embedding input size
    N = 10                          # embedding dimension
    MAX_LEN = 100                   # padded sequence length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'    # checkpoint file, overwritten on every fold

    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']

    # Size the output buffers from the data rather than from hard-coded row
    # counts, so the function also works on a subsample or a different split.
    n_train, n_test = len(tmp_X), len(tmp_X_test)
    train_pred = np.zeros((n_train, NUM_CLASSES))
    test_pred = np.zeros((n_test, NUM_CLASSES))
    best_val_train_pred = np.zeros((n_train, NUM_CLASSES))
    best_val_test_pred = np.zeros((n_test, NUM_CLASSES))

    # The tokenizer is fitted on the training text only.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)

    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    # Binarize the author labels exactly once.  An earlier version binarized
    # them and then fitted a second LabelBinarizer on the already-binarized
    # matrix, which only worked because indicator matrices pass through
    # unchanged.
    lb = preprocessing.LabelBinarizer()
    ttrain_y = lb.fit_transform(tmp_Y)

    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=2331*rnd)
    for train_index, test_index in kf.split(tmp_X):
        # Fresh model per fold: embedding -> average pool -> dense -> softmax.
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(50, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Keep only the epoch with the lowest validation loss on disk.
        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        # validation_split holds out 10% of this fold's training rows for
        # monitoring val_loss.
        model.fit(ttrain_x[train_index], ttrain_y[train_index],
                  validation_split=0.1,
                  batch_size=64, epochs=15,
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )

        # Predictions from the model as it stands after the last epoch.
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/FEAT_CNT

        # Predictions from the best-val_loss checkpoint of this fold.
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/FEAT_CNT

        # release
        del model
        gc.collect()
        print('------------------')

    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

# NOTE(review): the message says 'cnn' although this cell defines
# get_nn_feats -- presumably a copy-paste leftover; confirm before renaming.
print('def cnn done')

# Run the plain-NN feature builder with rnd=4 (KFold seed 2331*4).
# nn_train1 / nn_test1 come from the last-epoch models,
# nn_train2 / nn_test2 from the best-val_loss checkpoints.
nn_train1,nn_test1,nn_train2,nn_test2 = get_nn_feats(4)



# Join the six probability blocks column-wise, in the order LSTM, CNN, plain
# NN; within each model the last-epoch block comes before the best-checkpoint
# block.  Train and test use the same column order.
all_nn_train = np.concatenate(
    (lstm_train1, lstm_train2, cnn_train1, cnn_train2, nn_train1, nn_train2),
    axis=1,
)
all_nn_test = np.concatenate(
    (lstm_test1, lstm_test2, cnn_test1, cnn_test2, nn_test1, nn_test2),
    axis=1,
)

def cnn done
Train on 14096 samples, validate on 1567 samples
Epoch 1/15
Epoch 00001: val_loss improved from inf to 1.04838, saving model to /tmp/nn_model.h5
 - 3s - loss: 1.0796 - acc: 0.4047 - val_loss: 1.0484 - val_acc: 0.4601
Epoch 2/15
Epoch 00002: val_loss improved from 1.04838 to 0.84327, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.9460 - acc: 0.5522 - val_loss: 0.8433 - val_acc: 0.6165
Epoch 3/15
Epoch 00003: val_loss improved from 0.84327 to 0.64991, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.7044 - acc: 0.7246 - val_loss: 0.6499 - val_acc: 0.7549
Epoch 4/15
Epoch 00004: val_loss improved from 0.64991 to 0.52142, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.5012 - acc: 0.8296 - val_loss: 0.5214 - val_acc: 0.8060
Epoch 5/15
Epoch 00005: val_loss improved from 0.52142 to 0.45720, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.3725 - acc: 0.8746 - val_loss: 0.4572 - val_acc: 0.8168
Epoch 6/15
Epoch 00006: val_loss improved from 0.45720 to 0.42354, saving model to

Epoch 8/15
Epoch 00008: val_loss improved from 0.40737 to 0.39996, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.1943 - acc: 0.9398 - val_loss: 0.4000 - val_acc: 0.8468
Epoch 9/15
Epoch 00009: val_loss did not improve
 - 1s - loss: 0.1614 - acc: 0.9499 - val_loss: 0.4004 - val_acc: 0.8481
Epoch 10/15
Epoch 00010: val_loss did not improve
 - 1s - loss: 0.1358 - acc: 0.9601 - val_loss: 0.4060 - val_acc: 0.8519
Epoch 11/15
Epoch 00011: val_loss did not improve
 - 1s - loss: 0.1166 - acc: 0.9662 - val_loss: 0.4148 - val_acc: 0.8494
Epoch 12/15
Epoch 00012: val_loss did not improve
 - 1s - loss: 0.0990 - acc: 0.9721 - val_loss: 0.4237 - val_acc: 0.8481
Epoch 13/15
Epoch 00013: val_loss did not improve
 - 1s - loss: 0.0852 - acc: 0.9767 - val_loss: 0.4379 - val_acc: 0.8475
Epoch 14/15
Epoch 00014: val_loss did not improve
 - 1s - loss: 0.0740 - acc: 0.9804 - val_loss: 0.4591 - val_acc: 0.8430
Epoch 15/15
Epoch 00015: val_loss did not improve
 - 1s - loss: 0.0638 - acc: 0.9838 - val_loss: 

In [12]:
import pickle
# Persist the stacked features as a two-element list
# [all_nn_train, all_nn_test] in the current working directory.
with open('nn_feat.pkl','wb') as fout:
    pickle.dump([all_nn_train,all_nn_test],fout)