In [30]:
# load data
import numpy as np
import pandas as pd
from keras.layers import Embedding, LSTM, Dense, Flatten, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss

# Read the train and test splits from the local input directory in one pass.
train_df, test_df = map(pd.read_csv, ("./input/train.csv", "./input/test.csv"))

# Optional cleanup (currently disabled): replace every non-alphanumeric character with a space.
# train_df['text'] = train_df['text'].str.replace('[^a-zA-Z0-9]', ' ')
# test_df['text'] =test_df['text'].str.replace('[^a-zA-Z0-9]', ' ')



In [31]:
def get_cnn_feats():
    """Train a small 1-D CNN author classifier and return its class probabilities.

    Reads the module-level ``train_df`` and ``test_df`` frames (``text`` and
    ``author`` columns), tokenizes and pads the text, fits the network with
    ``validation_split=0.1``, then reloads the checkpoint with the lowest
    validation loss before predicting.

    Returns:
        tuple: ``(train_pred, test_pred)`` -- predicted class probabilities for
        every training row and every test row. Columns follow the class order
        of the ``LabelBinarizer`` fitted on ``train_df['author']``.

    Side effects:
        Seeds NumPy's global RNG with 42, writes the best checkpoint to
        ``/tmp/cnn.h5``, and prints the model summary and the log-loss of the
        predictions on the full training set.
    """
    NUM_WORDS = 30000   # vocabulary cap for the tokenizer and embedding table
    N = 10              # embedding dimension
    MAX_LEN = 150       # sequences are padded/truncated to this many tokens
    NUM_CLASSES = 3
    # Dedicated checkpoint file. The previous value ('/tmp/lstm.h5') was the
    # same path get_lstm_feats() writes to, so the two models overwrote each
    # other's checkpoint and a missed save could silently load the wrong one.
    MODEL_P = '/tmp/cnn.h5'

    X = train_df['text']
    Y = train_df['author']
    X_test = test_df['text']

    # The vocabulary is learned from the training text only.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(X)

    train_x = tokenizer.texts_to_sequences(X)
    train_x = pad_sequences(train_x, maxlen=MAX_LEN)

    test_x = tokenizer.texts_to_sequences(X_test)
    test_x = pad_sequences(test_x, maxlen=MAX_LEN)

    # Binarized author labels, matching the softmax + categorical_crossentropy head.
    lb = preprocessing.LabelBinarizer()
    lb.fit(Y)

    train_y = lb.transform(Y)

    # Seed before the model is built rather than just before fit().
    # NOTE(review): Keras 2.x backends appear to draw unseeded layer seeds from
    # NumPy's global RNG at construction time -- confirm for the installed version.
    np.random.seed(42)

    model = Sequential()
    model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
    model.add(Conv1D(16,
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(NUM_CLASSES, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # Keep only the epoch with the best validation loss on disk.
    model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
    model.fit(train_x, train_y,
              validation_split=0.1,
              batch_size=64, epochs=10,
              verbose=2,
              callbacks=[model_chk],
              shuffle=False
             )

    # Predict with the best checkpoint, not the weights from the final epoch.
    model = load_model(MODEL_P)
    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)
    # This score is computed on all training rows, validation slice included.
    print(log_loss(train_y,train_pred))
    return train_pred,test_pred

# Log that the CNN helper is defined, then compute its train/test probabilities.
print('def cnn done')
(cnn_train, cnn_test) = get_cnn_feats()

def cnn done
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 150, 10)           300000    
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 148, 16)           496       
_________________________________________________________________
global_average_pooling1d_26  (None, 16)                0         
_________________________________________________________________
dense_54 (Dense)             (None, 16)                272       
_________________________________________________________________
dropout_28 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_55 (Dense)             (None, 3)                 51        
Total params: 300,819
Trainable params: 300,819
Non-trainable params: 0
_________________________________________________________

In [4]:
def get_lstm_feats():
    """Train a small LSTM author classifier and return its class probabilities.

    Reads the module-level ``train_df`` and ``test_df`` frames (``text`` and
    ``author`` columns), tokenizes and pads the text, fits the network with
    ``validation_split=0.1``, then reloads the checkpoint with the lowest
    validation loss before predicting.

    Returns:
        tuple: ``(train_pred, test_pred)`` -- predicted class probabilities for
        every training row and every test row. Columns follow the class order
        of the ``LabelBinarizer`` fitted on ``train_df['author']``.

    Side effects:
        Seeds NumPy's global RNG with 42, writes the best checkpoint to
        ``/tmp/lstm.h5``, and prints the model summary and the log-loss of the
        predictions on the full training set.
    """
    NUM_WORDS = 10000   # vocabulary cap for the tokenizer and embedding table
    N = 10              # embedding dimension and number of LSTM units
    MAX_LEN = 200       # sequences are padded/truncated to this many tokens
    NUM_CLASSES = 3
    MODEL_P = '/tmp/lstm.h5'

    X = train_df['text']
    Y = train_df['author']
    X_test = test_df['text']

    # The vocabulary is learned from the training text only.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(X)

    train_x = tokenizer.texts_to_sequences(X)
    train_x = pad_sequences(train_x, maxlen=MAX_LEN)

    test_x = tokenizer.texts_to_sequences(X_test)
    test_x = pad_sequences(test_x, maxlen=MAX_LEN)

    # Binarized author labels, matching the softmax + categorical_crossentropy head.
    lb = preprocessing.LabelBinarizer()
    lb.fit(Y)

    train_y = lb.transform(Y)

    # Seed NumPy before building and fitting so repeated runs are comparable,
    # as get_cnn_feats() does; this function previously set no seed at all.
    # NOTE(review): Keras 2.x backends appear to draw unseeded layer seeds from
    # NumPy's global RNG at construction time -- confirm for the installed version.
    np.random.seed(42)

    model = Sequential()
    model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
    # return_sequences=True keeps one output per timestep; Flatten then feeds
    # all of them to the softmax layer.
    model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(NUM_CLASSES, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # Keep only the epoch with the best validation loss on disk.
    model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
    model.fit(train_x, train_y,
              validation_split=0.1,
              batch_size=64, epochs=6,
              verbose=2,
              callbacks=[model_chk]
             )

    # Predict with the best checkpoint, not the weights from the final epoch.
    model = load_model(MODEL_P)
    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)
    # This score is computed on all training rows, validation slice included.
    print(log_loss(train_y,train_pred))
    return train_pred,test_pred

# Log that the LSTM helper is defined, then compute its train/test probabilities.
print('def lstm done')

(lstm_train, lstm_test) = get_lstm_feats()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 10)           100000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 10)           840       
_________________________________________________________________
flatten_1 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 2000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 6003      
Total params: 106,843
Trainable params: 106,843
Non-trainable params: 0
_________________________________________________________________
Train on 17621 samples, validate on 1958 samples
Epoch 1/6


KeyboardInterrupt: 