In [1]:
# load data
import numpy as np
import pandas as pd
from keras.layers import Embedding, LSTM, Dense, Flatten, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
import gc

# Raw competition data: the labelled training rows and the unlabelled test rows,
# both read from the local ./input directory.
train_df = pd.read_csv(filepath_or_buffer="./input/train.csv")
test_df = pd.read_csv(filepath_or_buffer="./input/test.csv")

# replace
# train_df['text'] = train_df['text'].str.replace('[^a-zA-Z0-9]', ' ')
# test_df['text'] =test_df['text'].str.replace('[^a-zA-Z0-9]', ' ')



Using TensorFlow backend.


In [None]:
def get_cnn_feats(seed=42, model_path='/tmp/lstm.h5', epochs=10, batch_size=64):
    """Train a small 1-D CNN text classifier and return its class probabilities.

    Reads the module-level ``train_df`` and ``test_df`` frames (the ``text``
    column of both, plus the ``author`` labels of ``train_df``), tokenises the
    text, trains an Embedding -> Conv1D -> GlobalAveragePooling1D -> Dense
    network, then reloads the checkpoint saved by ``ModelCheckpoint`` and
    predicts with it.

    Args:
        seed: Value passed to ``np.random.seed`` before the model is built.
            ``None`` skips seeding.
        model_path: File where ``ModelCheckpoint`` stores the best model
            (monitored on ``val_loss``); the file is overwritten.
        epochs: Number of training epochs passed to ``model.fit``.
        batch_size: Mini-batch size passed to ``model.fit``.

    Returns:
        Tuple ``(train_pred, test_pred)``: the ``model.predict`` outputs for
        the padded training and test sequences. The final layer is a softmax
        over ``NUM_CLASSES`` units, so each row holds per-class probabilities.

    Side effects:
        Prints the model summary, the training log and the log loss on the
        training predictions; writes the checkpoint to ``model_path``.
    """
    NUM_WORDS = 30000   # vocabulary cap for the tokenizer and embedding table
    N = 10              # embedding dimension
    MAX_LEN = 150       # every sequence is padded/truncated to this length
    NUM_CLASSES = 3     # width of the softmax output layer

    X = train_df['text']
    Y = train_df['author']
    X_test = test_df['text']

    # The vocabulary is learned from the training text only; the test text is
    # encoded with that same vocabulary.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(X)

    train_x = tokenizer.texts_to_sequences(X)
    train_x = pad_sequences(train_x, maxlen=MAX_LEN)

    test_x = tokenizer.texts_to_sequences(X_test)
    test_x = pad_sequences(test_x, maxlen=MAX_LEN)

    # One-hot encode the author labels for categorical cross-entropy.
    lb = preprocessing.LabelBinarizer()
    lb.fit(Y)

    train_y = lb.transform(Y)

    # Seed BEFORE the layers are created. Previously the seed was set only
    # right before fit(), i.e. after the model had already been built, and
    # with shuffle=False nothing afterwards consumed NumPy's RNG, so the seed
    # had no effect on reproducibility.
    # NOTE(review): how much of Keras' randomness derives from NumPy's global
    # RNG depends on the Keras/backend version; full determinism may also
    # need a backend-level seed.
    if seed is not None:
        np.random.seed(seed)

    model = Sequential()
    model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
    model.add(Conv1D(16,
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(NUM_CLASSES, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # Keep only the weights from the epoch with the best validation loss.
    model_chk = ModelCheckpoint(filepath=model_path, monitor='val_loss', save_best_only=True, verbose=1)
    model.fit(train_x, train_y,
              validation_split=0.1,
              batch_size=batch_size, epochs=epochs,
              verbose=2,
              callbacks=[model_chk],
              shuffle=False
             )

    # Discard the last-epoch weights in favour of the best checkpoint on disk.
    model = load_model(model_path)
    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)
    del model
    gc.collect()
    # Computed over every training row passed to fit(), so it includes the
    # rows that validation_split held out for validation.
    print(log_loss(train_y,train_pred))
    return train_pred,test_pred

# Progress marker: printed once this cell has run, i.e. after get_cnn_feats is defined.
print('def cnn done')

In [2]:
# Train the CNN and keep its predictions for the training and test texts
# (get_cnn_feats returns them in that order).
cnn_train,cnn_test = get_cnn_feats()

def cnn done
