In [69]:
# load data
import numpy as np
import pandas as pd
from keras.layers import Embedding, LSTM, Dense, Flatten, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
import gc

# Read the train / test splits from the local input directory.
train_df, test_df = (
    pd.read_csv("./input/%s.csv" % split) for split in ("train", "test")
)

# Optional cleanup, currently disabled: replace every non-alphanumeric
# character with a space.
# train_df['text'] = train_df['text'].str.replace('[^a-zA-Z0-9]', ' ')
# test_df['text'] =test_df['text'].str.replace('[^a-zA-Z0-9]', ' ')

# Number of single-space-delimited tokens in each training text.
train_df['word_cnt'] = [len(str(text).split(' ')) for text in train_df['text']]
train_df['word_cnt'].describe()

count    19579.000000
mean        26.730477
std         19.048353
min          2.000000
25%         15.000000
50%         23.000000
75%         34.000000
max        861.000000
Name: word_cnt, dtype: float64

In [70]:
# Corpus size, then how many rows have fewer than 20 / fewer than 10 tokens.
print(
    len(train_df),
    *[len(train_df[train_df.word_cnt < cap]) for cap in (20, 10)],
    sep='\n'
)

19579
7672
2123


In [71]:
# Scratch check: str.find reports the first match position, str.rfind the last.
test_x = 'sdf .ksdfkl .'
print(*[locate('.') for locate in (test_x.find, test_x.rfind)])

def find_p(s):
    """Return the position of the first punctuation mark in ``s``.

    The marks looked for are ``. , : ; - * " ! ?``. ``None`` is returned
    when ``s`` contains none of them (including when ``s`` is empty).
    """
    marks = ('.', ',', ':', ';', '-', '*', '"', '!', '?')
    hits = (pos for pos, ch in enumerate(s) if ch in marks)
    return next(hits, None)

# Demo: cut a string just past its first punctuation mark and the
# character that follows it.
tmp_s = 'hello world. kkkk'
idx = find_p(tmp_s)
print(tmp_s[slice(idx + 2, None)])

4 12
kkkk


In [72]:
def text_aug(x,y,aug_cnt=3):
    """Augment a labelled text corpus with progressively truncated copies.

    Every original ``(text, label)`` pair is kept. Then, up to ``aug_cnt``
    times per sample, the text is cut just past its first punctuation mark
    (as located by ``find_p``) and the remainder is added with the same
    label, provided it still has at least 20 single-space-delimited tokens.
    Truncation of a sample stops as soon as no punctuation mark is left or
    the remainder becomes shorter than 20 tokens.

    Args:
        x: sequence of text strings.
        y: sequence of labels, parallel to ``x``.
        aug_cnt: maximum number of truncated copies generated per sample.

    Returns:
        ``(new_x, new_y)``: two parallel lists containing each original
        sample immediately followed by its truncated copies.

    Side effects:
        Prints the number of samples before and after augmentation.
    """
    new_x,new_y = [],[]
    print(len(x))
    for tmp_x,tmp_y in zip(x,y):
        new_x.append(tmp_x)
        new_y.append(tmp_y)

        for _ in range(aug_cnt):
            tmp_idx = find_p(tmp_x)
            # The guard must test the *index*: find_p returns None when no
            # punctuation is left, and `None + 2` would raise TypeError.
            if tmp_idx is None:
                break
            # Skip the punctuation mark and the character right after it
            # (normally the space that separates sentences).
            tmp_x = tmp_x[tmp_idx+2:]
            if len(tmp_x.split(' ')) < 20:
                # Remainder too short to be a useful sample; any further
                # truncation would only be shorter.
                break
            new_x.append(tmp_x)
            new_y.append(tmp_y)
    print(len(new_x))
    return new_x,new_y
            

In [73]:
def get_cnn_feats():
    """Train a small embedding + average-pooling classifier and return its predictions.

    Reads the module-level ``train_df`` / ``test_df`` frames, augments the
    training texts with ``text_aug``, tokenizes and pads both splits, fits
    the network for 20 epochs while checkpointing the best-``val_loss``
    model to ``MODEL_P``, then reloads that checkpoint and predicts with it.

    Returns:
        ``(train_pred, test_pred)``: softmax outputs of the reloaded model
        for the (augmented) training sequences and for the test sequences.

    Side effects:
        Prints the model summary, the Keras training log and the training
        log-loss; writes the checkpoint file at ``MODEL_P``.

    NOTE(review): ``train_pred`` has one row per *augmented* sample (the
    output of ``text_aug``), not one row per row of ``train_df``, and the
    predictions are in-sample. Confirm this is what downstream consumers
    of these "feats" expect.
    """
    NUM_WORDS = 30000      # vocabulary size kept by the tokenizer / embedding
    N = 10                 # embedding dimension
    MAX_LEN = 150          # sequences are padded / truncated to this length
    NUM_CLASSES = 3
    MODEL_P = '/tmp/lstm.h5'   # checkpoint path for the best model

    X = train_df['text'].values
    Y = train_df['author'].values
    X_test = test_df['text'].values

    # Add truncated copies of the training texts (up to 10 per sample).
    X,Y = text_aug(X,Y,10)

    # The vocabulary is learned from the augmented training texts only.
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(X)

    train_x = tokenizer.texts_to_sequences(X)
    train_x = pad_sequences(train_x, maxlen=MAX_LEN)

    test_x = tokenizer.texts_to_sequences(X_test)
    test_x = pad_sequences(test_x, maxlen=MAX_LEN)

    # One-hot encode the author labels.
    lb = preprocessing.LabelBinarizer()
    lb.fit(Y)

    train_y = lb.transform(Y)

    # Seed NumPy *before* any layer is created. Previously the seed was set
    # only right before fit(), i.e. after the layers had been constructed,
    # and with shuffle=False nothing stochastic was left for it to govern.
    # NOTE(review): whether layer initialisation draws from np.random
    # depends on the Keras version/backend - confirm for this environment.
    np.random.seed(42)

    model = Sequential()
    model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
#     model.add(Conv1D(16,
#                      3,
#                      padding='valid',
#                      activation='relu',
#                      strides=1))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(30, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(NUM_CLASSES, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # Keep only the weights of the epoch with the lowest validation loss.
    model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
    model.fit(train_x, train_y,
              validation_split=0.1,
              batch_size=64, epochs=20,
              verbose=2,
              callbacks=[model_chk],
              shuffle=False
             )

    # Predict with the best checkpoint rather than the last-epoch weights.
    model = load_model(MODEL_P)
    train_pred = model.predict(train_x)
    test_pred = model.predict(test_x)
    del model
    gc.collect()
    # In-sample log-loss over all augmented training rows.
    print(log_loss(train_y,train_pred))
    return train_pred,test_pred

print('def cnn done')

def cnn done


In [74]:
# Train the model and collect its predictions for the train and test texts.
cnn_train, cnn_test = get_cnn_feats()

19579
39387
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 150, 10)           300000    
_________________________________________________________________
global_average_pooling1d_11  (None, 10)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 30)                330       
_________________________________________________________________
dropout_11 (Dropout)         (None, 30)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 93        
Total params: 300,423
Trainable params: 300,423
Non-trainable params: 0
_________________________________________________________________
Train on 35448 samples, validate on 3939 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 1.03565, saving model