In [None]:
# coding: utf-8
import time
import os
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess

import keras
from keras import metrics
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.layers.embeddings import Embedding

from datetime import datetime
from collections import *
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Execute the helper notebooks so that whatever they define becomes available here.
# NOTE(review): EMBEDDING_DIM, MAX_WORD_COUNT, createEmbeddingMatrix and
# text_to_index are used by later cells but never defined in this notebook;
# presumably they come from these two notebooks - confirm.
%run common.ipynb
%run word2vec_loader.ipynb

In [None]:
# Load the pickled dataset and partition it into training and testing frames.
# Only unpickle files you trust: unpickling can execute arbitrary code.
df = pd.read_pickle('train_reduce.pkl')

# With shuffle=False the row order is preserved: the first returned piece is
# the leading ~20% of rows (used for testing) and the second piece is the
# trailing ~80% of rows (used for training).
test_df, train_df = train_test_split(df, test_size=0.8, shuffle=False)

row_counts = (len(df), len(train_df), len(test_df))
print('Rows of dataset=%d, training set rows=%s, testing set rows=%d' % row_counts)

In [None]:
# Generate word embedding matrix and word2idx dict
# createEmbeddingMatrix is not defined in this notebook (presumably provided by
# one of the %run helper notebooks - confirm). Its two results are used later as
# the frozen Embedding layer weights (embedding_matrix) and as the lookup passed
# to text_to_index (word2idx).
embedding_matrix, word2idx = createEmbeddingMatrix(EMBEDDING_DIM)

In [None]:
# Build the training inputs and targets.
# Targets come straight from the 'price' column of the training frame.
Y_train = train_df['price']

# Titles -> word-index sequences -> fixed-length padded array of MAX_WORD_COUNT columns.
x_texts_idx = text_to_index(train_df['ari_title'], word2idx)
X_train = x_texts_idx_pad = pad_sequences(x_texts_idx, maxlen=MAX_WORD_COUNT)
# Disabled alternative that prepends the 'ari_time' column as an extra feature:
# X_train = np.insert(x_texts_idx_pad, 0, train_df['ari_time'].values, axis=1)

# Every input row must have exactly one target.
assert len(X_train) == len(Y_train)
print(X_train.shape, Y_train.shape)

In [None]:
# Model definition
# Frozen embedding layer initialised from the pre-built embedding matrix;
# one row per vocabulary entry, EMBEDDING_DIM columns.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_WORD_COUNT,
    trainable=False,
)

# Stack: embedding -> three LSTM layers of shrinking width (each followed by
# dropout) -> two small ReLU dense layers -> single sigmoid output unit.
# Only the last LSTM collapses the sequence (no return_sequences).
# Earlier experiments (disabled): a single LSTM(16) with Dense(40)/Dense(20),
# and an extra leading LSTM(128, return_sequences=True) + Dropout(0.2).
model = Sequential([
    embedding_layer,
    LSTM(64, return_sequences=True),
    Dropout(0.25),
    LSTM(32, return_sequences=True),
    Dropout(0.25),
    LSTM(16),
    Dropout(0.25),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Optimizer notes: adam is used; an RMSprop variant was tried previously
# (default lr=0.001), e.g. keras.optimizers.RMSprop(lr=0.05).
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Training schedule: epochs run from init_epoch up to num_epochs.
init_epoch = 0
num_epochs = 200
num_batch_size = 100  # 100

# To resume from an intermediate checkpoint, set init_epoch to the saved epoch
# and reload that model:
# init_epoch = 98
# model_path = 'models_intermediate/epoch%04d.h5' % init_epoch
# print('Loading model', model_path)
# model = load_model(model_path)

# Model checkpoint: one file per epoch number, written only when the monitored
# metric improves (save_best_only=True, mode='max'). No validation data is
# passed to fit(), so 'acc' here is the training accuracy.
filepath = "models_intermediate/epoch{epoch:04d}.h5"
# Make sure the checkpoint directory exists; saving an .h5 file into a missing
# directory fails on Keras versions that do not create it themselves.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history = model.fit(x=X_train, y=Y_train, batch_size=num_batch_size,
                    initial_epoch=init_epoch, epochs=num_epochs,
                    callbacks=callbacks_list, verbose=1)

In [None]:
# Persist the final model. Create the target directory first so the save does
# not fail on a fresh checkout where 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
model.save('models/my_model.h5')

In [None]:
# Plot accuracy and loss chart
# The per-epoch curves are written to text files and read back for plotting.
# ndmin=1 keeps a single-epoch history one-dimensional (plain loadtxt would
# return a 0-d array, which has no shape[0]).
np_loss_history = np.array(history.history['loss'])
np.savetxt("loss_history.txt", np_loss_history, delimiter=",")
loss_history = np.loadtxt("loss_history.txt", ndmin=1)

# The accuracy metric is logged as 'acc' by older Keras releases and as
# 'accuracy' by newer ones; use whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
np_acc_history = np.array(history.history[acc_key])
np.savetxt("acc_history.txt", np_acc_history, delimiter=",")
acc_history = np.loadtxt("acc_history.txt", ndmin=1)

print("accuracy 準確度: ")
fig = plt.figure(1)
ax = plt.axes()
# One x position per recorded epoch, numbered from the first epoch actually
# trained, so a resumed run (init_epoch > 0) is labelled correctly.
x = np.arange(init_epoch + 1, init_epoch + 1 + acc_history.shape[0])
plt.plot(x, acc_history, '-r');  # solid red line
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()


print("loss 損失函數: ")
fig = plt.figure(2)
ax = plt.axes()
x = np.arange(init_epoch + 1, init_epoch + 1 + loss_history.shape[0])
plt.plot(x, loss_history, '-g');  # solid green line
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

In [None]:
##### Model Evaluation #####

In [None]:
# Score the model on the training data.
# n_first caps how many leading rows are evaluated; here it spans the full set.
n_first = len(X_train)
loss_accuracy = model.evaluate(X_train[:n_first], Y_train[:n_first], verbose=1)
# evaluate() yields the loss followed by the compiled metric (accuracy).
print('loss=%.4f, accuracy=%.4f' % tuple(loss_accuracy[:2]))

# Show the padded index matrix that was fed to the model.
print(X_train)

In [None]:
# Build the testing inputs and targets the same way as the training ones.
# Targets come from the 'price' column of the testing frame.
Y_test = test_df['price']

# Titles -> word-index sequences -> fixed-length padded array.
x_test_texts_idx = text_to_index(test_df['ari_title'], word2idx)
X_test = pad_sequences(x_test_texts_idx, maxlen=MAX_WORD_COUNT)

# Every input row must have exactly one target.
assert len(X_test) == len(Y_test)
print(X_test.shape, Y_test.shape)

In [None]:
# Score the model on the held-out testing data.
# n_first caps how many leading rows are evaluated; here it spans the full set.
n_first = len(X_test)
loss_accuracy = model.evaluate(X_test[:n_first], Y_test[:n_first], verbose=1)
# evaluate() yields the loss followed by the compiled metric (accuracy).
print('loss=%.4f, accuracy=%.4f' % tuple(loss_accuracy[:2]))