In [44]:
import tensorflow as tf
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import pandas as pd
import autoencoder
import model
from datetime import datetime
from datetime import timedelta

In [45]:
google = pd.read_csv('GOOG.csv')
eur_myr = pd.read_csv('eur-myr.csv')
usd_myr = pd.read_csv('usd-myr.csv')
oil = pd.read_csv('oil.csv')


In [46]:
google['oil_price'] = oil['Price']
google['oil_open'] = oil['Open']
google['oil_high'] = oil['High']
google['oil_low'] = oil['Low']
google['eur_myr'] = eur_myr['Unnamed: 1']
google['usd_myr'] = usd_myr['Unnamed: 1']

In [47]:
date_ori = pd.to_datetime(google.iloc[:, 0]).tolist()
google.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,oil_price,oil_open,oil_high,oil_low,eur_myr,usd_myr
0,2017-10-02,959.97998,962.539978,947.840027,953.27002,953.27002,1283400,54.27,54.26,54.39,54.22,4.926,4.226
1,2017-10-03,954.0,958.0,949.140015,957.789978,957.789978,888300,54.24,54.59,55.22,53.89,4.9232,4.232
2,2017-10-04,957.0,960.390015,950.690002,951.679993,951.679993,952400,54.38,54.08,54.85,53.93,4.9255,4.231
3,2017-10-05,955.48999,970.909973,955.179993,969.960022,969.960022,1213800,54.15,54.16,54.46,53.75,4.9239,4.238
4,2017-10-06,966.700012,979.460022,963.359985,978.890015,978.890015,1173900,53.9,52.8,54.2,52.25,4.9251,4.241


In [48]:
minmax = MinMaxScaler().fit(google.iloc[:, 4].values.reshape((-1,1)))
df_log = MinMaxScaler().fit_transform(google.iloc[:, 1:].astype('float32'))
df_log = pd.DataFrame(df_log)
df_log.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.094604,0.050227,0.0,0.021539,0.021539,0.092326,0.978389,0.938202,0.847145,1.0,0.033371,0.523804
1,0.0,0.0,0.01881,0.082768,0.082768,0.0,0.972495,1.0,1.0,0.935547,0.0,0.714279
2,0.047461,0.026442,0.041238,0.0,0.0,0.014979,1.0,0.904495,0.931859,0.943359,0.027409,0.682541
3,0.023572,0.142825,0.106207,0.247629,0.247629,0.076062,0.954814,0.919477,0.860036,0.908203,0.008343,0.904755
4,0.200917,0.237416,0.224568,0.368599,0.368599,0.066738,0.905698,0.664794,0.812155,0.615234,0.02264,1.0


In [49]:
thought_vector = autoencoder.reducedimension(df_log.values, 4, 0.001, 128, 100)

epoch: 10 loss: 0.27752203 time: 0.0004329681396484375
epoch: 20 loss: 0.2773618 time: 0.00043201446533203125
epoch: 30 loss: 0.2770898 time: 0.0004200935363769531
epoch: 40 loss: 0.27662754 time: 0.0004100799560546875
epoch: 50 loss: 0.27584228 time: 0.00041413307189941406
epoch: 60 loss: 0.27451512 time: 0.0004279613494873047
epoch: 70 loss: 0.27232823 time: 0.0004169940948486328
epoch: 80 loss: 0.26902166 time: 0.00041222572326660156
epoch: 90 loss: 0.2649478 time: 0.00041985511779785156
epoch: 100 loss: 0.26092687 time: 0.00038313865661621094




In [50]:
thought_vector.shape

(23, 4)

In [51]:
num_layers = 1
size_layer = 128
timestamp = 5
epoch = 500
dropout_rate = 0.1

In [52]:
tf.reset_default_graph()
modelnn = model.Model(0.01, num_layers, thought_vector.shape[1], size_layer, 1, dropout_rate)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
for i in range(epoch):
    init_value = np.zeros((1, num_layers * 2 * size_layer))
    total_loss = 0
    for k in range(0, (thought_vector.shape[0] // timestamp) * timestamp, timestamp):
        batch_x = np.expand_dims(thought_vector[k: k + timestamp, :], axis = 0)
        batch_y = df_log.values[k + 1: k + timestamp + 1, 3].reshape([-1, 1])
        last_state, _, loss = sess.run([modelnn.last_state, 
                                        modelnn.optimizer, 
                                        modelnn.cost], feed_dict={modelnn.X: batch_x, 
                                                                  modelnn.Y: batch_y, 
                                                                  modelnn.hidden_layer: init_value})
        init_value = last_state
        total_loss += loss
    total_loss /= (thought_vector.shape[0] // timestamp)
    if (i + 1) % 100 == 0:
        print('epoch:', i + 1, 'avg loss:', total_loss)

epoch: 100 avg loss: 0.15192137472331524
epoch: 200 avg loss: 0.05321213789284229
epoch: 300 avg loss: 0.06834725080989301
epoch: 400 avg loss: 0.031801844481378794
epoch: 500 avg loss: 0.04915535356849432


In [53]:
output_predict = np.zeros(((thought_vector.shape[0] // timestamp) * timestamp, 1))
init_value = np.zeros((1, num_layers * 2 * size_layer))
for k in range(0, (thought_vector.shape[0] // timestamp) * timestamp, timestamp):
    out_logits, last_state = sess.run([modelnn.logits, modelnn.last_state], feed_dict = {modelnn.X:np.expand_dims(thought_vector[k: k + timestamp, :], axis = 0),
                                     modelnn.hidden_layer: init_value})
    init_value = last_state
    output_predict[k: k + timestamp, :] = out_logits

In [54]:
print('Mean Square Error:', np.mean(np.square(output_predict[:, 0] - df_log.iloc[1: (thought_vector.shape[0] // timestamp) * timestamp + 1, 0].values)))

Mean Square Error: 0.026706518928794075


In [74]:
from itertools import product
from scipy import stats
    
Qs = range(0, 1)
qs = range(0, 2)
Ps = range(0, 2)
ps = range(0, 2)
D=1
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
best_aic = float("inf")
for param in parameters_list:
    try:
        arima=sm.tsa.statespace.SARIMAX(df_log.iloc[:,3].values, order=(param[0], D, param[1]), seasonal_order=(param[2], D, param[3], 1)).fit(disp=-1)
    except:
        continue
    aic = arima.aic
    if aic < best_aic and aic:
        best_arima = arima
        best_aic = aic
        
aic

NameError: name 'aic' is not defined

In [69]:
for param in parameters_list:
    try:
        arima=sm.tsa.statespace.SARIMAX(df_log.iloc[:,3].values, order=(param[0], D, param[1]), seasonal_order=(param[2], D, param[3], 1)).fit(disp=1)
    except:
        continue
    aic = arima.aic
    if aic < best_aic and aic:
        best_arima = arima
        best_aic = aic

In [70]:
def reverse_close(array):
    return minmax.inverse_transform(array.reshape((-1,1))).reshape((-1))

In [71]:
pred_arima = best_arima.predict()
x_range = np.arange(df_log.shape[0])
fig = plt.figure(figsize = (15,6))
ax = plt.subplot(111)
ax.plot(x_range, reverse_close(df_log.iloc[:,3].values), label = 'true Close')
ax.plot(x_range, reverse_close(pred_arima), label = 'predict Close using Arima')
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9])
ax.legend(loc = 'upper center', bbox_to_anchor= (0.5, -0.05), fancybox = True, shadow = True, ncol = 5)
plt.xticks(x_range[::5], date_ori[::5])
plt.title('overlap market Close')
plt.show()

NameError: name 'best_arima' is not defined