In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Locate the data
data = '../Nikkei Std.csv'
# Read data with pandas
dataset = pd.read_csv(data)
# Extract the values of the desired column
full_set = dataset['NIK_Returns_Rolling_Std'].values
dataset

Unnamed: 0.1,Unnamed: 0,Date,NIKKEI_225,NIK_Returns_Rolling_Std
0,0,1979-12-28,0.622472,
1,1,1980-01-04,-0.141717,
2,2,1980-01-07,-0.114483,
3,3,1980-01-08,0.162223,
4,4,1980-01-09,0.435000,
...,...,...,...,...
10113,10113,2021-02-08,2.117191,0.959234
10114,10114,2021-02-09,0.399577,0.958767
10115,10115,2021-02-10,0.193182,0.953750
10116,10116,2021-02-12,-0.144977,0.954752


In [3]:
# We split our data on the 7200 mark as it represents 70% of it
train_data = full_set[5061:9061]
test_data = full_set[9061:]
# dataset.iloc[5061]

In [4]:
# Data Normalization
from sklearn.preprocessing import MinMaxScaler
# Data must be reshaped in order to use fit_transform
training_set = train_data.reshape(-1,1)
test_set = test_data.reshape(-1,1)

sc = MinMaxScaler(feature_range=(0,1))

X_train_scaled = sc.fit_transform(training_set)

In [5]:
# Create lists for x and y train
X_train = []

y_train = []

# We will run our model on 100 timesteps
for i in range(20,4000):
    
    X_train.append(X_train_scaled[i- 20:i,0])

    y_train.append(X_train_scaled[i, 0])

X_train, y_train = np.array(X_train), np.array(y_train)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))


In [6]:
# Import the necessary models from keras within tensor flow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
import tensorflow as tf

In [7]:
# Create a callback function with keras, which will monitor thr rmse of each epoch 
# And stop our model on the epoch that offers the best balance so we don't overfit our model
callback = tf.keras.callbacks.EarlyStopping(monitor='root_mean_squared_error', patience=3)

In [8]:
# Create the layers of our model droping out and replenishing 20% of our data after each layer
model = Sequential()
model.add(LSTM(units=50,return_sequences=True,input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1))
# Include a printout of the model summary 
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 20, 50)            10400     
_________________________________________________________________
dropout (Dropout)            (None, 20, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 50)            20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 50)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 20, 50)            20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 20, 50)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                2

In [9]:
model.compile(optimizer='adam',loss= 'mean_squared_error',
             metrics=[tf.keras.metrics.RootMeanSquaredError()])
# While we git our model we assign it to a variable 
history = model.fit(X_train,y_train,epochs=100,batch_size=100,callbacks=[callback], verbose = 2)

Epoch 1/100
40/40 - 12s - loss: 0.0080 - root_mean_squared_error: 0.0897
Epoch 2/100
40/40 - 2s - loss: 0.0028 - root_mean_squared_error: 0.0526
Epoch 3/100
40/40 - 2s - loss: 0.0023 - root_mean_squared_error: 0.0483
Epoch 4/100
40/40 - 2s - loss: 0.0020 - root_mean_squared_error: 0.0445
Epoch 5/100
40/40 - 2s - loss: 0.0015 - root_mean_squared_error: 0.0393
Epoch 6/100
40/40 - 2s - loss: 0.0017 - root_mean_squared_error: 0.0411
Epoch 7/100
40/40 - 2s - loss: 0.0012 - root_mean_squared_error: 0.0342
Epoch 8/100
40/40 - 2s - loss: 0.0011 - root_mean_squared_error: 0.0336
Epoch 9/100
40/40 - 2s - loss: 0.0010 - root_mean_squared_error: 0.0318
Epoch 10/100
40/40 - 2s - loss: 9.9480e-04 - root_mean_squared_error: 0.0315
Epoch 11/100
40/40 - 2s - loss: 9.6625e-04 - root_mean_squared_error: 0.0311
Epoch 12/100
40/40 - 2s - loss: 9.4086e-04 - root_mean_squared_error: 0.0307
Epoch 13/100
40/40 - 2s - loss: 8.7459e-04 - root_mean_squared_error: 0.0296
Epoch 14/100
40/40 - 2s - loss: 8.6056e-04 

In [10]:
# Print out the number of epochs used for this model
print(len(history.history['root_mean_squared_error']))

34


In [11]:
# Making Predictions to the test set
dataset_total = np.concatenate((train_data, test_data), axis = 0)
# Prepare our data to the test
inputs = dataset_total[len(dataset_total) - len(test_data) - 100:]
# Reshape our data to fit the upcoming for loop
inputs = inputs.reshape(-1,1)
inputs = sc.transform(inputs)

# Create lists to which we'll append x and y values for future test
X_test = []
y_test = []

# Create a loop that will run through groups of 200 timesteps for the test sample
for i in range(20, 1157):
    X_test.append(inputs[i-20:i,0])
    y_test.append(inputs[i, 0])

In [12]:
# Turn our lists into a numpy numbers array
X_test = np.array(X_test)
y_test = np.array(y_test)

# Reshape our array into 3 dimensions so it can fit the LSTM Model
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

y_test = y_test.reshape(-1, 1)

In [13]:
 model_loss, rmse = model.evaluate(
    X_test, y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Root Mean Square Error:{rmse}")

36/36 - 3s - loss: 3.0143e-04 - root_mean_squared_error: 0.0174
Normal Neural Network - Loss: 0.00030143227195367217, Root Mean Square Error:0.017361804842948914


In [14]:
predicted_volatility = model.predict(X_test)
predicted_volatility = sc.inverse_transform(predicted_volatility)

In [15]:
predicted_volatility

array([[1.6478556 ],
       [1.6499355 ],
       [1.6494689 ],
       ...,
       [0.96893394],
       [0.96913874],
       [0.9661962 ]], dtype=float32)

In [16]:
y_test = sc.inverse_transform(y_test)


In [17]:
import plotly.express as px

In [18]:
output = pd.DataFrame({
    'predicted_volatility' : [x[0] for x in predicted_volatility],
    'actual_volatility' : [x[0] for x in y_test]
})
output

Unnamed: 0,predicted_volatility,actual_volatility
0,1.647856,1.659447
1,1.649935,1.662316
2,1.649469,1.667671
3,1.648473,2.013691
4,1.676348,2.036940
...,...,...
1132,0.956192,0.959234
1133,0.964690,0.958767
1134,0.968934,0.953750
1135,0.969139,0.954752


In [19]:
px.line(output, y=["predicted_volatility", "actual_volatility"])

In [20]:
output.to_csv('../../Model Results/Nikkei Predicted Volatility.csv')
