In [17]:
# Imports and pre-defined functions
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
import numpy as np
import pandas as pd

def readTrain():
    train = pd.read_csv("../train_data.csv")
    data = train[['MidPrice', 'BidPrice1', 'BidVolume1', 'AskPrice1', 'AskVolume1']]
    data['BidVolume1'] -= data['AskVolume1']
    data = data.drop(columns=['AskVolume1'])
    return data

def readPredict():
    test = pd.read_csv("../test_data.csv")
    data = test[['MidPrice', 'BidPrice1', 'BidVolume1', 'AskPrice1', 'AskVolume1']]
    data['BidVolume1'] -= data['AskVolume1']
    data = data.drop(columns=['AskVolume1'])
    return data

def normalize(train):
    train_norm = train.apply(lambda x: (x - np.mean(x)) / (np.std(x)))
    return train_norm

def build_data(train):
    training = np.array(train)
    X_train = []
    y_train = []
    
    for i in range(train.shape[0] - 35):
        if not (i % 10000):
            print("BUILDING ITERATIONS: ", i)
        X_train.append(np.array(list(np.array((training[i+9])))))# + list(np.array(training[i+30:i+33])))[0])
        y_train.append(np.sum(np.array(train['MidPrice'][i+10:i+30]))/20)
    
    return np.array(X_train), np.array(y_train)

def shuffle(X,Y):
    np.random.seed(10)
    randomList = np.arange(X.shape[0])
    np.random.shuffle(randomList)
    return X[randomList], Y[randomList]

def splitData(X,Y,rate):
    X_train = X[int(X.shape[0]*rate):]
    Y_train = Y[int(Y.shape[0]*rate):]
    X_val = X[:int(X.shape[0]*rate)]
    Y_val = Y[:int(Y.shape[0]*rate)]
    return X_train, Y_train, X_val, Y_val

In [28]:
# read csv
train = readTrain()
print(train)

# Normalization
train_norm = normalize(train)
X_train, Y_train = build_data(train_norm)

# shuffle the data, and random seed is 10
X_train, Y_train = shuffle(X_train, Y_train)

# split training data and validation data
X_train, Y_train, X_val, Y_val = splitData(X_train, Y_train, 0.35)
print("X: ", X_train, "Y", Y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


        MidPrice  BidPrice1  BidVolume1  AskPrice1
0         3.7865      3.786      8600.0      3.787
1         3.7835      3.783    -75000.0      3.784
2         3.7835      3.783    -18400.0      3.784
3         3.7845      3.784    129000.0      3.785
4         3.7835      3.783    147600.0      3.784
5         3.7835      3.783    281900.0      3.784
6         3.7860      3.784    409200.0      3.788
7         3.7860      3.784     47300.0      3.788
8         3.7860      3.784    -69400.0      3.788
9         3.7855      3.784    -71300.0      3.787
10        3.7860      3.784    -54100.0      3.788
11        3.7870      3.786    -83900.0      3.788
12        3.7875      3.787     -5200.0      3.788
13        3.7855      3.785     22400.0      3.786
14        3.7855      3.785     22500.0      3.786
15        3.7845      3.784    304800.0      3.785
16        3.7845      3.784    291200.0      3.785
17        3.7835      3.783    340600.0      3.784
18        3.7840      3.783    

In [41]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
# model.add(LSTM(50, input_length=X_train.shape[1], input_dim=X_train.shape[2], return_sequences=False))
model.add(Dense(500, input_shape=(X_train.shape[1],)))
model.add(Dense(300))
# output shape: (1, 1)
model.add(Dense(100, activation='relu'))
model.add(Dense(30))#, activation='relu'))
model.add(Dense(1))
from keras import optimizers
opt = optimizers.adam(lr=0.01, beta_1=0.99, beta_2=0.999, epsilon=None, decay=0.99999, amsgrad=False)
model.compile(loss="mse", optimizer="adam")
model.summary()

callback = EarlyStopping(monitor="loss", patience=10, verbose=1, mode="auto")
model.fit(X_train, Y_train, epochs=10, batch_size=200, validation_data=(X_val, Y_val), callbacks=[callback])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 500)               2500      
_________________________________________________________________
dense_46 (Dense)             (None, 300)               150300    
_________________________________________________________________
dense_47 (Dense)             (None, 100)               30100     
_________________________________________________________________
dense_48 (Dense)             (None, 30)                3030      
_________________________________________________________________
dense_49 (Dense)             (None, 1)                 31        
Total params: 185,961
Trainable params: 185,961
Non-trainable params: 0
_________________________________________________________________
Train on 322503 samples, validate on 107501 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8

<keras.callbacks.History at 0x128dda6a0>

In [38]:
# read csv
predict = readPredict()

# Normalization
predict_norm = np.array(normalize(predict))

X_predict = []    
for i in range(0, predict_norm.shape[0], 10):
    if not (i % 10000):
        print("BUILDING ITERATIONS: ", i)
    X_predict.append(np.array(predict_norm[i+9]))

print(X_predict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


BUILDING ITERATIONS:  0
[array([ 1.99704767,  1.9871583 , -0.16378104,  2.00692451]), array([ 2.01837712,  2.01915518, -0.65549565,  2.01758828]), array([2.0077124 , 2.00848955, 0.76730846, 2.00692451]), array([ 1.96505349,  1.96582705, -0.31953291,  1.96426943]), array([ 1.86907097,  1.86983644, -0.01018359,  1.86829551]), array([1.90106514, 1.90183331, 0.01490906, 1.90028682]), array([ 1.87973569,  1.88050206, -0.02311011,  1.87895928]), array([ 1.95438877,  1.95516143, -0.08457443,  1.95360567]), array([1.96505349e+00, 1.96582705e+00, 1.47562250e-03, 1.96426943e+00]), array([1.96505349e+00, 1.96582705e+00, 2.08316797e-04, 1.96426943e+00]), array([1.93305932, 1.94449581, 0.00274293, 1.92161436]), array([ 1.95438877,  1.95516143, -0.03324855,  1.95360567]), array([1.95438877, 1.95516143, 0.02276636, 1.95360567]), array([ 1.97571822,  1.97649268, -0.09002385,  1.9749332 ]), array([1.96505349, 1.96582705, 0.6352552 , 1.96426943]), array([1.93305932e+00, 1.93383018e+00, 1.09543079e-03, 1

In [39]:
y_pred = model.predict(np.array(X_predict))
print(np.mean(predict['MidPrice']))
y_pred *= (np.std(predict['MidPrice'], axis=0)) 
y_pred += (np.mean(predict['MidPrice'], axis=0))
print(y_pred)

3.2422427
[[3.429388 ]
 [3.4313214]
 [3.4306684]
 [3.4264002]
 [3.41747  ]
 [3.4204779]
 [3.4184675]
 [3.425457 ]
 [3.4264786]
 [3.4264784]
 [3.4235291]
 [3.4254694]
 [3.4254832]
 [3.4274569]
 [3.4266336]
 [3.4234765]
 [3.418878 ]
 [3.4166121]
 [3.4138823]
 [3.4093456]
 [3.408499 ]
 [3.4032273]
 [3.3985302]
 [3.3954234]
 [3.3983717]
 [3.400344 ]
 [3.3991632]
 [3.4005666]
 [3.4008403]
 [3.4015174]
 [3.3997383]
 [3.3996744]
 [3.400403 ]
 [3.3985066]
 [3.3971128]
 [3.3963373]
 [3.4004645]
 [3.399184 ]
 [3.3981764]
 [3.3975046]
 [3.3923013]
 [3.387474 ]
 [3.3895197]
 [3.39046  ]
 [3.390468 ]
 [3.3894715]
 [3.3883457]
 [3.386579 ]
 [3.3824325]
 [3.38241  ]
 [3.3845088]
 [3.3834562]
 [3.3842263]
 [3.3853106]
 [3.3851664]
 [3.384755 ]
 [3.3815975]
 [3.3794181]
 [3.3765347]
 [3.3762898]
 [3.3781364]
 [3.3784292]
 [3.3785233]
 [3.3843486]
 [3.382376 ]
 [3.3894734]
 [3.3847094]
 [3.3858736]
 [3.3852775]
 [3.384129 ]
 [3.3841603]
 [3.3775024]
 [3.3774495]
 [3.377298 ]
 [3.3784518]
 [3.3784518]
 [

In [40]:
import csv
def predict_writeout(predict):
    # write csv
    with open("../predict.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            ["Id", "MidPrice"])
        for i in range(0, len(predict)):
            writer.writerow([(1 + i), predict[i][0]])

predict_writeout(y_pred)