# Deep Learning
## Assignment 1 - Beat the market
### Abel de Wit & Malin Hjärtström


In [1]:
# Getting the data (commented for local use)
# The two commented lines below mount Google Drive; they only apply when the
# notebook runs on Google Colab and stay disabled for local runs.
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)
import tensorflow as tf
# Show the installed TensorFlow version so the run can be reproduced later.
print(tf.__version__)

2.0.0


In [2]:
import pandas as pd
from keras.models import Sequential
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [3]:
# Every data file is whitespace-delimited. The separator is a *raw* string so
# that "\s" reaches the regex engine untouched; in a normal string literal it is
# an invalid escape sequence that newer Python versions warn about.
WHITESPACE_SEP = r'\s+'

info_data = pd.read_csv("data/info.txt", sep=WHITESPACE_SEP)
market_analysis = pd.read_csv("data/market_analysis.txt", sep=WHITESPACE_SEP)
market_segments = pd.read_csv("data/market_segments.txt", sep=WHITESPACE_SEP)
stock_prices = pd.read_csv("data/stock_prices.txt", sep=WHITESPACE_SEP)

# Do something with all our data so we can feed it to the NN.
# `dataframe` is the very same object as `info_data` (no copy is made), so every
# column added below is visible through both names.
dataframe = info_data

# 'One hot encoding' the segments: company 1 is flagged BIO, all others IT.
# NOTE(review): this mapping is hard-coded; `market_segments` is loaded above
# but never consulted -- confirm the mapping against that file.
is_bio = dataframe['company'].eq(1)
dataframe["IT"] = (~is_bio).astype(int)
dataframe["BIO"] = is_bio.astype(int)
# market_analysis['trend'] is intentionally not added as a feature yet.
# The price column is matched to the rows by index label.
dataframe["stock-price"] = stock_prices['stock-price']

# Setting the indexes as the date
dataframe.set_index(['year', 'day'], inplace=True)

# For now all feature columns are kept. Candidates to drop in later experiments:
# sentiment, m1, m2, m3, m4, IT, BIO.

dataframe.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,company,quarter,expert1,expert2,sentiment,m1,m2,m3,m4,IT,BIO,stock-price
year,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017,3,0,0,0,0,10,6.3,1824,-1.0,0,1,0,102.2
2017,4,0,0,0,1,10,5.1,6912,-0.9,0,1,0,102.2
2017,5,0,0,0,1,10,6.6,8928,0.3,0,1,0,102.2
2017,6,0,0,0,1,10,7.8,6924,0.0,0,1,0,102.2
2017,9,0,0,0,1,10,-0.9,5635,0.9,0,1,0,102.2
2017,10,0,0,1,1,10,-8.0,9044,-0.1,0,1,0,104.3
2017,11,0,0,1,1,10,-4.2,4741,-0.8,0,1,0,106.8
2017,12,0,0,1,0,9,-1.0,3432,-0.2,0,1,0,106.8
2017,13,0,0,1,1,9,-3.8,2677,0.4,0,1,0,108.9
2017,17,0,0,1,1,10,9.8,916,-0.3,0,1,0,108.9


In [4]:
# Now we split into companies.
# Each selection is an explicit copy: the frames get new columns assigned in a
# later cell, and writing to a bare boolean-mask slice makes pandas emit
# SettingWithCopyWarning.
company_frames = {
    company_id: dataframe[dataframe['company'] == company_id].copy()
    for company_id in (0, 1, 2)
}

# Let's see how their stocks are doing
for company_id in (0, 1, 2):
    company_frames[company_id].plot(y='stock-price').set_title('Company {}'.format(company_id))

# 'company' is constant within each frame and 'quarter' is not used as a feature,
# so both are removed before modelling.
company_0 = company_frames[0].drop(columns=['company', 'quarter'])
company_1 = company_frames[1].drop(columns=['company', 'quarter'])
company_2 = company_frames[2].drop(columns=['company', 'quarter'])

# NOTE(review): from here on `company_0` refers to the *company 1* frame (same
# object, not a copy), so everything labelled "Company 0" below actually runs on
# company 1's data. This looks like a leftover experiment switch -- confirm.
company_0 = company_1

We want to predict whether the stock price goes up or not, so we convert the stock-price column into a binary label.

`if stock-price-today - stock-price-yesterday > 0 then 1, else 0`

In [5]:
import numpy as np

# Label a row 1 when its price is strictly above the previous row's price, else 0.
# The first row has no predecessor (shift() gives NaN there), so the comparison
# is False and the row is labelled 0.
previous_price = company_0['stock-price'].shift()
price_went_up = company_0['stock-price'] > previous_price
company_0['stock-price-binary'] = np.where(price_went_up, 1, 0)

# Only the binary label is kept; the raw price column is removed in place.
del company_0['stock-price']
company_0.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 1_level_0,expert1,expert2,sentiment,m1,m2,m3,m4,IT,BIO,stock-price-binary
year,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017,3,0,0,5,-0.1,482,0.7,0,0,1,0
2017,4,0,0,5,8.6,6262,0.6,0,0,1,0
2017,5,0,1,4,-6.1,6987,-0.1,0,0,1,1
2017,6,1,1,4,-4.9,2262,-0.1,0,0,1,1
2017,9,0,0,5,6.7,9254,-0.5,0,0,1,0
2017,...,...,...,...,...,...,...,...,...,...,...
2017,138,0,0,0,-9.1,6397,-0.2,0,0,1,1
2017,139,0,0,1,2.2,7144,0.2,0,0,1,0
2017,142,0,0,0,7.6,7898,-0.7,0,0,1,0
2017,143,0,0,0,6.1,1485,0.1,0,0,1,0


# Company 0

Now that we have the data in a nice table, split into separate companies, we can do some machine learning!

In [6]:
# Scale data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Fraction of the (chronologically last) rows held out for validation/testing.
TEST_FRACTION = 0.2

# Fit the scaler on the training rows only. Fitting it on the full table would
# let the min/max of the held-out rows leak into the training features.
# shuffle=False keeps the time order, so the training part is the leading rows.
company_0_train = train_test_split(company_0, test_size=TEST_FRACTION, shuffle=False)[0]
scaler = MinMaxScaler()
scaler.fit(company_0_train)

# Apply the training-set scaling to every row. Held-out values may fall slightly
# outside [0, 1]; that is expected.
scaled_0 = scaler.transform(company_0)
scaled_0 = pd.DataFrame(data=scaled_0, columns=company_0.columns)

# Features are all columns except the label.
X = scaled_0.loc[:, scaled_0.columns != 'stock-price-binary']
y = scaled_0['stock-price-binary']

print(X.shape)
num_features = X.shape[1]
print(y.shape)

# Now we split the data (chronologically, no shuffling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_FRACTION, shuffle=False)

(626, 9)
(626,)


In [9]:
import math

# Chronological 80/20 cut of the scaled table (label column included).
split_row = math.floor(0.8 * scaled_0.shape[0])
LX_train = scaled_0.iloc[:split_row]
LX_test = scaled_0.iloc[split_row:]

print(LX_train.shape)
print(LX_test.shape)

# The windowing helpers below index with NumPy semantics, so switch to arrays.
LX_train = LX_train.values
LX_test = LX_test.values

BATCH_SIZE = 2  # number of samples per weight update
TIME_STEPS = 5  # number of consecutive rows in each look-back window
features = num_features

def build_timeseries(mat, y_col_index, time_steps=None):
    """Turn a 2-D table into overlapping look-back windows and next-row targets.

    Sample ``i`` consists of rows ``i .. i + time_steps - 1`` of ``mat`` (all
    columns) and its target is column ``y_col_index`` of row ``i + time_steps``.

    Parameters
    ----------
    mat : array-like of shape (n_rows, n_columns)
        Row-ordered data; it is converted to a float array.
    y_col_index : int
        Index of the column that acts as the output column.
    time_steps : int, optional
        Window length. Defaults to the notebook-level ``TIME_STEPS`` so that
        existing two-argument calls keep working.

    Returns
    -------
    x : numpy.ndarray of shape (n_rows - time_steps, time_steps, n_columns)
    y : numpy.ndarray of shape (n_rows - time_steps,)

    Raises
    ------
    ValueError
        If ``time_steps`` is negative or ``mat`` has fewer rows than
        ``time_steps``.
    """
    if time_steps is None:
        time_steps = TIME_STEPS
    if time_steps < 0:
        raise ValueError("time_steps must be non-negative, got {}".format(time_steps))

    mat = np.asarray(mat, dtype=float)
    # total number of time-series samples is len(mat) - time_steps
    n_samples = mat.shape[0] - time_steps
    if n_samples < 0:
        raise ValueError(
            "need at least {} rows to build one window, got {}".format(
                time_steps, mat.shape[0]))

    # Row indices of every window, built in one shot:
    # window_rows[i, j] == i + j, i.e. the j-th row of the i-th window.
    window_rows = np.arange(n_samples)[:, None] + np.arange(time_steps)[None, :]
    x = mat[window_rows]
    # The target of a window is the output column of the row right after it.
    y = mat[time_steps:, y_col_index].copy()
    print("length of time-series i/o",x.shape,y.shape)
    return x, y

def trim_dataset(mat, batch_size):
    """
    Drop trailing rows so that the row count is a multiple of ``batch_size``.

    When the row count already divides evenly, ``mat`` itself is returned;
    otherwise a slice without the surplus rows at the end is returned.
    """
    total_rows = mat.shape[0]
    surplus = total_rows % batch_size
    if surplus <= 0:
        return mat
    return mat[:total_rows - surplus]
    
# The label ('stock-price-binary') is the last column of the scaled matrix.
label_col = LX_train.shape[1] - 1

# Training windows, trimmed so the sample count is a multiple of BATCH_SIZE.
x_t, y_t = build_timeseries(LX_train, label_col)
x_t = trim_dataset(x_t, BATCH_SIZE)
y_t = trim_dataset(y_t, BATCH_SIZE)

# Held-out windows are halved into validation and test sets. Trimming to a
# multiple of 2 * BATCH_SIZE guarantees that np.split can halve the array
# (it raises on an odd length) and that each half is itself a multiple of
# BATCH_SIZE.
x_temp, y_temp = build_timeseries(LX_test, label_col)
x_val, x_test_t = np.split(trim_dataset(x_temp, 2 * BATCH_SIZE), 2)
y_val, y_test_t = np.split(trim_dataset(y_temp, 2 * BATCH_SIZE), 2)

######################

import datetime, os
from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout, LSTM
from keras import metrics
from keras import regularizers
from keras import optimizers

# Adam's default step size in Keras. The previous value of 0.5 is several
# hundred times larger and makes the optimizer overshoot.
LEARNING_RATE = 0.001

# Create a model: one LSTM layer over the look-back window, a small dense
# layer, and a single sigmoid unit for the up/not-up label.
model = Sequential()

# batch_input_shape fixes the batch size, which is why the sample counts are
# trimmed to multiples of BATCH_SIZE before training.
model.add(LSTM(128,
          batch_input_shape=(BATCH_SIZE, TIME_STEPS, x_t.shape[2])))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimizer = optimizers.Adam(lr=LEARNING_RATE)
# Binary cross-entropy is the matching loss for a sigmoid output with 0/1
# targets; mean squared error gives weak gradients for this setup.
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

# Train the model, logging to a timestamped TensorBoard directory.
log_dir = os.path.join(
    "logs",
    "fit",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir, histogram_freq=1)

# Printed for reference only; neither value is passed to model.fit below.
samples = 20
steps = math.floor(x_t.shape[0] / samples)
print(steps)

# shuffle=False keeps the windows in chronological order during training.
model.fit(
    x=x_t,
    y=y_t,
    epochs=5,
    batch_size=BATCH_SIZE,
    shuffle=False,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback],
    verbose=1)

(500, 10)
(126, 10)
length of time-series i/o (495, 5, 10) (495,)
length of time-series i/o (121, 5, 10) (121,)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (2, 128)                  71168     
_________________________________________________________________
dense_3 (Dense)              (2, 8)                    1032      
_________________________________________________________________
dense_4 (Dense)              (2, 1)                    9         
Total params: 72,209
Trainable params: 72,209
Non-trainable params: 0
_________________________________________________________________
24
Train on 494 samples, validate on 60 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1418eda10>

In [8]:
# Load the TensorBoard notebook extension and open it on the "logs" directory.
%load_ext tensorboard
%tensorboard --logdir logs

Reusing TensorBoard on port 6007 (pid 9736), started 16:41:01 ago. (Use '!kill 9736' to kill it.)