# Deep Learning
## Assignment 1 - Beat the market
### Abel de Wit & Malin Hjärtström


In [1]:
# Getting the data (commented for local use)
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)
import tensorflow as tf
print(tf.__version__)

2.0.0


In [2]:
import pandas as pd
from keras.models import Sequential
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [3]:
info_data = pd.read_csv("data/info.txt", sep='\s+')
market_analysis = pd.read_csv("data/market_analysis.txt", sep='\s+')
market_segments = pd.read_csv("data/market_segments.txt", sep='\s+')
stock_prices = pd.read_csv("data/stock_prices.txt", sep='\s+')

# Do something with all our data so we can feed it to the NN
dataframe = info_data

# 'One hot encoding' the segments
dataframe["IT"] = dataframe['company'].apply(lambda x: 0 if x == 1 else 1)
dataframe["BIO"] = dataframe['company'].apply(lambda x: 1 if x == 1 else 0)
#dataframe["trend"] = market_analysis['trend']
dataframe["stock-price"] = stock_prices['stock-price']

# Setting the indexes as the date
dataframe.set_index(['year', 'day'], inplace=True)


# For now we dont use some data
# del dataframe['sentiment']
# del dataframe['m1']
# del dataframe['m2']
# del dataframe['m3']
# del dataframe['m4']

In [4]:
# Now we split into companies

company_0 = dataframe[dataframe['company'] == 0]
company_1 = dataframe[dataframe['company'] == 1]
company_2 = dataframe[dataframe['company'] == 2]

# Let's see how their stocks are doing

company_0.plot(y='stock-price').set_title('Company 0')
company_1.plot(y='stock-price').set_title('Company 1')
company_2.plot(y='stock-price').set_title('Company 2')

del company_0['company']
del company_0['quarter']
del company_1['company']
del company_1['quarter']
del company_2['company']
del company_2['quarter']

We want to predict wether the stock goes up or not, so we have to change the stock price values in such a way that it is binary.

`if stock-price-today - stock-price-yesterday > 0 then 1, else 0`

In [5]:
import numpy as np

company_0['stock-price-binary'] = np.where(company_0['stock-price'] > company_0['stock-price'].shift(), 1, 0)
del company_0['stock-price']

company_1['stock-price-binary'] = np.where(company_1['stock-price'] > company_1['stock-price'].shift(), 1, 0)
del company_1['stock-price']

company_2['stock-price-binary'] = np.where(company_2['stock-price'] > company_2['stock-price'].shift(), 1, 0)
del company_2['stock-price']

company_0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,Unnamed: 1_level_0,expert1,expert2,sentiment,m1,m2,m3,m4,IT,BIO,stock-price-binary
year,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017,3,0,0,10,6.3,1824,-1.0,0,1,0,0
2017,4,0,1,10,5.1,6912,-0.9,0,1,0,0
2017,5,0,1,10,6.6,8928,0.3,0,1,0,0
2017,6,0,1,10,7.8,6924,0.0,0,1,0,0
2017,9,0,1,10,-0.9,5635,0.9,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2019,175,1,0,4,7.8,4444,-0.9,0,1,0,0
2019,176,0,0,5,6.8,5901,-0.7,0,1,0,0
2019,177,0,0,4,8.1,1631,0.0,0,1,0,0
2019,178,0,0,5,4.3,352,-0.9,0,1,0,0


# Model definition

So now we have the data in a nice table, split into seperate companies, we can do some machine learning!

In [21]:
# Scale data
from sklearn.preprocessing import MinMaxScaler
import datetime, os
from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout, LSTM
from keras import regularizers
from sklearn.externals import joblib

def train_company(company, name):
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(company)
    scaled = pd.DataFrame(data=scaled, columns=company.columns)
    
    scaler_filename = "models/scaler " + name + ".save"
    joblib.dump(scaler, scaler_filename) 

    X = scaled.loc[:, scaled.columns != 'stock-price-binary']
    y = scaled['stock-price-binary']
    
    

    # Now we split the data
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)
    
    # Create a model.
    model = Sequential()
    model.add(Dense(64, input_dim=9, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, input_dim=9, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    #model.summary()

    # Tensorboard stuff
    log_dir = os.path.join(
        "logs",
        name,
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = TensorBoard(log_dir, histogram_freq=1)

    # Train the model
    model.fit(x=X_train, 
              y=y_train, 
              epochs=100,
              batch_size = 50,
              shuffle=False,
              validation_data=(X_val, y_val),
              callbacks=[tensorboard_callback],
              verbose=0)
    return model, X_test, y_test

In [26]:
companies = [company_0, company_1, company_2]

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

models = []
i = 0

for company in companies:
    model_company, X_test, y_test = train_company(company, "Company " + str(i))
    i += 1
    models.append(model_company)
    y_pred = model_company.predict_classes(X_test)
    y_pred = y_pred[:, 0]
    
    
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, y_pred)
    print("-"*20)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(y_test, y_pred)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(y_test, y_pred)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_test, y_pred)
    print('F1 score: %f' % f1)
    print("-"*20)
    model_company.save('models/c' + str(i) + '.h5')
    

--------------------
Accuracy: 0.857143
Precision: 0.777778
Recall: 0.736842
F1 score: 0.756757
--------------------
--------------------
Accuracy: 0.809524
Precision: 0.809524
Recall: 0.680000
F1 score: 0.739130
--------------------
--------------------
Accuracy: 0.761905
Precision: 0.769231
Recall: 0.454545
F1 score: 0.571429
--------------------


# Combination of all 3

In [None]:
stock_prices = pd.DataFrame()

stock_prices['spb_0'] = company_0['stock-price-binary']
del company_0['stock-price-binary']

stock_prices['spb_1'] = company_1['stock-price-binary']
del company_1['stock-price-binary']

stock_prices['spb_2'] = company_2['stock-price-binary']
del company_2['stock-price-binary']

big_dataframe = pd.concat([company_0, company_1], axis=1) 
big_dataframe = pd.concat([big_dataframe, company_2], axis=1) 
big_dataframe = pd.concat([big_dataframe, stock_prices], axis=1)

In [None]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(big_dataframe)
scaled = pd.DataFrame(data=scaled, columns=big_dataframe.columns)

X = scaled.iloc[:, :-3]
y = scaled.iloc[:, -3:]

# Now we split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)

num_features = X.shape[1]

# Create a model.
model = Sequential()
model.add(Dense(128, input_dim=num_features, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#model.summary()

# Tensorboard stuff
log_dir = os.path.join(
    "logs",
    "Company 1-3",
    datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir, histogram_freq=1)

# Train the model
model.fit(x=X_train, 
          y=y_train, 
          epochs=100,
          batch_size = 50,
          shuffle=False,
          validation_data=(X_val, y_val),
          callbacks=[tensorboard_callback],
          verbose=1)

model.save('models/c13.h5')

In [None]:
y_pred = model.predict(X_test)
print(X_test)

for r in range( y_pred.shape[0] ):
    for c in range( y_pred.shape[1] ):
        if y_pred[r,c] > 0.5:
            y_pred[r,c] = 1
        else:
            y_pred[r,c] = 0
    
print(y_pred)