In [2]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential, model_from_json, Model
from keras.layers import LSTM, Dropout, Flatten, Dense, concatenate, Input
from keras.optimizers import adam
from sklearn.metrics import mean_squared_error
from keras.callbacks import ModelCheckpoint
from sklearn import preprocessing

Using TensorFlow backend.


In [3]:
# Mount Google Drive inside the Colab runtime, then make the project folder the
# working directory so the relative file names used below resolve inside it.
from google.colab import drive

mount_point = '/content/gdrive'
project_dir = '/content/gdrive/My Drive/CFM/'
drive.mount(mount_point)
os.chdir(project_dir)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


# Training phase:

### Reading and preprocessing training data:

In [0]:
# Load features and targets; both files use their first column as the row index.
x_trn = pd.read_csv("x_train.csv", index_col=0)
y_trn = pd.read_csv("y_train.csv", index_col=0)
# Preprocessing training data:
# the target rides along as a temporary 'y' column so it stays attached to its
# row while the frame is sorted by date and re-indexed by (date, eqt_code).
x_trn = (
    x_trn.assign(y=y_trn.end_of_day_return)
         .sort_values(by=['date'])
         .set_index(['date', 'eqt_code'])
         .fillna(0.0)
)
# Split the target back out (as a plain array, in the sorted row order).
y_trn = x_trn['y'].values
x_trn = x_trn.drop(['y'], axis=1)
# Row index reused by the feature cells below.
idx = x_trn.index

### Creating features per stock:

In [5]:
# Features per stock.
# For every row, attach statistics of its stock (eqt_code) computed over the
# row-wise sum of all feature columns: mean, std and median across that stock's
# rows, plus a 'scaled' value (row sum minus stock median, divided by stock std).
#
# `groupby(level=...).transform(...)` broadcasts each per-stock statistic back
# onto the rows directly; it replaces the `Series.std(level=...)` / `.as_matrix()`
# calls, both of which have been removed from current pandas releases.
row_total = x_trn.sum(axis=1)
per_stock = row_total.groupby(level='eqt_code')
stock_feats = pd.DataFrame(
    {
        'mean': per_stock.transform('mean'),
        'std': per_stock.transform('std'),
        'median': per_stock.transform('median'),
    },
    columns=['mean', 'std', 'median'],
)
# NOTE(review): 'scaled' sums every feature column EXCEPT the last two, while
# median/std above come from the sum of all columns. This reproduces the
# original `iloc[:, :-5]` slice, which was taken after three statistic columns
# had been appended and looks like a leftover from when skew/kurt were joined
# as well. Behaviour is kept so that the training and test feature cells stay
# identical; if this is corrected, change both cells together and retrain.
partial_total = x_trn.iloc[:, :-2].sum(axis=1)
stock_feats['scaled'] = (partial_total - stock_feats['median']) / stock_feats['std']
# NOTE(review): a stock with a single row has std = NaN, so 'scaled' is NaN there.
X_st = stock_feats[['mean', 'std', 'median', 'scaled']].values
del row_total, per_stock, partial_total, stock_feats
gc.collect()

97

### Normalizing data:

In [0]:
# #Normalization
# norms = (1/np.sqrt(x_trn.shape[1]))*x_trn.apply(lambda x: np.sqrt(x.dot(x)), axis=1)
# x_trn = x_trn.divide(norms,axis=0)
# y_trn = y_train/norms
# x_trn.head()

### Creating time series features:

In [6]:
# Time-series features.
# Each feature is a frame with the same (rows, columns) layout as x_trn, computed
# along the column axis; they are stacked on a new last axis so that
# X_ts has shape (rows, columns, n_features) with the raw values in channel 0.
# `.values` replaces `.as_matrix()`, which has been removed from pandas, and a
# single np.stack replaces re-concatenating the growing array for every feature.
ts_features = [x_trn.values]
# Rolling exp average
ts_features.append(x_trn.ewm(halflife=3.0, axis=1).mean().values)
print('Rolling exp average added')
# Rolling min
ts_features.append(x_trn.rolling(6, axis=1, min_periods=0).min().values)
print('Rolling min added')
# Rolling max
ts_features.append(x_trn.rolling(6, axis=1, min_periods=0).max().values)
print('Rolling max added')
# Rolling median: with min_periods=3 the first positions are NaN, so they are
# filled forward and then backward along the row.
ts_features.append(
    x_trn.rolling(12, axis=1, min_periods=3).median().ffill(axis=1).bfill(axis=1).values
)
print('Rolling median added')
# Rolling std (same NaN filling as the rolling median)
ts_features.append(
    x_trn.rolling(12, axis=1, min_periods=3).std().ffill(axis=1).bfill(axis=1).values
)
print('Rolling std added')
# Rolling skew / kurt (window 12, min_periods 3) were left disabled in this notebook.
X_ts = np.stack(ts_features, axis=-1)
del ts_features

Rolling exp average added
Rolling min added
Rolling max added
Rolling median added
Rolling std added


### Saving features of training data:

In [0]:
# Cache the training arrays on disk (plain .npy, pickling disabled) so they can
# be reloaded without recomputing the features.
for _file, _array in (
    ('X_series_features.npy', X_ts),
    ('X_stocks_features.npy', X_st),
    ('Y.npy', y_trn),
):
    np.save(_file, _array, allow_pickle=False)
del _file, _array

In [0]:
# run only if environment disconnects on google colab
# Any training array that is missing from the session is restored from its
# cached .npy file; arrays that are still defined are left untouched.
for _var, _file in (
    ('X_ts', 'X_series_features.npy'),
    ('X_st', 'X_stocks_features.npy'),
    ('y_trn', 'Y.npy'),
):
    if _var not in globals():
        globals()[_var] = np.load(_file, allow_pickle=False)
del _var, _file

#### Custom losses and accuracy metrics (Keras backend and NumPy):

In [0]:
from keras import backend as K
def sign_acc(y_true, y_pred):
    """Return the fraction of entries where prediction and target share a sign.

    The element-wise product is mapped to 1 where it is positive and to 0 where
    it is negative; entries that are exactly zero stay 0. The mean of the
    result is returned.
    """
    product = y_pred * y_true
    hits = np.where(product > 0, 1, np.where(product < 0, 0, product))
    return np.mean(hits)
  
def K_acc(y_true, y_pred):
    """Keras metric: mean over the last axis of (y_pred * y_true > 0)."""
    same_sign = K.greater(y_pred * y_true, 0)
    return K.mean(same_sign, axis=-1)
  
def K_loss(y_true, y_pred):
  """Logistic loss log(1 + exp(-y_true * y_pred)), averaged over the last axis.

  Computed with the backend softplus, softplus(z) = log(1 + exp(z)), rather than
  spelling out log/exp, so a large negative margin does not overflow exp() to
  inf and poison the loss.
  """
  return K.mean(K.softplus(-y_true * y_pred), axis=-1)

def M_loss(y_pred, y_true):
  """Hinge-style loss with margin 0.3: mean over the last axis of max(0.3 - y_pred*y_true, 0).

  The parameters are declared as (y_pred, y_true), the reverse of the other
  Keras functions in this cell; the result is the same either way because only
  the product of the two arguments is used.
  """
  shortfall = 0.3 - y_pred * y_true
  return K.mean(K.maximum(shortfall, 0.), axis=-1)

def CFM_metric(y_true, y_pred):
  """Fraction of entries where (y_true > 0) agrees with (y_pred > 0.5)."""
  actual_up = y_true > 0
  predicted_up = y_pred > 0.5
  return (actual_up == predicted_up).mean()

def sigmoid(x):
  """Logistic function 1 / (1 + exp(-x)), applied element-wise via NumPy."""
  decay = np.exp(-x)
  return 1 / (1 + decay)

### Training and validation split:

In [6]:
# Set to True to hold out the last 20% of rows for validation.
with_val = False
if with_val:
  # training-validation split: first 80% of rows train, remaining 20% validate
  # (no shuffling; rows keep the order they have in X_ts / X_st / y_trn).
  n = int(X_ts.shape[0]*.80)
  X_ts_trn, X_ts_val = X_ts[:n], X_ts[n:]
  X_st_trn, X_st_val = X_st[:n], X_st[n:]
  Y_trn, Y_val = y_trn[:n], y_trn[n:]
  # X_ts / X_st / y_trn are intentionally NOT deleted here: the slices above are
  # views onto them, so deleting the names would free no memory, and a later
  # cell runs `del X_ts,X_st` unconditionally.

# The raw training frame is no longer needed once the feature arrays exist.
# pop() instead of `del` because x_trn is not defined when the arrays were
# reloaded from the .npy cache after a Colab disconnect.
globals().pop('x_trn', None)
gc.collect()

440

### Building the model: an LSTM over the time-series features, merged with the per-stock features

In [0]:
# train the model
def build_model(X_ts_trn, X_st_trn, Y_trn, model_name, with_val = False, val_data=None,
                epochs=30, batch_size=2000, verbose=1):
  """Build the two-input LSTM regressor, fit it, and checkpoint the best epoch.

  Architecture: an LSTM(144) -> Dropout(0.2) -> Dense(24) branch over the
  time-series input is concatenated with the per-stock feature vector and fed
  through Dense(15, relu) -> Dropout(0.2) -> Dense(15, relu) -> Dense(1).
  The model is compiled with MAE loss, the 'adam' optimizer and the K_acc metric.

  Args:
    X_ts_trn: 3-D array; axes 1 and 2 define the LSTM input shape.
    X_st_trn: 2-D array; axis 1 is the per-stock feature count.
    Y_trn: training targets.
    model_name: stem of the checkpoint file 'models/<model_name>_best_w.hdf5'.
    with_val: if True, fit with `val_data` and checkpoint on 'val_K_acc';
      otherwise checkpoint on the training 'K_acc'.
    val_data: value passed to `fit(validation_data=...)`; required when
      `with_val` is True.
    epochs, batch_size, verbose: forwarded to `model.fit` (defaults match the
      values previously hard-coded / implied in this function).

  Returns:
    (model, hist): the fitted model and the History object returned by `fit`.

  Raises:
    ValueError: if `with_val` is True but `val_data` is None, because the
      monitored 'val_K_acc' would never exist and nothing would be checkpointed.
  """
  if with_val and val_data is None:
    raise ValueError("with_val=True requires val_data=([X_ts_val, X_st_val], Y_val)")

  # define model
  ts_shape = (X_ts_trn.shape[1], X_ts_trn.shape[2])
  features_ts = Input(shape=ts_shape, name='features_ts')
  features_st = Input(shape=(X_st_trn.shape[1],), name='features_st')
  lstm = Sequential()
  lstm.add(LSTM(144, input_shape=ts_shape))
  lstm.add(Dropout(rate=0.2))
  lstm.add(Dense(24))
  lstm_output = lstm(features_ts)
  merged = concatenate([lstm_output, features_st], axis=1)
  layer1 = Dense(15, activation='relu')(merged)
  layer2 = Dropout(rate=0.2)(layer1)
  layer3 = Dense(15, activation='relu')(layer2)
  final  = Dense(1)(layer3)
  model = Model(inputs=[features_ts, features_st], outputs=[final])
  model.compile(loss= 'mae', optimizer='adam',metrics=[K_acc])

  # The checkpoint callback writes into 'models/'; create it if it is missing.
  if not os.path.isdir('models'):
    os.makedirs('models')
  # Accuracy is maximised; state the mode explicitly rather than relying on the
  # callback inferring it from the metric name.
  monitor = 'val_K_acc' if with_val else 'K_acc'
  checkpointer = ModelCheckpoint(filepath='models/'+model_name+'_best_w.hdf5', monitor=monitor,
                                 mode='max', verbose=1, save_best_only=True)

  # fit network
  fit_kwargs = dict(epochs=epochs, batch_size=batch_size, verbose=verbose,
                    callbacks=[checkpointer])
  if with_val:
    fit_kwargs['validation_data'] = val_data
  hist = model.fit([X_ts_trn, X_st_trn], Y_trn, **fit_kwargs)
  return model, hist

### Training the model:

In [23]:
# Name used for every artefact written for this run (checkpoints, JSON, CSV).
model_name = 'LSTM_mae_all'
if not with_val:
  # train on whole data
  model, hist = build_model(X_ts, X_st, y_trn, model_name)
else:
  # train on (train, val) split of training data
  model, hist = build_model(
      X_ts_trn, X_st_trn, Y_trn, model_name,
      True, ([X_ts_val, X_st_val], Y_val),
  )


Epoch 1/30

Epoch 00001: K_acc improved from -inf to 0.50261, saving model to models/features_w_mae_all.hdf5
Epoch 2/30

Epoch 00002: K_acc improved from 0.50261 to 0.51081, saving model to models/features_w_mae_all.hdf5
Epoch 3/30

Epoch 00003: K_acc improved from 0.51081 to 0.51564, saving model to models/features_w_mae_all.hdf5
Epoch 4/30

Epoch 00004: K_acc improved from 0.51564 to 0.51807, saving model to models/features_w_mae_all.hdf5
Epoch 5/30

Epoch 00005: K_acc improved from 0.51807 to 0.52012, saving model to models/features_w_mae_all.hdf5
Epoch 6/30

Epoch 00006: K_acc improved from 0.52012 to 0.52077, saving model to models/features_w_mae_all.hdf5
Epoch 7/30

Epoch 00007: K_acc did not improve from 0.52077
Epoch 8/30

Epoch 00008: K_acc did not improve from 0.52077
Epoch 9/30

Epoch 00009: K_acc improved from 0.52077 to 0.52088, saving model to models/features_w_mae_all.hdf5
Epoch 10/30

Epoch 00010: K_acc improved from 0.52088 to 0.52131, saving model to models/features_w

### Saving the model and its weights:

In [24]:
# Persist the trained model in two parts under 'models/':
# the architecture as JSON and the final-epoch weights as HDF5.
if not os.path.isdir('models'):
  os.makedirs('models')
# serialize model to JSON
model_json = model.to_json()
with open('models/'+model_name+'.json', 'w') as json_file:
  json_file.write(model_json)
# serialize weights to HDF5
model.save_weights('models/'+model_name+'_last_w.hdf5')
# The original line was missing a '+' after model_name and did not parse.
print('Saved model ' + model_name + ' to disk')

Saved model LSTM to disk


### Plot of training and validation loss

In [0]:
# Training and validation loss per epoch, side by side. Only drawn when the
# model was fitted with validation data, since 'val_loss' is read from the history.
if with_val:
  fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
  ax[0].plot(hist.history['loss'])
  ax[0].set_xlabel('epoch')
  ax[0].set_ylabel('loss')
  ax[0].set_title('model train loss')
  ax[0].legend(['train'], loc='upper left')

  ax[1].plot(hist.history['val_loss'])
  ax[1].set_xlabel('epoch')
  ax[1].set_ylabel('val_loss')
  ax[1].set_title('model validation loss')
  ax[1].legend(['val'], loc='upper left')
  fig.suptitle(model_name)
  # Save before showing so the file is written from the fully drawn figure.
  fig.savefig(model_name+'.jpg')
  plt.show()

# Test phase:

### Deleting training data from memory:

In [25]:
# Release the full training arrays before building the test features.
# pop() rather than `del`: the names may already be gone (the validation-split
# branch deletes them), and a plain `del` would then raise NameError.
for _name in ('X_ts', 'X_st'):
    globals().pop(_name, None)
del _name
gc.collect()

439

### Reading test data and creating features:

In [0]:
x_tst = pd.read_csv("x_test.csv",  index_col=0)
# Row order of the file as read; the predictions are put back in this order later.
idx_before = x_tst.index
# Preprocessing test data:
# the original row label travels as a temporary 'index' column so it can be
# recovered after the frame is sorted by date and re-indexed by (date, eqt_code).
x_tst = (
    x_tst.assign(index=x_tst.index)
         .sort_values(by=['date'])
         .set_index(['date', 'eqt_code'])
         .fillna(0.0)
)
# Original row labels in the sorted order.
idx_tst = x_tst['index'].values
x_tst = x_tst.drop(['index'], axis=1)
idx = x_tst.index

In [27]:
# Features per stock (test data).
# For every row, attach statistics of its stock (eqt_code) computed over the
# row-wise sum of all feature columns: mean, std and median across that stock's
# rows, plus a 'scaled' value (row sum minus stock median, divided by stock std).
#
# `groupby(level=...).transform(...)` broadcasts each per-stock statistic back
# onto the rows directly; it replaces the `Series.std(level=...)` / `.as_matrix()`
# calls, both of which have been removed from current pandas releases.
row_total = x_tst.sum(axis=1)
per_stock = row_total.groupby(level='eqt_code')
stock_feats = pd.DataFrame(
    {
        'mean': per_stock.transform('mean'),
        'std': per_stock.transform('std'),
        'median': per_stock.transform('median'),
    },
    columns=['mean', 'std', 'median'],
)
# NOTE(review): 'scaled' sums every feature column EXCEPT the last two, while
# median/std above come from the sum of all columns. This reproduces the
# original `iloc[:, :-5]` slice, which was taken after three statistic columns
# had been appended and looks like a leftover from when skew/kurt were joined
# as well. Behaviour is kept so that the training and test feature cells stay
# identical; if this is corrected, change both cells together and retrain.
partial_total = x_tst.iloc[:, :-2].sum(axis=1)
stock_feats['scaled'] = (partial_total - stock_feats['median']) / stock_feats['std']
# NOTE(review): a stock with a single row has std = NaN, so 'scaled' is NaN there.
X_st_tst = stock_feats[['mean', 'std', 'median', 'scaled']].values
del row_total, per_stock, partial_total, stock_feats
gc.collect()

97

In [28]:
# Time-series features (test data).
# Each feature is a frame with the same (rows, columns) layout as x_tst, computed
# along the column axis; they are stacked on a new last axis so that
# X_ts_tst has shape (rows, columns, n_features) with the raw values in channel 0.
# `.values` replaces `.as_matrix()`, which has been removed from pandas, and a
# single np.stack replaces re-concatenating the growing array for every feature.
ts_features = [x_tst.values]
# Rolling exp average
ts_features.append(x_tst.ewm(halflife=3.0, axis=1).mean().values)
print('Rolling exp average added')
# Rolling min
ts_features.append(x_tst.rolling(6, axis=1, min_periods=0).min().values)
print('Rolling min added')
# Rolling max
ts_features.append(x_tst.rolling(6, axis=1, min_periods=0).max().values)
print('Rolling max added')
# Rolling median: with min_periods=3 the first positions are NaN, so they are
# filled forward and then backward along the row.
ts_features.append(
    x_tst.rolling(12, axis=1, min_periods=3).median().ffill(axis=1).bfill(axis=1).values
)
print('Rolling median added')
# Rolling std (same NaN filling as the rolling median)
ts_features.append(
    x_tst.rolling(12, axis=1, min_periods=3).std().ffill(axis=1).bfill(axis=1).values
)
print('Rolling std added')
# Rolling skew / kurt (window 12, min_periods 3) were left disabled in this notebook.
X_ts_tst = np.stack(ts_features, axis=-1)
del ts_features

Rolling exp average added
Rolling min added
Rolling max added
Rolling median added
Rolling std added


### Saving features of test data:

In [0]:
# Cache the test arrays on disk (plain .npy, pickling disabled).
for _file, _array in (
    ('X_series_features_test.npy', X_ts_tst),
    ('X_stocks_features_test.npy', X_st_tst),
):
    np.save(_file, _array, allow_pickle=False)
del _file, _array

### Loading the saved model if necessary (if google colab disconnects):

In [0]:
# Which saved run to restore, and which of its weight files to use.
model_to_load = 'LSTM_mae_all'
# best or last weights:
w_type = 'best'

# Rebuild the architecture from its JSON description.
json_path = 'models/'+model_to_load+'.json'
with open(json_path, 'r') as json_file:
  loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)

# Load the selected weights into the rebuilt model.
weights_path = 'models/'+model_to_load+'_'+w_type+'_w.hdf5'
model.load_weights(weights_path)
print("Loaded model from disk")

### Running the model on test data:

In [30]:
# Score the test set: the model takes the time-series tensor and the per-stock
# feature matrix as its two inputs, in that order.
test_inputs = [X_ts_tst, X_st_tst]
y_hat = model.predict(test_inputs, batch_size=200, verbose=1)



### Reordering and saving data:

In [0]:
# Put the predictions back into the row order of the test file and write them out.
# y_hat rows follow the date-sorted frame, so they are first labelled with the
# original row labels in sorted order (idx_tst) and then reindexed to the order
# the file was read in (idx_before).
y_pred = pd.DataFrame(y_hat[:,0],columns=['end_of_day_return'])
y_pred.index = idx_tst
y_pred = y_pred.reindex(idx_before, copy=True)
# model_name is only defined by the training cell; when the session was resumed
# through the "load saved model" cell, fall back to the name used there.
try:
  csv_stem = model_name
except NameError:
  csv_stem = model_to_load
y_pred.to_csv(csv_stem+'.csv')