In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
import tensorflow as tf
import timeit

In [2]:
df = pd.read_csv("processed_data_SP100.csv", index_col = 1)
df.insert(0, 'Change_Indicator', np.where(df['ROCP_Close'] >= 0, 1, 0))

In [3]:
tickers_all = df['Ticker'].unique()
print(tickers_all)
print(len(tickers_all))

['goog' 'jnj' 'pep' 'abbv' 'ibm' 'lmt' 'slb' 'gs' 'cvs' 'eog' 'chtr' 'tjx'
 'agn' 'pnc' 'atvi' 'mdlz' 'intu' 'rtn' 'cl' 'psx' 'spg' 'bsx' 'vlo' 'nsc'
 'hca' 'tgt' 'zts' 'so' 'stz' 'wm' 'bbt' 'apd' 'rost' 'aep' 'adsk' 'stt'
 'cxo' 'pxd' 'yum' 'adm' 'amd' 'rcl' 'dfs' 'zbh' 'gis' 'cmi' 'ph' 'bf.b'
 'eqr' 'xel' 'k' 'ed' 'syf' 'kr' 'wy' 'rok' 'ntap' 'key' 'hrl' 'ctas'
 'mro' 'dte' 'azo' 'fitb' 'ca' 'fe' 'mkc' 'fast' 'ess' 'dhi' 'ctxs' 'tpr'
 'fti' 'cms' 'stx' 'cf' 'mos' 'expd' 'cinf' 'alb' 'mas' 'cboe' 'm' 'var'
 'tmk' 'irm' 'qrvo' 'uaa' 'wu' 'flir' 'slg' 'xray' 'alk' 'xrx' 'evhc' 'fl'
 'hrb' 'srcl']
98


In [4]:
# pick several stocks for test
tickers = list(tickers_all[70:])
df = df[df['Ticker'].isin(tickers)]

In [5]:
tickers = df['Ticker'].unique()

In [6]:
valid_set_size_percentage = 20
test_set_size_percentage = 10

seq_len = 20 
window_len = 5

n_steps = seq_len - 1
n_inputs = df.shape[1] - 1
n_outputs = 2

n_epochs = 250
patience = 10
min_delta = 0.01

In [7]:
def normalize_data(df):
    min_max_scaler = sklearn.preprocessing.MinMaxScaler()
    for i in df.columns:
        df[i] = min_max_scaler.fit_transform(df[i].values.reshape(-1,1))
    return df

In [8]:
def load_data(stock, seq_len, window_len):
    data_raw = stock.values; # convert to numpy array
    data = []
    
    # create all possible sequences of length seq_len
    for index in range((len(data_raw) - seq_len)//window_len): 
        data.append(data_raw[index * window_len: index * window_len + seq_len])
    
    data = np.array(data);
    valid_set_size = int(np.round(valid_set_size_percentage/100 * data.shape[0]));  
    test_set_size = int(np.round(test_set_size_percentage/100 * data.shape[0]));
    train_set_size = data.shape[0] - (valid_set_size + test_set_size);
    
    x_train = data[:train_set_size, :-1, :]
    y_train = data[:train_set_size, -1, 0:1]
    
    
    x_valid = data[train_set_size:train_set_size + valid_set_size, :-1, :]
    y_valid = data[train_set_size:train_set_size + valid_set_size, -1, 0:1]
    
    x_test = data[train_set_size+valid_set_size:, :-1, :]
    y_test = data[train_set_size+valid_set_size:, -1, 0:1]
    point = train_set_size + valid_set_size + 1
    
    return [x_train, y_train, x_valid, y_valid, x_test, y_test, point]

In [9]:
def one_hot_encode(labels, K = 2):
    N = len(labels)
    Z = np.zeros((N, K))
    int_list = [int(p) for p in labels.flatten()]
    Z[np.arange(N), int_list] = 1
    return Z

In [10]:
def get_next_batch(batch_size):
    global index_in_epoch, x_train, perm_array   
    start = index_in_epoch
    index_in_epoch += batch_size
    
    if index_in_epoch > x_train.shape[0]:
        np.random.shuffle(perm_array) # shuffle permutation array
        start = 0 # start next epoch

        index_in_epoch = batch_size
        
    end = index_in_epoch
    return x_train[perm_array[start:end]], y_train[perm_array[start:end]]

In [11]:
models = {
    'replication_attention_lstm': {
        'dropout': .2,
        'batch_size': 512,
        'neurons': 200,
        'outputs': 2,
        'batch_normalization': True,
        'activation': tf.nn.tanh, #use tanh as the paper did
        'optimizer': tf.train.AdamOptimizer(learning_rate = .001),
        'loss': tf.losses.softmax_cross_entropy,  # use cross_entropy for classification
        'layers': [{ 'layer': 'BasicLSTM', 'attention': True, 'attention_length': 40},
                    { 'layer': 'BasicLSTM', 'attention': False}]
    },
    'replication_lstm': {
        'dropout': .2,
        'batch_size': 512,
        'neurons': 200,
        'outputs': 2,
        'batch_normalization': True,
        'activation': tf.nn.tanh,
        'optimizer': tf.train.AdamOptimizer(learning_rate = .001),
        'loss': tf.losses.softmax_cross_entropy,
        'layers': [{ 'layer': 'BasicLSTM', 'attention': False},
                    { 'layer': 'BasicLSTM', 'attention': False}]
    }
}

In [12]:
model_results = pd.DataFrame(columns=['name', 'ticker', 'accuracy', 'total_change', 'predicted_investment', 'early_stop', 'time_in_seconds'])
model_results.set_index(['name', 'ticker'], inplace=True)

In [13]:
for name, model in models.items():
    for ticker in tickers:
        start = timeit.default_timer()
        print("Starting " + str(ticker))
        df_stock = df[df.Ticker == ticker][35:].copy()
        df_stock = df_stock.replace([np.inf, -np.inf], np.nan)
        df_stock = df_stock.dropna()
        df_stock = df_stock[df_stock['Volume'] > 0]
        df_stock.drop(['Ticker'], 1, inplace = True)
        
        if len(df_stock) < 252:
            print("Less than one year of trading data for " + str(ticker) + ", skipping it.")
            continue

        cols = list(df_stock.columns.values)

        df_stock_norm = df_stock.copy()
        df_stock_norm = normalize_data(df_stock_norm)

        x_train, y_train, x_valid, y_valid, x_test, y_test, returns_point = load_data(df_stock_norm, seq_len, window_len)
        index_in_epoch = 0
        perm_array  = np.arange(x_train.shape[0])
        np.random.shuffle(perm_array)

        y_train = one_hot_encode(y_train)
        y_valid = one_hot_encode(y_valid)
        y_test = one_hot_encode(y_test)

        tf.reset_default_graph()

        X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
        y = tf.placeholder(tf.float32, [None, n_outputs])

        is_training = tf.placeholder(tf.bool)

        layers = []
        for l in model['layers']:     
            if l['layer'] == 'BasicLSTM':
                cell = tf.contrib.rnn.LSTMCell(num_units = model['neurons'], activation = model['activation'], name='basic_lstm_cell')

            if l['attention'] == True:
                cell = tf.contrib.rnn.AttentionCellWrapper(cell, attn_length = l['attention_length'], state_is_tuple = True)

            cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob= 1 - model['dropout'])
            layers.append(cell)

        multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
        rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

        stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, model['neurons']]) 
        stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
        ## Batch Normalization
        if model['batch_normalization']:
            stacked_outputs = tf.layers.batch_normalization(stacked_outputs, training = is_training)

        outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])
        outputs = tf.nn.softmax(outputs)
        outputs = outputs[:, n_steps - 1, :] # keep only last output of sequence
        
        
        loss = model['loss'](y, outputs)# loss function = mean squared error 
        optimizer = model['optimizer']
        training_op = optimizer.minimize(loss)

        train_set_size = x_train.shape[0]
        test_set_size = x_test.shape[0]

        loss_list = list()
        sess_list = list()
        batch_size = model['batch_size']
        early_stop_time = -1
        with tf.Session() as sess: 
            sess.run(tf.global_variables_initializer())
            for iteration in range(int(n_epochs * train_set_size/batch_size)):
                x_batch, y_batch = get_next_batch(batch_size) # fetch the next training batch 

                sess.run(training_op, feed_dict={X: x_batch, y: y_batch, is_training: True}) 

                mse_valid = loss.eval(feed_dict={X: x_valid, y: y_valid,is_training: False})

                # If the first MSE - new MSE has improved more than min_delta, remove the first one so we can move on.
                if iteration > 0 and len(loss_list) >= patience and loss_list[0] - mse_valid > min_delta:
                    loss_list.pop(0)
                    sess_list.pop(0)
                elif iteration > 0 and len(loss_list) >= patience and loss_list[0] - mse_valid < min_delta:
                    print("Early stopping at " + str(iteration * batch_size/train_set_size) + ". Iteration " + str(iteration))    
                    early_stop_time = iteration * batch_size/train_set_size
                    break

                loss_list.append(mse_valid)
                sess_list.append(sess)

            # If we early stop, use the first one in the list, else use the last one since we hit epoch.
            if early_stop_time == -1:
                sess = sess_list[-1]
            else:
                sess = sess_list[0]

            y_train_pred = sess.run(outputs, feed_dict={X: x_train,is_training: True})
            y_valid_pred = sess.run(outputs, feed_dict={X: x_valid,is_training: False})
            y_test_pred = sess.run(outputs, feed_dict={X: x_test,is_training: False})

            pred_indicator = [1 if sdk[1] >= sdk[0] else 0 for sdk in y_test_pred]
            test_indicator = [1 if sdk[1] >= sdk[0] else 0 for sdk in y_test]

            comp = list()
            correct = 0
            test_position = 1
            for i in range(len(pred_indicator)):
                if pred_indicator[i] == test_indicator[i]:
                    correct += 1
                    comp.append(1)
                else:
                    comp.append(0)

                daily_change = (df_stock['Close'].iloc[returns_point + i] - df_stock['Open'].iloc[returns_point + i])/df_stock['Open'].iloc[returns_point + i]
                test_position *= 1 + (daily_change * (1 if pred_indicator[i] == 1 else -1))
                
#             total_change = df_stock['Close'][-1]/df_stock['Close'][-1 * returns_point]
#             stop = timeit.default_timer()
#             model_results.loc[(name, ticker), ('accuracy', 'total_change', 'predicted_investment', 'early_stop', 'time_in_seconds')] = [correct/len(pred_indicator), total_change, test_position, early_stop_time, (stop - start)]
            total_change = df_stock['Close'][-1]/df_stock['Close'][-1 * returns_point]
            stop = timeit.default_timer()
            model_results.loc[(name, ticker), ('accuracy', 'total_change', 'predicted_investment', 'early_stop', 'time_in_seconds')] = [correct/len(pred_indicator), total_change, test_position, early_stop_time, (stop - start)]
            print("Finished " + str(name) + " - " + str(ticker) + " in " + str((stop - start)) + " seconds.")

Starting ctxs




Early stopping at 6.69281045751634. Iteration 10
Finished replication_attention_lstm - ctxs in 69.89010036903812 seconds.
Starting tpr




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - tpr in 44.14559853636308 seconds.
Starting fti




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - fti in 43.627159430496874 seconds.
Starting cms




Early stopping at 4.448305821025196. Iteration 10
Finished replication_attention_lstm - cms in 56.34738848785395 seconds.
Starting stx




Early stopping at 12.8. Iteration 11
Finished replication_attention_lstm - stx in 46.80829132564159 seconds.
Starting cf




Early stopping at 12.075471698113208. Iteration 10
Finished replication_attention_lstm - cf in 41.48477568433043 seconds.
Starting mos




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - mos in 43.473590763067534 seconds.
Starting expd




Early stopping at 5.3222453222453225. Iteration 10
Finished replication_attention_lstm - expd in 53.936377191430495 seconds.
Starting cinf




Early stopping at 5.294725956566701. Iteration 10
Finished replication_attention_lstm - cinf in 54.09779936773708 seconds.
Starting alb




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - alb in 43.629667092005775 seconds.
Starting mas




Early stopping at 4.2419221209610605. Iteration 10
Finished replication_attention_lstm - mas in 56.86433255414545 seconds.
Starting cboe




Early stopping at 20.15748031496063. Iteration 10
Finished replication_attention_lstm - cboe in 25.91669022814142 seconds.
Starting m




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - m in 43.30200026965326 seconds.
Starting var




Early stopping at 12.8. Iteration 11
Finished replication_attention_lstm - var in 46.698140245862305 seconds.
Starting tmk




Early stopping at 5.873804971319312. Iteration 12
Finished replication_attention_lstm - tmk in 63.93311632124687 seconds.
Starting irm




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - irm in 43.494871183681084 seconds.
Starting qrvo




Early stopping at 69.03370786516854. Iteration 12
Finished replication_attention_lstm - qrvo in 12.852745428552453 seconds.
Starting uaa




Early stopping at 12.337349397590362. Iteration 10
Finished replication_attention_lstm - uaa in 40.41363444234389 seconds.
Starting wu




Early stopping at 13.2987012987013. Iteration 10
Finished replication_attention_lstm - wu in 37.72145273700903 seconds.
Starting flir




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - flir in 42.97200214129714 seconds.
Starting slg




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - slg in 43.39118211095263 seconds.
Starting xray




Early stopping at 5.517241379310345. Iteration 10
Finished replication_attention_lstm - xray in 53.27422029533295 seconds.
Starting alk




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - alk in 43.063049937621486 seconds.
Starting xrx




Early stopping at 3.56794425087108. Iteration 10
Finished replication_attention_lstm - xrx in 58.62116277442465 seconds.
Starting evhc




Early stopping at 36.056338028169016. Iteration 10
Finished replication_attention_lstm - evhc in 15.757774575566373 seconds.
Starting fl




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - fl in 43.213807725850074 seconds.
Starting hrb




Early stopping at 4.714548802946593. Iteration 10
Finished replication_attention_lstm - hrb in 55.029651049343556 seconds.
Starting srcl




Early stopping at 11.636363636363637. Iteration 10
Finished replication_attention_lstm - srcl in 42.989038083117975 seconds.
Starting ctxs




Early stopping at 6.69281045751634. Iteration 10
Finished replication_lstm - ctxs in 7.996956445310161 seconds.
Starting tpr




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - tpr in 6.857734013799927 seconds.
Starting fti




Early stopping at 12.8. Iteration 11
Finished replication_lstm - fti in 7.3348854514104005 seconds.
Starting cms




Early stopping at 4.448305821025196. Iteration 10
Finished replication_lstm - cms in 8.738813514293042 seconds.
Starting stx




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - stx in 6.9810958275995745 seconds.
Starting cf




Early stopping at 12.075471698113208. Iteration 10
Finished replication_lstm - cf in 6.83050074154653 seconds.
Starting mos




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - mos in 7.431732499418331 seconds.
Starting expd




Early stopping at 5.3222453222453225. Iteration 10
Finished replication_lstm - expd in 8.351175580110066 seconds.
Starting cinf




Early stopping at 5.294725956566701. Iteration 10
Finished replication_lstm - cinf in 8.322499037155922 seconds.
Starting alb




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - alb in 6.896250625066386 seconds.
Starting mas




Early stopping at 4.2419221209610605. Iteration 10
Finished replication_lstm - mas in 8.780249662506776 seconds.
Starting cboe




Early stopping at 20.15748031496063. Iteration 10
Finished replication_lstm - cboe in 4.655151031423657 seconds.
Starting m




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - m in 6.875651164259125 seconds.
Starting var




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - var in 6.948010397010421 seconds.
Starting tmk




Early stopping at 5.873804971319312. Iteration 12
Finished replication_lstm - tmk in 9.736695541562085 seconds.
Starting irm




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - irm in 7.120575965702756 seconds.
Starting qrvo




Early stopping at 57.52808988764045. Iteration 10
Finished replication_lstm - qrvo in 2.855585321077797 seconds.
Starting uaa




Early stopping at 12.337349397590362. Iteration 10
Finished replication_lstm - uaa in 6.616786882336328 seconds.
Starting wu




Early stopping at 14.628571428571428. Iteration 11
Finished replication_lstm - wu in 7.405689911270429 seconds.
Starting flir




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - flir in 6.938790986299409 seconds.
Starting slg




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - slg in 7.0245549441267485 seconds.
Starting xray




Early stopping at 5.517241379310345. Iteration 10
Finished replication_lstm - xray in 8.376028053040955 seconds.
Starting alk




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - alk in 6.9565414523615345 seconds.
Starting xrx




Early stopping at 3.56794425087108. Iteration 10
Finished replication_lstm - xrx in 9.285983207542358 seconds.
Starting evhc




Early stopping at 36.056338028169016. Iteration 10
Finished replication_lstm - evhc in 3.512398645362282 seconds.
Starting fl




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - fl in 7.173585017291543 seconds.
Starting hrb




Early stopping at 4.714548802946593. Iteration 10
Finished replication_lstm - hrb in 8.688211430908723 seconds.
Starting srcl




Early stopping at 11.636363636363637. Iteration 10
Finished replication_lstm - srcl in 7.075476742975525 seconds.


In [14]:
print(model_results)

                                   accuracy total_change predicted_investment  \
name                       ticker                                               
replication_attention_lstm ctxs    0.477064      1.79131             0.306914   
                           tpr     0.460317      1.38707              0.84486   
                           fti     0.507937      0.88117              0.74983   
                           cms     0.518293      2.74929             0.853589   
                           stx     0.603175     0.847461             0.983753   
                           cf      0.590164     0.738116              1.15352   
                           mos     0.634921     0.571186             0.872276   
                           expd    0.492754      1.68009              1.03159   
                           cinf    0.507246      2.14903              0.79609   
                           alb     0.555556      2.85639              1.03556   
                           m

In [15]:
#model_results.to_csv('tf_replication_SP.csv')