In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time
from sklearn.preprocessing import RobustScaler

In [4]:
model_folder = 'models-Intraday-240-1-LSTM'
result_folder = 'results-Intraday-240-1-LSTM'
for directory in [model_folder,result_folder]:
    if not os.path.exists(directory):
        os.makedirs(directory)
SP500_df = pd.read_csv('data/SPXconst_2020_new.csv')
all_companies = list(set(SP500_df.values.flatten()))
all_companies.remove(np.nan)

In [5]:
constituents = {'-'.join(col.split('/')[::-1]):set(SP500_df[col].dropna()) 
                for col in SP500_df.columns}
constituents_train = {} 
for test_year in range(1993,2016):
    months = [str(t)+'-0'+str(m) if m<10 else str(t)+'-'+str(m) 
              for t in range(test_year-3,test_year) for m in range(1,13)]
    constituents_train[test_year] = [list(constituents[m]) for m in months]
    constituents_train[test_year] = set([i for sublist in constituents_train[test_year] 
                                         for i in sublist])

In [6]:
def create_label(df_open,df_close,perc=[0.5,0.5]):
    if not np.all(df_close.iloc[:,0]==df_open.iloc[:,0]):
        print('Date Index issue')
        return
    perc = [0.]+list(np.cumsum(perc))
    label = (df_close.iloc[:,1:]/df_open.iloc[:,1:]-1).apply(
            lambda x: pd.qcut(x.rank(method='first'),perc,labels=False), axis=1)
    return label[1:]
def create_stock_data(df_open,df_close,st,m=240):
    st_data = pd.DataFrame([])
    st_data['Date'] = list(df_close['Date'])
    st_data['Name'] = [st]*len(st_data)
    daily_change = df_close[st]/df_open[st]-1
    for k in range(m)[::-1]:
        st_data['IntraR'+str(k)] = daily_change.shift(k)

    st_data['IntraR-future'] = daily_change.shift(-1)  # 将后一天赋值给当前的日期  
    st_data['label'] = list(label[st])+[np.nan] #最后一个加一个nan
    st_data['Month'] = list(df_close['Date'].str[:-3]) # 去掉后面的天，留月份
    st_data = st_data.dropna()
    
    trade_year = st_data['Month'].str[:4] # 取年份
    st_data = st_data.drop(columns=['Month'])
    st_train_data = st_data[trade_year<str(test_year)] # 交易年份小于测试年份的都是训练年份
    st_test_data = st_data[trade_year==str(test_year)] # 交易年份是测试年份的则是测试年份
    return np.array(st_train_data),np.array(st_test_data)

In [28]:
test_year = 1993
# for test_year in range(1993,2020):
#     print('-'*40)
#     print(test_year)
#     print('-'*40)
filename = 'Open_Close/Open-'+str(test_year-3)+'.csv'
df_open = pd.read_csv(filename)
filename = 'Open_Close/Close-'+str(test_year-3)+'.csv'
df_close = pd.read_csv(filename)
colums = df_open.columns
df_open[colums] = df_open[colums].replace(0,np.nan)
df_open.loc[:,:]

Unnamed: 0,Date,AAPL,CSCO,UAL,TROW,ISRG,NVR,TPR,DVN,CE,...,CRM,PGR,WAT,IEX,BWA,LRCX,NWL,UAA,BLK,PPL
0,1990-01-02,0.251798,,,0.921079,,4.824210,,,,...,,0.564747,,2.177134,,,4.524198,,,1.844329
1,1990-01-03,0.271442,,,0.936560,,4.944816,,,,...,,0.570284,,2.209388,,,4.856481,,,1.860459
2,1990-01-04,0.273228,,,0.944300,,4.824211,,,,...,,0.568438,,2.177133,,,4.882043,,,1.849706
3,1990-01-05,0.269656,,,0.952040,,5.065421,,,,...,,0.568438,,2.112624,,,4.830923,,,1.833574
4,1990-01-08,0.267870,,,0.959780,,4.944816,,,,...,,0.568437,,2.080372,,,4.805360,,,1.812066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008,1993-12-27,0.206849,1.255515,,1.954947,,9.250000,,6.712947,,...,,1.785410,,4.531662,2.283227,8.793470,8.962267,,,3.017184
1009,1993-12-28,0.214302,1.250494,,1.954947,,9.125000,,6.930897,,...,,1.768187,,4.547788,2.395150,8.941261,8.907456,,,3.017184
1010,1993-12-29,0.218030,1.265560,,1.921241,,9.125000,,7.018077,,...,,1.791150,,4.596169,2.395151,8.941261,8.880045,,,3.031089
1011,1993-12-30,0.212439,1.285648,,1.954947,,9.125000,,7.105260,,...,,1.819853,,4.580043,2.439920,9.015156,8.770417,,,3.031090


In [29]:
label = create_label(df_open,df_close)

In [30]:
stock_names = sorted(list(constituents[str(test_year-1)+'-12']))

In [31]:
train_data,test_data = [],[]
start = time.time()

In [32]:
for st in stock_names:
        st_train_data,st_test_data = create_stock_data(df_open,df_close,st)
        train_data.append(st_train_data)
        test_data.append(st_test_data)

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [39]:
train_data

(31889580,)

In [36]:
train_data = np.concatenate([x for x in train_data])
test_data = np.concatenate([x for x in test_data])

In [38]:
train_data

(31889580,)

In [37]:
scaler = RobustScaler()
scaler.fit(train_data[:,2:-2])

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed