In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time
from sklearn.preprocessing import RobustScaler

In [16]:
# Output locations for trained models and for prediction results.
model_folder = 'models-Intraday-240-1-LSTM'
result_folder = 'results-Intraday-240-1-LSTM'
for directory in [model_folder,result_folder]:
    # exist_ok makes the cell safely re-runnable without a separate
    # check-then-create step.
    os.makedirs(directory, exist_ok=True)

# Constituent table (presumably S&P 500 membership, going by the file name).
SP500_df = pd.read_csv('data/SPXconst_2020_new.csv')

# Every distinct non-missing cell value in the table, in order of first
# appearance. dropna() is used instead of list.remove(np.nan): the latter only
# works when the missing marker is the very same object as np.nan, and it
# raises ValueError when the table has no missing cells at all.
all_companies = list(pd.Series(SP500_df.values.ravel()).dropna().unique())

In [17]:
# Re-key the constituent table: each column header is split on '/' and joined
# back in reverse order with '-'; the value is the set of non-missing entries
# of that column.
constituents = {'-'.join(reversed(header.split('/'))): set(members.dropna())
                for header, members in SP500_df.items()}

# For every test year, collect everything that appears in any of the 36
# monthly keys of the three calendar years preceding it.
constituents_train = {}
for test_year in range(1993,2016):
    months = ['{}-{:02d}'.format(t, m)
              for t in range(test_year-3,test_year)
              for m in range(1,13)]
    constituents_train[test_year] = set().union(*(constituents[m] for m in months))

In [24]:
def create_label(df_open,df_close,perc=[0.5,0.5]):
    """Assign each stock a per-row quantile class from its close/open return.

    The first column of both frames must match element-wise; every remaining
    column is treated as one series. For each row, ``close/open - 1`` is
    ranked across columns (ties broken by column order) and the ranks are cut
    into bins whose edges are the cumulative sums of ``perc``.

    Parameters
    ----------
    df_open, df_close : pandas.DataFrame
        Frames whose first column is compared for equality and whose other
        columns are divided element-wise.
    perc : list of float
        Fraction of the columns that goes into each class, lowest returns
        first.

    Returns
    -------
    pandas.DataFrame or None
        Bin codes (0 is the lowest bin) with the first row removed, or None,
        after printing a message, when the first columns differ.
    """
    first_columns_match = np.all(df_close.iloc[:,0]==df_open.iloc[:,0])
    if not first_columns_match:
        print('Date Index issue')
        return

    quantile_edges = [0.]+list(np.cumsum(perc))
    intraday_return = df_close.iloc[:,1:]/df_open.iloc[:,1:]-1

    def _bin_row(row):
        # Ranking first keeps the quantile edges unique even when returns tie.
        return pd.qcut(row.rank(method='first'),quantile_edges,labels=False)

    label = intraday_return.apply(_bin_row, axis=1)
    return label[1:]
def create_stock_data(df_open,df_close,st,m=240,labels=None,year=None):
    """Build the lagged-return feature rows of one stock, split by year.

    Each output row holds: ``Date``, ``Name``, the ``m`` intraday returns
    ``IntraR{m-1}`` ... ``IntraR0`` (``IntraR0`` is the row's own
    ``close/open - 1``, ``IntraRk`` the value ``k`` rows earlier),
    ``IntraR-future`` (the next row's return) and ``label``. Rows with any
    missing value are dropped.

    Parameters
    ----------
    df_open, df_close : pandas.DataFrame
        Price frames with a ``Date`` column and a column named ``st``.
        ``Date`` is expected to hold strings starting with a 4-digit year
        (e.g. ``'1990-01-02'``).
    st : str
        Column name of the stock to process.
    m : int
        Number of lagged returns per row.
    labels : pandas.DataFrame, optional
        Frame with one column per stock and one row fewer than ``df_close``;
        entry ``i`` becomes the label of row ``i``. Defaults to the
        module-level ``label``.
    year : int or str, optional
        Test year. Defaults to the module-level ``test_year``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Rows dated before ``year`` and rows dated in ``year``, each as a 2-D
        array with the columns listed above.
    """
    # Fall back to the notebook globals so existing three-argument calls work.
    if labels is None:
        labels = label
    if year is None:
        year = test_year

    dates = list(df_close['Date'])
    # Dates and labels are attached by position, so put the returns on a
    # positional index too; otherwise a non-default input index would
    # misalign them and produce NaNs.
    daily_change = (df_close[st]/df_open[st]-1).reset_index(drop=True)

    # Collect all columns first and build the frame once. Inserting hundreds
    # of columns one by one fragments the DataFrame and triggers pandas'
    # PerformanceWarning.
    columns = {'Date': dates, 'Name': [st]*len(dates)}
    for k in reversed(range(m)):
        columns['IntraR'+str(k)] = daily_change.shift(k)
    columns['IntraR-future'] = daily_change.shift(-1)  # next row's return on the current row
    columns['label'] = list(labels[st])+[np.nan]  # pad the last row, which has no label

    st_data = pd.DataFrame(columns).dropna()

    trade_year = st_data['Date'].str[:4]  # year part of the date string
    st_train_data = st_data[trade_year<str(year)]  # years before the test year are training data
    st_test_data = st_data[trade_year==str(year)]  # the test year itself is test data
    return np.array(st_train_data),np.array(st_test_data)

In [42]:
# Single-year run. The full experiment would instead wrap the following
# steps in
#     for test_year in range(1993,2020):
# and print a '-'*40 banner around each year.
test_year = 1993

# Both price files are named after the year three before the test year.
filename = 'Open_Close/Open-{}.csv'.format(test_year-3)
df_open = pd.read_csv(filename)
filename = 'Open_Close/Close-{}.csv'.format(test_year-3)
df_close = pd.read_csv(filename)

# Preview the rows whose index label is at most 60.
df_close.loc[:60]

Unnamed: 0,Date,AAPL,CSCO,UAL,TROW,ISRG,NVR,TPR,DVN,CE,...,CRM,PGR,WAT,IEX,BWA,LRCX,NWL,UAA,BLK,PPL
0,1990-01-02,0.266084,,,0.928819,,4.944816,,4.601492,,...,,0.568438,,2.193261,,1.108421,4.907605,,,1.849706
1,1990-01-03,0.267870,,,0.944300,,4.944816,,4.688312,,...,,0.572129,,2.193261,,1.083789,4.882041,,,1.855081
2,1990-01-04,0.268764,,,0.967520,,5.065421,,4.688312,,...,,0.568438,,2.128752,,0.985263,4.856483,,,1.838952
3,1990-01-05,0.269656,,,0.959780,,5.186026,,4.601492,,...,,0.570284,,2.112624,,0.985263,4.856483,,,1.822820
4,1990-01-08,0.271442,,,0.967520,,5.186026,,4.471262,,...,,0.572129,,2.144881,,0.985263,4.882041,,,1.801312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,1990-03-22,0.292023,0.056498,,0.933734,,4.000000,,4.384441,,...,,0.518484,,1.628819,,1.330105,5.215108,,,1.812140
57,1990-03-23,0.302773,0.058381,,0.918171,,4.250000,,4.384441,,...,,0.516632,,1.628819,,1.330105,5.240798,,,1.828565
58,1990-03-26,0.302773,0.060892,,0.918171,,4.125000,,4.341032,,...,,0.522187,,1.628819,,1.453263,5.163728,,,1.817615
59,1990-03-27,0.300981,0.062148,,0.918171,,4.125000,,4.297620,,...,,0.522187,,1.628819,,1.453263,5.138040,,,1.817615


In [15]:
# Per-row class labels for every stock column; create_label returns None
# (after printing a message) when the first columns of the two frames differ.
# create_stock_data reads this module-level name.
label = create_label(df_open,df_close)

In [18]:
# Alphabetically ordered entries of the constituent set stored under the
# December key of the year before the test year.
last_train_month = str(test_year-1)+'-12'
stock_names = sorted(constituents[last_train_month])

In [22]:
# Fresh accumulators for the per-stock arrays, plus a wall-clock start stamp.
train_data = []
test_data = []
start = time.time()

In [26]:
# Build the train/test arrays of every stock and collect them. This appends
# to the existing lists, so re-initialise train_data/test_data before
# re-running this cell.
for st in stock_names:
    st_train_data, st_test_data = create_stock_data(df_open, df_close, st)
    train_data += [st_train_data]
    test_data += [st_test_data]

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [27]:
# Stack the per-stock arrays along the first axis. The names are rebound from
# lists to arrays, so this cell must run exactly once after the lists are
# filled.
train_data = np.concatenate(train_data)
test_data = np.concatenate(test_data)

In [34]:

# scaler = RobustScaler()
# scaler.fit(train_data[:,2:-2])

array(['1991-06-13', 'ABMD', -0.03409090909090906, -0.04705882352941182,
       -0.024691358024691357, -0.012658227848101222, 0.0, 0.0,
       0.01298701298701288, -0.012820512820512775, 0.0,
       0.012820512820512775, -0.012658227848101222, 0.012820512820512775,
       0.025316455696202445, -0.012499999999999956, 0.0,
       -0.012658227848101222, 0.0, 0.0, -0.012658227848101222, 0.0,
       0.012820512820512775, -0.038461538461538436, -0.06666666666666665,
       0.0, -0.02941176470588236, 0.02941176470588225,
       0.05714285714285716, 0.0, 0.0, 0.040000000000000036,
       -0.012820512820512775, -0.013157894736842146, 0.01333333333333342,
       0.01333333333333342, -0.03947368421052633, -0.01388888888888884,
       -0.028169014084507005, 0.01449275362318847, 0.0,
       0.014084507042253502, -0.01388888888888884, -0.014084507042253502,
       0.0, -0.028169014084507005, 0.0, -0.01449275362318836,
       0.044117647058823595, -0.01449275362318836, -0.014705882352941124,
       0