In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import time
from sklearn.preprocessing import RobustScaler

In [6]:
SP500_df = pd.read_csv('example_data/SPXconst.csv')
all_companies = list(set(SP500_df.values.flatten()))
all_companies.remove(np.nan)

In [7]:
constituents = {'-'.join(col.split('/')[::-1]):set(SP500_df[col].dropna()) 
                for col in SP500_df.columns}
constituents_train = {} 
for test_year in range(1993,2016):
    months = [str(t)+'-0'+str(m) if m<10 else str(t)+'-'+str(m) 
              for t in range(test_year-3,test_year) for m in range(1,13)]
    constituents_train[test_year] = [list(constituents[m]) for m in months]
    constituents_train[test_year] = set([i for sublist in constituents_train[test_year] 
                                         for i in sublist])

In [4]:
def create_label(df_open,df_close,perc=[0.5,0.5]):
    if not np.all(df_close.iloc[:,0]==df_open.iloc[:,0]):
        print('Date Index issue')
        return
    perc = [0.]+list(np.cumsum(perc))
    label = (df_close.iloc[:,1:]/df_open.iloc[:,1:]-1).apply(
            lambda x: pd.qcut(x.rank(method='first'),perc,labels=False), axis=1)
    return label[1:]
def create_stock_data(df_open,df_close,st,m=240):
    st_data = pd.DataFrame([])
    st_data['Date'] = list(df_close['Date'])
    st_data['Name'] = [st]*len(st_data)
    daily_change = df_close[st]/df_open[st]-1
    for k in range(m)[::-1]:
        st_data['IntraR'+str(k)] = daily_change.shift(k)

    st_data['IntraR-future'] = daily_change.shift(-1)  # 将后一天赋值给当前的日期  
    st_data['label'] = list(label[st])+[np.nan] #最后一个加一个nan
    st_data['Month'] = list(df_close['Date'].str[:-3]) # 去掉后面的天，留月份
    st_data = st_data.dropna()
    
    trade_year = st_data['Month'].str[:4] # 取年份
    st_data = st_data.drop(columns=['Month'])
    st_train_data = st_data[trade_year<str(test_year)] # 交易年份小于测试年份的都是训练年份
    st_test_data = st_data[trade_year==str(test_year)] # 交易年份是测试年份的则是测试年份
    return np.array(st_train_data),np.array(st_test_data)

In [3]:
test_year = 1993
filename = 'example_data/Open-'+str(test_year-3)+'.csv'
df_open = pd.read_csv(filename)
filename = 'example_data/Close-'+str(test_year-3)+'.csv'
df_close = pd.read_csv(filename)
# colums = df_open.columns
# df_open[colums] = df_open[colums].replace(0,np.nan)
# df_open.loc[:,:]
df_open

Unnamed: 0,Date,S129,S1099,S251,S1083,S940,S1134,S1275,S912,S1143,...,S19,S1117,S1321,S809,S75,S834,S1066,S408,S935,S1212
0,1990-01-02,1.33103,1.07115,2.20001,1.57617,1.00521,2.00071,2.51096,1.43664,2.44258,...,1.20592,1.55549,1.21377,,1.37474,1.10943,1.94760,1.54678,3.14967,1.56683
1,1990-01-03,1.32610,1.12461,2.31964,1.63177,1.09695,2.05903,2.63856,1.43780,2.47402,...,1.29022,1.61168,1.24447,,1.39436,1.11050,1.98664,1.57925,3.21192,1.58377
2,1990-01-04,1.31346,1.16587,2.23727,1.63424,1.06693,2.00304,2.67351,1.41660,2.38169,...,1.26476,1.55677,1.24834,,1.38928,1.09701,1.98286,1.58223,3.18242,1.61314
3,1990-01-05,1.28503,1.15661,2.20042,1.58966,1.05790,1.94599,2.67631,1.37909,2.30728,...,1.22083,1.53661,1.21756,,1.35736,1.09056,1.99340,1.55345,3.09184,1.61097
4,1990-01-08,1.27909,1.14932,2.16062,1.59672,1.04942,1.91052,2.66100,1.36326,2.23686,...,1.21761,1.54220,1.20981,,1.34297,1.08396,1.99639,1.54164,3.09002,1.64660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,1993-12-27,1.63051,1.59137,5.07444,4.18770,2.37723,1.14329,3.92387,1.27498,2.83442,...,3.04259,2.54138,1.44360,,1.52392,1.41662,3.05165,1.45870,1.72818,1.00748
1008,1993-12-28,1.62645,1.58454,5.08183,4.30786,2.37581,1.17375,3.85786,1.27320,2.82314,...,3.06701,2.56032,1.46827,,1.52830,1.42623,3.05614,1.45269,1.75498,1.01911
1009,1993-12-29,1.63025,1.57257,5.07966,4.28293,2.39762,1.21666,3.86412,1.29406,2.85079,...,3.07143,2.57052,1.47879,,1.52444,1.40376,3.03945,1.46751,1.75907,1.03488
1010,1993-12-30,1.67715,1.58768,4.96174,4.25863,2.35529,1.23154,3.84170,1.28644,2.87286,...,3.04820,2.54445,1.48073,,1.51256,1.41256,3.03054,1.46734,1.69611,1.07148


In [8]:
label = create_label(df_open,df_close)

In [9]:
stock_names = sorted(list(constituents[str(test_year-1)+'-12']))

In [10]:
train_data,test_data = [],[]
start = time.time()

In [11]:
for st in stock_names:
        st_train_data,st_test_data = create_stock_data(df_open,df_close,st)
        train_data.append(st_train_data)
        test_data.append(st_test_data)

KeyError: 'A'