In [2]:
import pandas as pd
import numpy as np
import random
import time
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from Statistics import Statistics

import tensorflow as tf
from tensorflow.compat.v1.keras.layers import CuDNNLSTM, Dropout,Dense,Input 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from tensorflow.compat.v1.keras.models import Model, Sequential, load_model
from tensorflow.keras import optimizers
import warnings
warnings.filterwarnings("ignore")

import os

In [59]:
# Load the index-constituent table. Each column is one period and lists the
# tickers that were members in that period; cells with no ticker come back
# from read_csv as NaN.
SP500_df = pd.read_csv('data/SPXconst_2020_new_index.csv')

# Every distinct ticker that appears anywhere in the table.
# NaN cells are filtered out explicitly: the previous `list.remove(np.nan)`
# raised ValueError when the table had no missing cell at all and only worked
# through object identity of the NaN value.
all_companies = list({ticker for ticker in SP500_df.values.flatten()
                      if pd.notna(ticker)})

# Map each period to its set of member tickers. The column header is split on
# '/', reversed and re-joined with '-', so the keys look like '1990-01'
# (presumably from headers of the form 'MM/YYYY' -- confirm against the CSV).
constituents = {'-'.join(col.split('/')[::-1]): set(SP500_df[col].dropna())
                for col in SP500_df.columns}
constituents

{'1990-01': {'ABMD',
  'ABT',
  'ADBE',
  'ADI',
  'ADM',
  'ADP',
  'ADSK',
  'AEP',
  'AFL',
  'AIG',
  'AJG',
  'ALK',
  'AMAT',
  'AMD',
  'AME',
  'AMGN',
  'AON',
  'AOS',
  'APA',
  'APD',
  'ATO',
  'AVY',
  'AXP',
  'BA',
  'BAC',
  'BAX',
  'BBWI',
  'BBY',
  'BDX',
  'BEN',
  'BF-B',
  'BIO',
  'BK',
  'BKR',
  'BLL',
  'BMY',
  'BRO',
  'C',
  'CAG',
  'CAH',
  'CAT',
  'CCL',
  'CDNS',
  'CERN',
  'CHD',
  'CI',
  'CINF',
  'CL',
  'CLX',
  'CMA',
  'CMCSA',
  'CMI',
  'CMS',
  'CNP',
  'COO',
  'COP',
  'COST',
  'CPB',
  'CSX',
  'CTAS',
  'CVS',
  'CVX',
  'D',
  'DD',
  'DE',
  'DHR',
  'DIS',
  'DOV',
  'DRE',
  'DTE',
  'DUK',
  'DVN',
  'DXC',
  'EA',
  'ECL',
  'ED',
  'EFX',
  'EIX',
  'EMR',
  'EOG',
  'ES',
  'ETN',
  'ETR',
  'EVRG',
  'EXC',
  'EXPD',
  'F',
  'FAST',
  'FDX',
  'FISV',
  'FITB',
  'FMC',
  'FRT',
  'GD',
  'GE',
  'GIS',
  'GL',
  'GLW',
  'GPC',
  'GWW',
  'HAL',
  'HAS',
  'HBAN',
  'HD',
  'HES',
  'HON',
  'HPQ',
  'HRL',
  'HST',
  'HSY'

In [60]:
# create_label
def create_label(df_open,df_close,perc=[0.5,0.5]):
    """Assign each stock a per-day class based on its intraday return.

    For every row, the intraday return ``close / open - 1`` of every stock
    column is ranked across that row (ties broken by column order) and the
    ranks are cut into quantile buckets whose widths are given by ``perc``.

    Parameters
    ----------
    df_open, df_close : DataFrame
        Open and close prices. Column 0 is compared between the two frames
        as a consistency check; all remaining columns are treated as stocks.
    perc : list of float
        Bucket widths; their cumulative sums (with a leading 0) are used as
        the quantile edges, e.g. ``[0.5, 0.5]`` -> ``[0.0, 0.5, 1.0]``.

    Returns
    -------
    DataFrame or None
        Integer bucket index per row and stock (0 = lowest returns), with the
        first row removed. ``None`` is returned, after printing a message,
        when column 0 of the two frames does not match.
    """
    # The close dates must line up one-to-one with the open dates.
    dates_match = np.all(df_close.iloc[:,0]==df_open.iloc[:,0])
    if not dates_match:
        print('Date Index issue')
        return None

    quantile_edges = [0.] + list(np.cumsum(perc))

    def _bucket_row(row):
        # Ranking first guarantees unique values, so qcut never sees
        # duplicate bin edges.
        return pd.qcut(row.rank(method='first'), quantile_edges, labels=False)

    intraday_return = df_close.iloc[:,1:] / df_open.iloc[:,1:] - 1
    label = intraday_return.apply(_bucket_row, axis=1)
    return label[1:]

In [71]:
# Build the feature table for a single stock (inline, exploratory version of
# create_stock_data). Requires df_open, df_close and label to already exist
# in the kernel.
st = 'AAPL'
m = 240                       # number of lags kept for every feature
lags = range(m - 1, -1, -1)   # m-1, ..., 1, 0: the largest lag becomes the left-most column

st_data = pd.DataFrame([])
st_data['Date'] = list(df_close['Date'])
st_data['Name'] = [st] * len(st_data)

# Feature 1: intraday return of `st`, close / open - 1.
daily_change = df_close[st] / df_open[st] - 1

# Feature 2: return from one row's close to the following row's open.
# The last row has no following open, so it is padded with NaN.
nextday_ret = np.array(df_open[st][1:]) / np.array(df_close[st][:-1]) - 1
nextday_ret = pd.Series(list(nextday_ret) + [np.nan])

# Feature 3: close-to-close return relative to the previous row.
close_change = df_close[st].pct_change()

# One column per (feature, lag): <prefix><k> holds the series shifted by k rows.
for prefix, series in (('IntraR', daily_change),
                       ('NextR', nextday_ret),
                       ('CloseR', close_change)):
    for k in lags:
        st_data[prefix + str(k)] = series.shift(k)

# Feature 4: the following row's intraday return.
st_data['IntraR-future'] = daily_change.shift(-1)
# Feature 5: class label. create_label drops its first row, so position i here
# carries the label of row i+1; the final row is padded with NaN.
st_data['label'] = list(label[st]) + [np.nan]
# Feature 6: Date with its last three characters removed
# (presumably 'YYYY-MM-DD' -> 'YYYY-MM'; confirm the date format).
st_data['Month'] = list(df_close['Date'].str[:-3])

# Rows without a full lag history, a following row or a label are discarded.
st_data = st_data.dropna()
trade_year = st_data['Month'].str[:4]
# 'Month' is left on st_data; it is not dropped in this cell.
st

In [61]:
# Load the open/close price tables for one study period and derive the labels
# and the list of stocks to process.
test_year = 1999

# The price files are keyed by test_year - 3
# (presumably the first year of the study window -- confirm).
filename = 'Open_Close/Open-'+str(test_year-3)+'.csv'
df_open = pd.read_csv(filename)
filename = 'Open_Close/Close-'+str(test_year-3)+'.csv'
df_close = pd.read_csv(filename)

# Treat a price of 0 as missing in BOTH tables.
# Fix: the close table was never cleaned before -- the second pass took its
# columns from df_open and rewrote df_open again, so a zero close survived and
# produced a -1 intraday return and a division by zero in the next-day return.
colums_open = df_open.columns
df_open[colums_open] = df_open[colums_open].replace(0,np.nan)
colums_close = df_close.columns
df_close[colums_close] = df_close[colums_close].replace(0,np.nan)

label = create_label(df_open,df_close)

# Stocks to process: index members as of December of the year before test_year.
stock_name = sorted(list(constituents[str(test_year-1)+'-12']))
train_data,test_data = [],[]
start = time.time()
# for st in stock_name:
#     st_train_data,st_test_data = 