In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Let pandas render up to 100 columns and 100 rows before truncating a frame.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 100)


In [2]:
# Raw input tables. Only the financial-statement table is keyed by its
# base_date column on load; the other tables keep the default index.
stock_labels = pd.read_csv('../data/stock_labels.csv')
stock_fin = pd.read_csv('../data/stock_fin.csv', index_col='base_date')
stock_price = pd.read_csv('../data/stock_price.csv')
stock_list = pd.read_csv('../data/stock_list.csv')

# Convert the base_date index to datetimes.
stock_fin.index = pd.to_datetime(stock_fin.index)

### 特徴量の生成
<テクニカル指標>  
* 移動平均乖離率(5日、25日、75日)
* ヒストリカルボラティリティ(5日, 10日, 25日, 50日, 75日, 100日) 
* 過去n日間の最高値に対する終値・高値(安値)との乖離率(n=5, 10, 20)  
* キリ番との乖離率(10,000円未満の株価は100円台を基準、10,000円以上の株価は1,000円台を基準)  
* RSI
* H-L_C: 株価に対する1日当たりの値幅
* MAXX_H-L_C: H-L_Cの移動平均(XX = 5日, 10日, 25日, 50日, 75日, 100日)
* 各ヒストリカルボラティリティの移動平均(20日)

今回の予測対象は最大値と最小値であるため、高値と安値に関する指標があったほうがよさそう。  
実際、株価は過去の株価の高値や安値をサポートラインとすることが多い。  
キリ番(キリが良い番号。100円とか)もサポートラインになることが多い。  
  
<ファンダメンタル指標>
* 売上高営業利益率, 売上高経常利益率, 売上高当期純利益率 
* 前期比(成長率)  
* 来期予想成長率  
* 自己資本比率
* ROE(当期純利益÷自己資本)
* ROA
* PER, PBRは用意されたデータセットの発行済み株式数では未来情報のため利用しない
* キャッシュフローの正負(四半期データは欠損値となっている。それらはゼロにする)
* 配当利回り  
  欠損値が少ない一株当たりの四半期配当金を利用する。欠損値は、データの開始時点から1年が多く見られるため、問題なさそう。
* 決算種別:Result_FinancialStatement ReportType
  季節効果を期待して組み込んでみる

 
 <価格・その他>
* 出来高:EndOfDayQuote Volume  
* 業種区分:17 Sector(Code)

※価格情報の利用を避けるために率に変換するなどして銘柄間のスケールを統一する。  
※カテゴリデータはone-hotベクトル化  
※基準化は業種区分ごとでの平均ですべきかもしれないが今回は時間の都合上、諦める


In [3]:
# Distinct 'Local Code' values found in the price table, in ascending order.
codes = list(set(stock_price['Local Code'].values))
codes.sort()

In [4]:
# Load the merged per-code feature table; the first CSV column becomes the index.
df_all_code_merged = pd.read_csv('../data/all_code_tech_fund2.csv', index_col=[0])
# Convert the index to datetimes.
df_all_code_merged.index = pd.to_datetime(df_all_code_merged.index)
# Report (rows, columns) and preview the first three rows.
print(df_all_code_merged.shape)
df_all_code_merged.head(3)

(61440, 106)


Unnamed: 0,Local Code,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP,log_R,return_5,return_25,return_75,HV_5,HV_10,HV_25,HV_50,HV_75,HV_100,MA20_HV5,MA20_HV10,MA20_HV25,MA20_HV50,MA20_HV75,MA20_HV100,MADR5,MADR25,MADR75,MXDR5,MXDR10,MXDR20,MNDR5,MNDR10,MNDR20,RNDR,RSI,H-L_C,MA20_H-L_C,Local Code_fund,Result_FinancialStatement AccountingStandard,Result_FinancialStatement FiscalPeriodEnd,Result_FinancialStatement ReportType,Result_FinancialStatement FiscalYear,Result_FinancialStatement ModifyDate,...,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement AccountingStandard,Forecast_FinancialStatement FiscalPeriodEnd,Forecast_FinancialStatement ReportType,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement ModifyDate,Forecast_FinancialStatement CompanyType,Forecast_FinancialStatement ChangeOfFiscalYearEnd,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalPeriodEnd,Result_Dividend ReportType,Result_Dividend FiscalYear,Result_Dividend ModifyDate,Result_Dividend RecordDate,Result_Dividend DividendPayableDate,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalPeriodEnd,Forecast_Dividend ReportType,Forecast_Dividend FiscalYear,Forecast_Dividend ModifyDate,Forecast_Dividend 
RecordDate,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare,OperatingIncome_NetSales,OrdinaryIncome_NetSales,NetIncome_NetSales,NetSales_Growth,OperatingIncome_Growth,OrdinaryIncome_Growth,NetIncome_Growth,Forecast_NetSales_Growth,Forecast_OperatingIncome_Growth,Forecast_OrdinaryIncome_Growth,Forecast_NetIncome_Growth,Capital_Ratio,ROE,ROA,CF_Operating_pn,CF_Financing_pn,CF_Investing_pn,Dividend_Yeild,17_Sector
2016-08-05,1301,2610.0,2620.0,2590.0,2600.0,2600.0,8900.0,0.1,2610.0,2016/08/04,2610.0,2016/08/04,-10.0,-0.383,2606.067,-0.003837,-0.018868,0.0,-0.007634,0.010664,0.009811,0.015279,0.017334,0.015602,0.016786,0.014745,0.016258,0.020654,0.017166,0.01703,0.016979,-0.006116,-0.010956,-0.012408,-0.011321,-0.015038,-0.015038,0.0,0.0,0.007782,0.0,0.411765,0.011538,0.009721,1301,ConsolidatedJP,2016/06,Q1,2017.0,2016-08-05,...,101632.0,22995.0,,,,ConsolidatedJP,2016/09,Q2,2017.0,2016/08/05,GB,False,117000.0,1400.0,1300.0,800.0,2016/03,Annual,2016.0,2016/08/05,2016/03/31,2016/06/27,5.0,5.0,2017/03,Annual,2017.0,2016/08/05,2017/03/31,50.0,50.0,0.008945,0.007279,0.010554,0.0,0.0,0.0,0.0,1.241122,1.997859,2.421053,0.451906,0.226257,0.023962,0.005422,0.0,0.0,0.0,0.001923,1
2016-11-04,1301,2765.0,2767.0,2686.0,2697.0,2697.0,31500.0,1.0,2760.0,2016/11/02,2760.0,2016/11/02,-63.0,-2.283,2720.273,-0.023082,-0.025298,-0.013533,0.033333,0.009818,0.010405,0.009007,0.010922,0.010838,0.014654,0.008587,0.008973,0.010579,0.011228,0.013337,0.014684,-0.01913,-0.019815,0.004634,-0.003242,-0.004676,-0.006463,0.0,0.0,0.0,-0.001111,0.35443,0.030033,0.009464,1301,ConsolidatedJP,2016/09,Q2,2017.0,2016-11-04,...,106554.0,23600.0,,,,ConsolidatedJP,2017/03,Annual,2017.0,2016/11/04,GB,False,244000.0,3500.0,3300.0,2100.0,2016/03,Annual,2016.0,2016/11/04,2016/03/31,2016/06/27,5.0,5.0,2017/03,Annual,2017.0,2016/11/04,2017/03/31,50.0,50.0,0.010687,0.009163,0.010824,0.0,0.0,0.0,0.0,1.226887,1.988898,2.286853,0.770658,0.221484,0.050254,0.011131,0.0,0.0,0.0,0.001854,1
2017-02-17,1301,2825.0,2833.0,2811.0,2826.0,2826.0,26600.0,1.0,2825.0,2017/02/16,2825.0,2017/02/16,1.0,0.035,2824.395,0.000354,0.029508,0.038207,0.021323,0.007324,0.005317,0.007095,0.007151,0.008334,0.008305,0.007371,0.007306,0.00714,0.007479,0.008712,0.009493,0.005122,0.03092,0.041907,-0.002816,-0.002816,-0.002816,0.019956,0.034216,0.036504,0.009286,0.815068,0.007785,0.006161,1301,ConsolidatedJP,2016/12,Q3,2017.0,2017-02-10,...,117168.0,25779.0,,,,ConsolidatedJP,2017/03,Annual,2017.0,2017/02/17,GB,False,244000.0,3500.0,3300.0,2100.0,2016/03,Annual,2016.0,2017/02/17,2016/03/31,2016/06/27,5.0,5.0,2017/03,Annual,2017.0,2017/02/17,2017/03/31,60.0,60.0,0.015958,0.015708,0.013607,0.006065,0.164167,0.051711,1.161518,0.355744,0.218663,0.167315,-0.142507,0.220017,0.095,0.020902,0.0,0.0,0.0,0.001769,1


In [5]:
# Explanatory variables (model inputs), assembled group by group.
# The order of the final list fixes the column order of the feature matrix.
_hv_windows = (5, 10, 25, 50, 75, 100)

_technical_features = (
    ['log_R', 'return_5', 'return_25', 'return_75']
    + ['HV_{}'.format(w) for w in _hv_windows]        # historical volatility
    + ['MA20_HV{}'.format(w) for w in _hv_windows]    # 20-day moving average of each HV
    + ['MADR5', 'MADR25', 'MADR75',
       'MXDR5', 'MXDR10', 'MXDR20',
       'MNDR5', 'MNDR10', 'MNDR20',
       'RNDR', 'RSI', 'H-L_C', 'MA20_H-L_C']
)

_fundamental_features = [
    'OperatingIncome_NetSales', 'OrdinaryIncome_NetSales', 'NetIncome_NetSales',
    'NetSales_Growth', 'OperatingIncome_Growth', 'OrdinaryIncome_Growth', 'NetIncome_Growth',
    'Forecast_NetSales_Growth', 'Forecast_OperatingIncome_Growth',
    'Forecast_OrdinaryIncome_Growth', 'Forecast_NetIncome_Growth',
    'Capital_Ratio', 'ROE', 'ROA',
    'CF_Operating_pn', 'CF_Financing_pn', 'CF_Investing_pn',
    'Dividend_Yeild',  # spelling matches the column name in the data file
]

_other_features = ['17_Sector', 'EndOfDayQuote Volume', 'Result_FinancialStatement ReportType']

explain_variables = _technical_features + _fundamental_features + _other_features

print('説明変数の数:', len(explain_variables))

説明変数の数: 50


## 説明変数と目的変数の分析

In [6]:
# Build the feature frame X: 'Local Code' followed by the explanatory
# variables, with the date index turned into an ordinary 'base_date' column.
X_cols = ['Local Code'] + list(explain_variables)

X = (
    df_all_code_merged
    .loc[:, X_cols]
    # Name the index before resetting it so the new column is always called
    # 'base_date', whether or not the loaded index already carried a name.
    # This also avoids in-place edits on a selection of another frame.
    .rename_axis('base_date')
    .reset_index()
)

# One-hot encode the categorical columns.
category_cols = ['CF_Operating_pn', 'CF_Financing_pn', 'CF_Investing_pn', '17_Sector', 'Result_FinancialStatement ReportType']

X = pd.get_dummies(X, columns=category_cols)
X

Unnamed: 0,base_date,Local Code,log_R,return_5,return_25,return_75,HV_5,HV_10,HV_25,HV_50,HV_75,HV_100,MA20_HV5,MA20_HV10,MA20_HV25,MA20_HV50,MA20_HV75,MA20_HV100,MADR5,MADR25,MADR75,MXDR5,MXDR10,MXDR20,MNDR5,MNDR10,MNDR20,RNDR,RSI,H-L_C,MA20_H-L_C,OperatingIncome_NetSales,OrdinaryIncome_NetSales,NetIncome_NetSales,NetSales_Growth,OperatingIncome_Growth,OrdinaryIncome_Growth,NetIncome_Growth,Forecast_NetSales_Growth,Forecast_OperatingIncome_Growth,Forecast_OrdinaryIncome_Growth,Forecast_NetIncome_Growth,Capital_Ratio,ROE,ROA,Dividend_Yeild,EndOfDayQuote Volume,CF_Operating_pn_-1.0,CF_Operating_pn_0.0,CF_Operating_pn_1.0,CF_Financing_pn_-1.0,CF_Financing_pn_0.0,CF_Financing_pn_1.0,CF_Investing_pn_-1.0,CF_Investing_pn_0.0,CF_Investing_pn_1.0,17_Sector_1,17_Sector_2,17_Sector_3,17_Sector_4,17_Sector_5,17_Sector_6,17_Sector_7,17_Sector_8,17_Sector_9,17_Sector_10,17_Sector_11,17_Sector_12,17_Sector_13,17_Sector_14,17_Sector_15,17_Sector_16,17_Sector_17,Result_FinancialStatement ReportType_Annual,Result_FinancialStatement ReportType_Q1,Result_FinancialStatement ReportType_Q2,Result_FinancialStatement ReportType_Q3
0,2016-08-05,1301,-0.003837,-0.018868,0.000000,-0.007634,0.010664,0.009811,0.015279,0.017334,0.015602,0.016786,0.014745,0.016258,0.020654,0.017166,0.017030,0.016979,-0.006116,-0.010956,-0.012408,-0.011321,-0.015038,-0.015038,0.000000,0.000000,0.007782,0.000000,0.411765,0.011538,0.009721,0.008945,0.007279,0.010554,0.000000,0.000000,0.000000,0.000000,1.241122,1.997859,2.421053,0.451906,0.226257,0.023962,0.005422,0.001923,8900.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,2016-11-04,1301,-0.023082,-0.025298,-0.013533,0.033333,0.009818,0.010405,0.009007,0.010922,0.010838,0.014654,0.008587,0.008973,0.010579,0.011228,0.013337,0.014684,-0.019130,-0.019815,0.004634,-0.003242,-0.004676,-0.006463,0.000000,0.000000,0.000000,-0.001111,0.354430,0.030033,0.009464,0.010687,0.009163,0.010824,0.000000,0.000000,0.000000,0.000000,1.226887,1.988898,2.286853,0.770658,0.221484,0.050254,0.011131,0.001854,31500.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2017-02-17,1301,0.000354,0.029508,0.038207,0.021323,0.007324,0.005317,0.007095,0.007151,0.008334,0.008305,0.007371,0.007306,0.007140,0.007479,0.008712,0.009493,0.005122,0.030920,0.041907,-0.002816,-0.002816,-0.002816,0.019956,0.034216,0.036504,0.009286,0.815068,0.007785,0.006161,0.015958,0.015708,0.013607,0.006065,0.164167,0.051711,1.161518,0.355744,0.218663,0.167315,-0.142507,0.220017,0.095000,0.020902,0.001769,26600.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2017-05-11,1301,0.016579,0.035775,0.009967,0.110705,0.012836,0.011138,0.012920,0.014204,0.012303,0.011301,0.012153,0.013570,0.015501,0.013631,0.011806,0.010868,0.018153,0.036481,0.035992,0.000000,0.000000,0.000000,0.013988,0.022008,0.041710,0.013333,0.777778,0.022368,0.010969,0.015738,0.015679,0.010238,0.043839,0.530210,0.318053,0.346304,0.056810,0.074402,0.078458,0.114781,0.260712,0.095388,0.024869,0.019737,69900.0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,2017-08-04,1301,0.033142,0.030400,0.052288,0.111878,0.015734,0.012988,0.010828,0.012113,0.012486,0.013540,0.009493,0.009970,0.011270,0.012443,0.012819,0.013366,0.027441,0.034239,0.048915,0.000000,0.000000,0.000000,0.003236,0.004862,0.016393,0.006250,0.745763,0.040373,0.011913,0.017223,0.019404,0.013264,0.088840,1.096360,1.902632,0.368421,3.398002,3.085802,2.626473,2.580902,0.237940,0.029499,0.007019,0.018634,106000.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61435,2019-10-31,9997,-0.013967,0.020115,-0.004208,-0.054594,0.025591,0.019783,0.022239,0.025397,0.024801,0.025192,0.022580,0.023134,0.024189,0.026435,0.024576,0.025625,-0.000563,0.022937,0.047878,-0.005548,-0.005548,-0.008299,0.010072,0.057229,0.066869,0.014286,0.596491,0.021127,0.021655,0.044803,0.038227,0.019461,0.089198,0.003377,-0.501964,-0.603122,1.145624,2.624126,3.550971,5.257449,0.458051,0.016720,0.007659,0.011268,201900.0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
61436,2020-01-31,9997,-0.006349,-0.066964,-0.131579,-0.086006,0.021653,0.015882,0.019131,0.017553,0.018988,0.020270,0.018140,0.018649,0.019326,0.018735,0.019988,0.022161,-0.019086,-0.086379,-0.110991,-0.043609,-0.078261,-0.120332,0.001600,0.001600,0.001600,0.045000,0.063291,0.015949,0.016953,0.052872,0.053708,0.033392,0.030122,-0.140189,-0.346569,-0.359900,0.308349,0.512235,0.488699,0.523727,0.452265,0.044667,0.020201,0.012759,116300.0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
61437,2020-05-13,9997,-0.001951,0.042857,0.098925,-0.256186,0.039191,0.044421,0.046610,0.057053,0.048026,0.042651,0.046733,0.048331,0.060337,0.055089,0.045983,0.040768,0.009084,0.057708,-0.047187,-0.022857,-0.022857,-0.022857,0.043750,0.050314,0.096280,0.022000,0.603896,0.023483,0.032757,0.057300,0.057600,0.032576,0.012947,-0.141108,-0.322947,-0.433240,-0.567097,-1.058190,-1.009648,-1.010235,0.459490,0.057176,0.026272,0.015656,82800.0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
61438,2020-07-31,9997,-0.051960,0.012162,0.252508,0.553942,0.038806,0.039705,0.037120,0.036478,0.042413,0.048690,0.032722,0.035730,0.039866,0.041806,0.047035,0.049659,-0.033548,0.096055,0.274850,-0.007519,-0.007519,-0.007519,0.019048,0.060907,0.210016,0.070000,0.727642,0.057410,0.037302,0.050851,0.054992,0.030741,0.051544,0.264992,0.503662,0.340521,0.605060,-1.243112,-1.037467,-1.040214,0.450821,0.014374,0.006480,0.010681,338000.0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [7]:
# Derived targets, one set per label horizon.
_horizons = (5, 10, 20)

# Width of the range: high label minus low label.
for _n in _horizons:
    stock_labels['high_low_{}'.format(_n)] = (
        stock_labels['label_high_{}'.format(_n)] - stock_labels['label_low_{}'.format(_n)]
    )

# Midpoint of the range: mean of the high and low labels.
for _n in _horizons:
    stock_labels['center_{}'.format(_n)] = (
        stock_labels['label_high_{}'.format(_n)] + stock_labels['label_low_{}'.format(_n)]
    ) / 2

# Join features and labels on (base_date, Local Code); both sides need
# datetime base_date values for the keys to match.
stock_labels['base_date'] = pd.to_datetime(stock_labels['base_date'])
XY = pd.merge(X, stock_labels, on=['base_date', 'Local Code']).copy()

XY.head()

Unnamed: 0,base_date,Local Code,log_R,return_5,return_25,return_75,HV_5,HV_10,HV_25,HV_50,HV_75,HV_100,MA20_HV5,MA20_HV10,MA20_HV25,MA20_HV50,MA20_HV75,MA20_HV100,MADR5,MADR25,MADR75,MXDR5,MXDR10,MXDR20,MNDR5,MNDR10,MNDR20,RNDR,RSI,H-L_C,MA20_H-L_C,OperatingIncome_NetSales,OrdinaryIncome_NetSales,NetIncome_NetSales,NetSales_Growth,OperatingIncome_Growth,OrdinaryIncome_Growth,NetIncome_Growth,Forecast_NetSales_Growth,Forecast_OperatingIncome_Growth,Forecast_OrdinaryIncome_Growth,Forecast_NetIncome_Growth,Capital_Ratio,ROE,ROA,Dividend_Yeild,EndOfDayQuote Volume,CF_Operating_pn_-1.0,CF_Operating_pn_0.0,CF_Operating_pn_1.0,CF_Financing_pn_-1.0,CF_Financing_pn_0.0,CF_Financing_pn_1.0,CF_Investing_pn_-1.0,CF_Investing_pn_0.0,CF_Investing_pn_1.0,17_Sector_1,17_Sector_2,17_Sector_3,17_Sector_4,17_Sector_5,17_Sector_6,17_Sector_7,17_Sector_8,17_Sector_9,17_Sector_10,17_Sector_11,17_Sector_12,17_Sector_13,17_Sector_14,17_Sector_15,17_Sector_16,17_Sector_17,Result_FinancialStatement ReportType_Annual,Result_FinancialStatement ReportType_Q1,Result_FinancialStatement ReportType_Q2,Result_FinancialStatement ReportType_Q3,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20,high_low_5,high_low_10,high_low_20,center_5,center_10,center_20
0,2016-08-05,1301,-0.003837,-0.018868,0.0,-0.007634,0.010664,0.009811,0.015279,0.017334,0.015602,0.016786,0.014745,0.016258,0.020654,0.017166,0.01703,0.016979,-0.006116,-0.010956,-0.012408,-0.011321,-0.015038,-0.015038,0.0,0.0,0.007782,0.0,0.411765,0.011538,0.009721,0.008945,0.007279,0.010554,0.0,0.0,0.0,0.0,1.241122,1.997859,2.421053,0.451906,0.226257,0.023962,0.005422,0.001923,8900.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2016-08-15,0.02692,0.00385,2016-08-22,0.02692,0.0,2016-09-05,0.02692,0.0,0.02307,0.02692,0.02692,0.015385,0.01346,0.01346
1,2016-11-04,1301,-0.023082,-0.025298,-0.013533,0.033333,0.009818,0.010405,0.009007,0.010922,0.010838,0.014654,0.008587,0.008973,0.010579,0.011228,0.013337,0.014684,-0.01913,-0.019815,0.004634,-0.003242,-0.004676,-0.006463,0.0,0.0,0.0,-0.001111,0.35443,0.030033,0.009464,0.010687,0.009163,0.010824,0.0,0.0,0.0,0.0,1.226887,1.988898,2.286853,0.770658,0.221484,0.050254,0.011131,0.001854,31500.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2016-11-11,0.0,-0.03967,2016-11-18,0.0,-0.03967,2016-12-05,0.00779,-0.03967,0.03967,0.03967,0.04746,-0.019835,-0.019835,-0.01594
2,2017-02-17,1301,0.000354,0.029508,0.038207,0.021323,0.007324,0.005317,0.007095,0.007151,0.008334,0.008305,0.007371,0.007306,0.00714,0.007479,0.008712,0.009493,0.005122,0.03092,0.041907,-0.002816,-0.002816,-0.002816,0.019956,0.034216,0.036504,0.009286,0.815068,0.007785,0.006161,0.015958,0.015708,0.013607,0.006065,0.164167,0.051711,1.161518,0.355744,0.218663,0.167315,-0.142507,0.220017,0.095,0.020902,0.001769,26600.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2017-02-24,0.04246,0.00177,2017-03-03,0.07749,0.00177,2017-03-17,0.13588,0.00177,0.04069,0.07572,0.13411,0.022115,0.03963,0.068825
3,2017-05-11,1301,0.016579,0.035775,0.009967,0.110705,0.012836,0.011138,0.01292,0.014204,0.012303,0.011301,0.012153,0.01357,0.015501,0.013631,0.011806,0.010868,0.018153,0.036481,0.035992,0.0,0.0,0.0,0.013988,0.022008,0.04171,0.013333,0.777778,0.022368,0.010969,0.015738,0.015679,0.010238,0.043839,0.53021,0.318053,0.346304,0.05681,0.074402,0.078458,0.114781,0.260712,0.095388,0.024869,0.019737,69900.0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2017-05-18,0.0148,-0.02632,2017-05-25,0.04441,-0.02632,2017-06-08,0.04441,-0.02632,0.04112,0.07073,0.07073,-0.00576,0.009045,0.009045
4,2017-08-04,1301,0.033142,0.0304,0.052288,0.111878,0.015734,0.012988,0.010828,0.012113,0.012486,0.01354,0.009493,0.00997,0.01127,0.012443,0.012819,0.013366,0.027441,0.034239,0.048915,0.0,0.0,0.0,0.003236,0.004862,0.016393,0.00625,0.745763,0.040373,0.011913,0.017223,0.019404,0.013264,0.08884,1.09636,1.902632,0.368421,3.398002,3.085802,2.626473,2.580902,0.23794,0.029499,0.007019,0.018634,106000.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2017-08-14,0.01863,-0.01553,2017-08-21,0.04814,-0.01553,2017-09-04,0.0559,-0.01553,0.03416,0.06367,0.07143,0.00155,0.016305,0.020185


In [8]:
# Target columns: high/low labels per horizon, then the range widths,
# then the range midpoints.
_label_horizons = (5, 10, 20)
Y_cols = (
    ['label_{}_{}'.format(side, h) for h in _label_horizons for side in ('high', 'low')]
    + ['high_low_{}'.format(h) for h in _label_horizons]
    + ['center_{}'.format(h) for h in _label_horizons]
)

In [9]:
# XY.to_csv('../data/XY3.csv', index=False)

In [10]:
# Preview the first five rows of the merged feature/label table.
XY.head(n=5)

Unnamed: 0,base_date,Local Code,log_R,return_5,return_25,return_75,HV_5,HV_10,HV_25,HV_50,HV_75,HV_100,MA20_HV5,MA20_HV10,MA20_HV25,MA20_HV50,MA20_HV75,MA20_HV100,MADR5,MADR25,MADR75,MXDR5,MXDR10,MXDR20,MNDR5,MNDR10,MNDR20,RNDR,RSI,H-L_C,MA20_H-L_C,OperatingIncome_NetSales,OrdinaryIncome_NetSales,NetIncome_NetSales,NetSales_Growth,OperatingIncome_Growth,OrdinaryIncome_Growth,NetIncome_Growth,Forecast_NetSales_Growth,Forecast_OperatingIncome_Growth,Forecast_OrdinaryIncome_Growth,Forecast_NetIncome_Growth,Capital_Ratio,ROE,ROA,Dividend_Yeild,EndOfDayQuote Volume,CF_Operating_pn_-1.0,CF_Operating_pn_0.0,CF_Operating_pn_1.0,CF_Financing_pn_-1.0,CF_Financing_pn_0.0,CF_Financing_pn_1.0,CF_Investing_pn_-1.0,CF_Investing_pn_0.0,CF_Investing_pn_1.0,17_Sector_1,17_Sector_2,17_Sector_3,17_Sector_4,17_Sector_5,17_Sector_6,17_Sector_7,17_Sector_8,17_Sector_9,17_Sector_10,17_Sector_11,17_Sector_12,17_Sector_13,17_Sector_14,17_Sector_15,17_Sector_16,17_Sector_17,Result_FinancialStatement ReportType_Annual,Result_FinancialStatement ReportType_Q1,Result_FinancialStatement ReportType_Q2,Result_FinancialStatement ReportType_Q3,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20,high_low_5,high_low_10,high_low_20,center_5,center_10,center_20
0,2016-08-05,1301,-0.003837,-0.018868,0.0,-0.007634,0.010664,0.009811,0.015279,0.017334,0.015602,0.016786,0.014745,0.016258,0.020654,0.017166,0.01703,0.016979,-0.006116,-0.010956,-0.012408,-0.011321,-0.015038,-0.015038,0.0,0.0,0.007782,0.0,0.411765,0.011538,0.009721,0.008945,0.007279,0.010554,0.0,0.0,0.0,0.0,1.241122,1.997859,2.421053,0.451906,0.226257,0.023962,0.005422,0.001923,8900.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2016-08-15,0.02692,0.00385,2016-08-22,0.02692,0.0,2016-09-05,0.02692,0.0,0.02307,0.02692,0.02692,0.015385,0.01346,0.01346
1,2016-11-04,1301,-0.023082,-0.025298,-0.013533,0.033333,0.009818,0.010405,0.009007,0.010922,0.010838,0.014654,0.008587,0.008973,0.010579,0.011228,0.013337,0.014684,-0.01913,-0.019815,0.004634,-0.003242,-0.004676,-0.006463,0.0,0.0,0.0,-0.001111,0.35443,0.030033,0.009464,0.010687,0.009163,0.010824,0.0,0.0,0.0,0.0,1.226887,1.988898,2.286853,0.770658,0.221484,0.050254,0.011131,0.001854,31500.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2016-11-11,0.0,-0.03967,2016-11-18,0.0,-0.03967,2016-12-05,0.00779,-0.03967,0.03967,0.03967,0.04746,-0.019835,-0.019835,-0.01594
2,2017-02-17,1301,0.000354,0.029508,0.038207,0.021323,0.007324,0.005317,0.007095,0.007151,0.008334,0.008305,0.007371,0.007306,0.00714,0.007479,0.008712,0.009493,0.005122,0.03092,0.041907,-0.002816,-0.002816,-0.002816,0.019956,0.034216,0.036504,0.009286,0.815068,0.007785,0.006161,0.015958,0.015708,0.013607,0.006065,0.164167,0.051711,1.161518,0.355744,0.218663,0.167315,-0.142507,0.220017,0.095,0.020902,0.001769,26600.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2017-02-24,0.04246,0.00177,2017-03-03,0.07749,0.00177,2017-03-17,0.13588,0.00177,0.04069,0.07572,0.13411,0.022115,0.03963,0.068825
3,2017-05-11,1301,0.016579,0.035775,0.009967,0.110705,0.012836,0.011138,0.01292,0.014204,0.012303,0.011301,0.012153,0.01357,0.015501,0.013631,0.011806,0.010868,0.018153,0.036481,0.035992,0.0,0.0,0.0,0.013988,0.022008,0.04171,0.013333,0.777778,0.022368,0.010969,0.015738,0.015679,0.010238,0.043839,0.53021,0.318053,0.346304,0.05681,0.074402,0.078458,0.114781,0.260712,0.095388,0.024869,0.019737,69900.0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2017-05-18,0.0148,-0.02632,2017-05-25,0.04441,-0.02632,2017-06-08,0.04441,-0.02632,0.04112,0.07073,0.07073,-0.00576,0.009045,0.009045
4,2017-08-04,1301,0.033142,0.0304,0.052288,0.111878,0.015734,0.012988,0.010828,0.012113,0.012486,0.01354,0.009493,0.00997,0.01127,0.012443,0.012819,0.013366,0.027441,0.034239,0.048915,0.0,0.0,0.0,0.003236,0.004862,0.016393,0.00625,0.745763,0.040373,0.011913,0.017223,0.019404,0.013264,0.08884,1.09636,1.902632,0.368421,3.398002,3.085802,2.626473,2.580902,0.23794,0.029499,0.007019,0.018634,106000.0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2017-08-14,0.01863,-0.01553,2017-08-21,0.04814,-0.01553,2017-09-04,0.0559,-0.01553,0.03416,0.06367,0.07143,0.00155,0.016305,0.020185


In [11]:
# Replace every missing value with zero, then confirm that none remain.
XY = XY.fillna(0)
print('null num;', XY.isnull().sum().sum())

# First and last base_date covered by the merged table.
print('start:', XY['base_date'].min(), ' end:', XY['base_date'].max())

null num; 0
start: 2016-06-29 00:00:00  end: 2020-12-29 00:00:00


In [12]:
# Boundaries of the chronological split (ISO date strings):
#   train      : everything up to TRAIN_END
#   validation : VAL_START .. VAL_END
#   test       : TEST_START onwards
TRAIN_END, VAL_START, VAL_END, TEST_START = (
    "2017-11-30",
    "2018-01-01",
    "2018-12-01",
    "2019-01-01",
)

In [13]:
# Drop the identifier / label-date columns, which are neither features nor
# targets, and index the table by base_date. Both steps are guarded so the
# cell can be re-run without failing on columns that are already gone.
XY = XY.drop(columns=['Local Code', 'label_date_5', 'label_date_10', 'label_date_20'], errors='ignore')
if 'base_date' in XY.columns:
    XY = XY.set_index('base_date')

# Chronological split with boolean masks. The rows are ordered by code and
# then by date (see the displayed tables), so the DatetimeIndex is not
# monotonic; slicing such an index with date strings that are not exact index
# entries warns on pandas 1.x and raises KeyError on pandas 2.x. The masks
# keep the same inclusive semantics: the end bound covers the whole end day.
_dates = XY.index
_one_day = pd.Timedelta(days=1)
train_XY = XY.loc[_dates < pd.Timestamp(TRAIN_END) + _one_day]
val_XY = XY.loc[(_dates >= pd.Timestamp(VAL_START)) & (_dates < pd.Timestamp(VAL_END) + _one_day)]
test_XY = XY.loc[_dates >= pd.Timestamp(TEST_START)]

# Separate explanatory variables (X) from targets (Y) for each split.
train_X = train_XY.drop(columns=Y_cols).copy()
train_Y = train_XY.loc[:, Y_cols]
val_X = val_XY.drop(columns=Y_cols).copy()
val_Y = val_XY.loc[:, Y_cols]
test_X = test_XY.drop(columns=Y_cols).copy()
test_Y = test_XY.loc[:, Y_cols]

## モデル予測

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score, accuracy_score
from scipy.stats import spearmanr
import pickle, os

# Fixed seed for NumPy and for the estimators below.
SEED = 0
np.random.seed(SEED)

# Directory from which the pickled models are loaded.
save_dir = '../models/ML2'

# Estimators keyed by the model name used in the pickle file names.
models = dict(
    RandomForest=RandomForestRegressor(random_state=SEED),
    GradientBoosting=GradientBoostingRegressor(random_state=SEED),
)

In [15]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to all three splits.
sc = StandardScaler().fit(train_X)
train_X, val_X, test_X = (sc.transform(split) for split in (train_X, val_X, test_X))

In [16]:
# Containers for the validation / test scores.
scores_val, scores_test = {}, {}

# Restrict the prediction targets to the 20-day labels.
Y_cols = ['label_high_20', 'label_low_20', 'high_low_20', 'center_20']

# Predictions keyed by (label, model_name).
predict_val, predict_test = {}, {}

for model_name in models:
    for label in Y_cols:
        # Each model is stored as '<model_name>_<label>.pickle' under save_dir.
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only load model files from a trusted source.
        load_model_name = '{}_{}.pickle'.format(model_name, label)
        load_path = os.path.join(save_dir, load_model_name)
        with open(load_path, mode='rb') as fp:
            model = pickle.load(fp)

        # Predict on the validation split, then on the test split.
        predict_val[(label, model_name)] = model.predict(val_X)
        predict_test[(label, model_name)] = model.predict(test_X)

In [17]:
# Aggregate the validation predictions.
# Rows are target labels and columns are model names; each cell holds the
# prediction array that model produced for the validation split.
df_predict_val = pd.Series(predict_val).unstack()

# Alternative high/low estimates rebuilt from the predicted centre and width:
#   high = center + (high - low) / 2,   low = center - (high - low) / 2
gb_base_Center_H20 = df_predict_val['GradientBoosting']['center_20'] + df_predict_val['GradientBoosting']['high_low_20'] / 2
gb_base_Center_L20 = df_predict_val['GradientBoosting']['center_20'] - df_predict_val['GradientBoosting']['high_low_20'] / 2

rf_base_Center_H20 = df_predict_val['RandomForest']['center_20'] + df_predict_val['RandomForest']['high_low_20'] / 2
rf_base_Center_L20 = df_predict_val['RandomForest']['center_20'] - df_predict_val['RandomForest']['high_low_20'] / 2

# Only the high/low rows are scored; drop the helper rows.
df_predict_val = df_predict_val.drop(index=['center_20', 'high_low_20'])

# Add the rebuilt predictions as new columns by assigning a label-indexed
# Series in one step. Filling a None column cell by cell with
# df[col][row] = value is chained assignment: it warns on recent pandas and,
# with Copy-on-Write enabled, the write never reaches the frame.
df_predict_val['GB_base_Center'] = pd.Series(
    {'label_high_20': gb_base_Center_H20, 'label_low_20': gb_base_Center_L20})
df_predict_val['RF_base_Center'] = pd.Series(
    {'label_high_20': rf_base_Center_H20, 'label_low_20': rf_base_Center_L20})

# Blended models: plain averages of the member predictions.
df_predict_val['Brend_GBRF'] = (df_predict_val['GradientBoosting'] + df_predict_val['RandomForest']) / 2
df_predict_val['Brend_GB_Center'] = (df_predict_val['GradientBoosting'] + df_predict_val['GB_base_Center']) / 2
df_predict_val['Brend_RF_Center'] = (df_predict_val['RandomForest'] + df_predict_val['RF_base_Center']) / 2
df_predict_val['Brend_Center'] = (df_predict_val['GB_base_Center'] + df_predict_val['RF_base_Center']) / 2
df_predict_val['Brend_ALL'] = (df_predict_val['GradientBoosting'] + df_predict_val['GB_base_Center'] +
                               df_predict_val['RandomForest'] + df_predict_val['RF_base_Center']) / 4

# Score every (label, model) pair against the validation targets.
scores_val = {}

for model_name in df_predict_val.columns:
    for label in df_predict_val.index:
        y_true = val_Y[label]
        y_pred = df_predict_val.at[label, model_name]

        scores_val[(label, model_name, 'MSE')] = mean_squared_error(y_true, y_pred)
        scores_val[(label, model_name, 'MAE')] = mean_absolute_error(y_true, y_pred)
        scores_val[(label, model_name, 'MedAE')] = median_absolute_error(y_true, y_pred)
        scores_val[(label, model_name, 'R2')] = r2_score(y_true, y_pred)
        # Accuracy here is the share of samples whose predicted sign matches the true sign.
        scores_val[(label, model_name, 'Accuracy')] = accuracy_score(np.sign(y_true), np.sign(y_pred))
        scores_val[(label, model_name, 'Corr')] = np.corrcoef(y_true, y_pred)[0, 1]
        scores_val[(label, model_name, 'SpearmanCorr')] = spearmanr(y_true, y_pred)[0]

In [18]:
# Aggregate the test predictions.
# Rows are target labels and columns are model names; each cell holds the
# prediction array that model produced for the test split.
df_predict_test = pd.Series(predict_test).unstack()

# Alternative high/low estimates rebuilt from the predicted centre and width:
#   high = center + (high - low) / 2,   low = center - (high - low) / 2
gb_base_Center_H20 = df_predict_test['GradientBoosting']['center_20'] + df_predict_test['GradientBoosting']['high_low_20'] / 2
gb_base_Center_L20 = df_predict_test['GradientBoosting']['center_20'] - df_predict_test['GradientBoosting']['high_low_20'] / 2

rf_base_Center_H20 = df_predict_test['RandomForest']['center_20'] + df_predict_test['RandomForest']['high_low_20'] / 2
rf_base_Center_L20 = df_predict_test['RandomForest']['center_20'] - df_predict_test['RandomForest']['high_low_20'] / 2

# Only the high/low rows are scored; drop the helper rows.
df_predict_test = df_predict_test.drop(index=['center_20', 'high_low_20'])

# Add the rebuilt predictions as new columns by assigning a label-indexed
# Series in one step. Filling a None column cell by cell with
# df[col][row] = value is chained assignment: it warns on recent pandas and,
# with Copy-on-Write enabled, the write never reaches the frame.
df_predict_test['GB_base_Center'] = pd.Series(
    {'label_high_20': gb_base_Center_H20, 'label_low_20': gb_base_Center_L20})
df_predict_test['RF_base_Center'] = pd.Series(
    {'label_high_20': rf_base_Center_H20, 'label_low_20': rf_base_Center_L20})

# Blended models: plain averages of the member predictions.
df_predict_test['Brend_GBRF'] = (df_predict_test['GradientBoosting'] + df_predict_test['RandomForest']) / 2
df_predict_test['Brend_GB_Center'] = (df_predict_test['GradientBoosting'] + df_predict_test['GB_base_Center']) / 2
df_predict_test['Brend_RF_Center'] = (df_predict_test['RandomForest'] + df_predict_test['RF_base_Center']) / 2
df_predict_test['Brend_Center'] = (df_predict_test['GB_base_Center'] + df_predict_test['RF_base_Center']) / 2
df_predict_test['Brend_ALL'] = (df_predict_test['GradientBoosting'] + df_predict_test['GB_base_Center'] +
                                df_predict_test['RandomForest'] + df_predict_test['RF_base_Center']) / 4

# Score every (label, model) pair against the test targets.
scores_test = {}

for model_name in df_predict_test.columns:
    for label in df_predict_test.index:
        y_true = test_Y[label]
        y_pred = df_predict_test.at[label, model_name]

        scores_test[(label, model_name, 'MSE')] = mean_squared_error(y_true, y_pred)
        scores_test[(label, model_name, 'MAE')] = mean_absolute_error(y_true, y_pred)
        scores_test[(label, model_name, 'MedAE')] = median_absolute_error(y_true, y_pred)
        scores_test[(label, model_name, 'R2')] = r2_score(y_true, y_pred)
        # Accuracy here is the share of samples whose predicted sign matches the true sign.
        scores_test[(label, model_name, 'Accuracy')] = accuracy_score(np.sign(y_true), np.sign(y_pred))
        scores_test[(label, model_name, 'Corr')] = np.corrcoef(y_true, y_pred)[0, 1]
        scores_test[(label, model_name, 'SpearmanCorr')] = spearmanr(y_true, y_pred)[0]

In [19]:
# Pivot the (label, model, metric) scores so that each metric becomes a
# column, save the table, and display it.
result_val = pd.Series(scores_val).unstack(level=-1)
result_val.to_csv('../result/ML_result_val_Final.csv')
result_val

Unnamed: 0,Unnamed: 1,Accuracy,Corr,MAE,MSE,MedAE,R2,SpearmanCorr
label_high_20,Brend_ALL,0.777678,0.315848,0.085234,0.017017,0.060235,0.035244,0.310027
label_high_20,Brend_Center,0.777603,0.322199,0.084865,0.016881,0.059984,0.042917,0.311082
label_high_20,Brend_GBRF,0.777678,0.299928,0.085951,0.017377,0.060522,0.014824,0.303503
label_high_20,Brend_GB_Center,0.777603,0.294393,0.084054,0.01701,0.059414,0.035631,0.29073
label_high_20,Brend_RF_Center,0.777753,0.310008,0.087705,0.017712,0.061807,-0.004164,0.312493
label_high_20,GB_base_Center,0.777528,0.294136,0.084014,0.017048,0.059261,0.033463,0.294211
label_high_20,GradientBoosting,0.777678,0.278383,0.084444,0.017326,0.058989,0.017681,0.284357
label_high_20,RF_base_Center,0.777753,0.318568,0.087058,0.01745,0.061659,0.010695,0.311194
label_high_20,RandomForest,0.777753,0.286996,0.089223,0.018453,0.062423,-0.04621,0.301442
label_low_20,Brend_ALL,0.885611,0.354078,0.058945,0.00667,0.042172,0.026069,0.357682


In [20]:
# Pivot the (label, model, metric) scores so that each metric becomes a
# column, save the table, and display it.
result_test = pd.Series(scores_test).unstack(level=-1)
result_test.to_csv('../result/ML_result_test_Final.csv')
result_test

Unnamed: 0,Unnamed: 1,Accuracy,Corr,MAE,MSE,MedAE,R2,SpearmanCorr
label_high_20,Brend_ALL,0.815525,0.266071,0.093415,0.026411,0.06228,0.028921,0.306234
label_high_20,Brend_Center,0.815525,0.274706,0.093003,0.026075,0.062463,0.041288,0.305589
label_high_20,Brend_GBRF,0.815525,0.249966,0.094278,0.027115,0.062376,0.003034,0.301394
label_high_20,Brend_GB_Center,0.815454,0.240751,0.092133,0.02688,0.06079,0.011695,0.299274
label_high_20,Brend_RF_Center,0.815525,0.267592,0.096338,0.026967,0.064413,0.008493,0.296834
label_high_20,GB_base_Center,0.815348,0.254638,0.091725,0.026339,0.060813,0.03158,0.300062
label_high_20,GradientBoosting,0.815454,0.216065,0.092921,0.028114,0.060446,-0.033702,0.295687
label_high_20,RF_base_Center,0.81549,0.270228,0.095953,0.026788,0.064486,0.015052,0.294146
label_high_20,RandomForest,0.815525,0.253178,0.097822,0.027743,0.064819,-0.020057,0.286386
label_low_20,Brend_ALL,0.859857,0.137112,0.070202,0.011376,0.04374,-0.043543,0.16374


<メモ>
* 最高値の予測はBrend_ALLが良い
* 最安値はBrend_ALLかGradientBoostingのどちらか  
  ⇒汎化性能を考慮してBrend_ALLを採用する

DeepLearningモデルでは、同様に平均値予測と幅の予測もモデルに組み入れるとする