# 使用Gradient Boosting Regressor 來預測ETF價格

本筆記本使用 Gradient Boosting Regressor（GBR）來預測 18 檔 ETF 的價格，
每次預測未來一週各交易日的價格。

這裡的思路是以下模式：
1. 將每日的開盤、收盤、最高、最低、交易量建立各自的特徵
2. 根據這些特徵去預測隔天的開盤、收盤、最高、最低、交易量
3. 重複以上過程，逐日預測後續連續日期的價格

運作分四個區塊：
1. 讀取資料
2. 特徵工程
3. 用演算法進行學習
4. 執行預測

In [236]:
#import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline

#import machine learning model
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

#pandas output setting
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('expand_frame_repr', True)

Populating the interactive namespace from numpy and matplotlib


# Read the data

In [237]:
# Absolute location of the ETF price CSV on the author's machine.
tetfp_path = 'C:/Users/AdamChang/Documents/Documents/PFD/Porfolio/T-Brain ETF Compitition/TBrain_Round2_DataSet_20180331/tetfp.csv'

# Load the file as big5-encoded text. skiprows=1 drops the file's first line
# (presumably its own header row) and `names` supplies the column labels used
# by the rest of this notebook.
tetfp = pd.read_csv(
    tetfp_path,
    skiprows=1,
    names=['Code', 'Date', 'Company', 'Open', 'High', 'Low', 'Close', 'Volumn'],
    encoding='big5',
)

In [238]:
# Show the first five rows to check that the columns were parsed as expected.
tetfp.head()

Unnamed: 0,Code,Date,Company,Open,High,Low,Close,Volumn
0,50,20130102,元大台灣50,54.0,54.65,53.9,54.4,16487
1,50,20130103,元大台灣50,54.9,55.05,54.65,54.85,29020
2,50,20130104,元大台灣50,54.85,54.85,54.4,54.5,9837
3,50,20130107,元大台灣50,54.55,54.55,53.9,54.25,8910
4,50,20130108,元大台灣50,54.0,54.2,53.65,53.9,12507


# Data Preprocessing

In [240]:
def _parse_volume(value):
    """Convert one raw volume cell (e.g. '16,487' or 300.0) to a float."""
    return float(str(value).replace(',', '').lstrip())


def _iso_week(dates):
    """Return the week-of-year for a datetime Series on old and new pandas."""
    accessor = dates.dt
    if hasattr(accessor, 'isocalendar'):
        # Newer pandas: `.dt.week` is deprecated/removed, isocalendar() is
        # the replacement. It yields a nullable integer column, so fall back
        # to float when missing dates make a plain integer cast impossible.
        week = accessor.isocalendar().week
        return week.astype('float64' if week.isna().any() else 'int64')
    return accessor.week


def Preprocess(tetfp1):
    """Build the lagged feature table for a single ETF's price history.

    Parameters
    ----------
    tetfp1 : pandas.DataFrame
        Rows of one ETF in chronological order with the columns ``Code``,
        ``Date`` (``YYYYMMDD``), ``Open``, ``High``, ``Low``, ``Close`` and
        ``Volumn`` (numbers, or strings with thousands separators).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with the original columns
        plus:

        * calendar parts: ``Year``, ``Month``, ``Day``, ``Dayofwork``,
          ``Week``, ``Season``;
        * one-row lags ``<col>1DayShift`` of the five price/volume columns,
          ``Close2DayShift`` and ``Volumn2DayShift``;
        * ``Fluctuation`` / ``RiseOrDown``, ``YesterdayRange`` /
          ``YesterdayRangeRise``, ``VolumnFluctuation`` / ``VolumnRise``;
        * running ``HistoryMax`` / ``HistoryMin`` / ``HistoryRange`` /
          ``HistoryRangeRise`` of the lagged close;
        * 5/10/20/60/120-row moving averages of every lagged column and
          moving sums of the lagged volume.

        Gaps created by the shifts and rolling windows are back-filled.
        The categorical columns are returned with ``object`` dtype.
    """
    # Work on a copy so the caller's frame is not left half-modified.
    tetfp1 = tetfp1.copy()

    # Parse the date and keep the ETF code as a categorical (object) value.
    tetfp1['Date'] = pd.to_datetime(tetfp1['Date'], format='%Y%m%d')
    tetfp1['Code'] = tetfp1['Code'].astype('object')

    # Volume may arrive as text with thousands separators; normalise to float.
    tetfp1['Volumn'] = tetfp1['Volumn'].map(_parse_volume).astype('float64')

    # Calendar parts of the date.
    date_parts = tetfp1['Date'].dt
    tetfp1['Year'] = date_parts.year
    tetfp1['Month'] = date_parts.month
    tetfp1['Day'] = date_parts.day
    tetfp1['Dayofwork'] = date_parts.dayofweek
    tetfp1['Week'] = _iso_week(tetfp1['Date'])

    # Quarter of the year: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    tetfp1['Season'] = (tetfp1['Month'] - 1) // 3 + 1

    # Previous-row values: every feature below is built from these lags so
    # that a row never sees its own prices.
    for col in ('Open', 'High', 'Low', 'Close', 'Volumn'):
        tetfp1[col + '1DayShift'] = tetfp1[col].shift(1)

    # Change of the close between the two previous rows, and the previous
    # row's high-low range.
    tetfp1['Close2DayShift'] = tetfp1['Close'].shift(2)
    tetfp1['Fluctuation'] = tetfp1['Close1DayShift'] - tetfp1['Close2DayShift']
    tetfp1['YesterdayRange'] = tetfp1['High1DayShift'] - tetfp1['Low1DayShift']

    # The shifts leave NaN in the leading rows; fill them from the first
    # valid value that follows.
    tetfp1 = tetfp1.bfill()

    # Direction of the last close change: 1 up, 0 flat, -1 down.
    fluctuation = tetfp1['Fluctuation']
    tetfp1['RiseOrDown'] = np.where(
        fluctuation > 0, 1, np.where(fluctuation == 0, 0, -1))

    # Running extremes of the lagged close. ffill() carries the last extreme
    # across any missing value, as the row-by-row version did.
    prev_close = tetfp1['Close1DayShift']
    tetfp1['HistoryMax'] = prev_close.cummax().ffill()
    tetfp1['HistoryMin'] = prev_close.cummin().ffill()
    tetfp1['HistoryRange'] = tetfp1['HistoryMax'] - tetfp1['HistoryMin']

    # 1 where the historical range widened compared with the previous row.
    # Rows 0 and 1 hold the same back-filled value, so both stay 0.
    tetfp1['HistoryRangeRise'] = (
        tetfp1['HistoryRange'].diff() > 0).astype('int64')

    # 1 / -1 / 0 where the previous-row range grew / shrank / stayed equal.
    range_change = tetfp1['YesterdayRange'].diff()
    tetfp1['YesterdayRangeRise'] = (
        np.sign(range_change).fillna(0).astype('int64'))

    # Change of the volume between the two previous rows.
    tetfp1['Volumn2DayShift'] = tetfp1['Volumn'].shift(2)
    tetfp1['VolumnFluctuation'] = (
        tetfp1['Volumn1DayShift'] - tetfp1['Volumn2DayShift'])
    tetfp1 = tetfp1.bfill()

    # 1 when the volume rose, -1 otherwise (an unchanged volume counts as -1).
    tetfp1['VolumnRise'] = np.where(tetfp1['VolumnFluctuation'] > 0, 1, -1)

    # Moving averages of every lagged column, then moving sums of the lagged
    # volume. Series.rolling replaces the removed pd.rolling_mean/rolling_sum.
    windows = (5, 10, 20, 60, 120)
    lagged_cols = ('Open1DayShift', 'High1DayShift', 'Low1DayShift',
                   'Close1DayShift', 'Volumn1DayShift')
    for col in lagged_cols:
        for window in windows:
            tetfp1['%sMA%d' % (col, window)] = (
                tetfp1[col].rolling(window=window).mean())
    for window in windows:
        tetfp1['Volumn1DayShiftMS%d' % window] = (
            tetfp1['Volumn1DayShift'].rolling(window=window).sum())

    # The first window-1 rows of each rolling column are NaN; back-fill them.
    tetfp1 = tetfp1.bfill()

    # Mark the categorical columns as object dtype.
    categorical_cols = ('Code', 'Month', 'Dayofwork', 'Season', 'RiseOrDown',
                        'HistoryRangeRise', 'YesterdayRangeRise', 'VolumnRise')
    for col in categorical_cols:
        tetfp1[col] = tetfp1[col].astype('object')

    return tetfp1

# Machine Learning

In [241]:
def learning(tetfp1):
    """Fit one gradient-boosting model per target and predict the last row.

    Parameters
    ----------
    tetfp1 : pandas.DataFrame
        Feature table holding the five target columns and every feature
        column listed in ``feature_cols`` below. The final row is the one to
        predict; all earlier rows are candidates for training.

    Returns
    -------
    tuple
        ``(OpenPrice, HighPrice, LowPrice, ClosePrice, Volumn)``; each item
        is the array returned by ``GradientBoostingRegressor.predict`` for
        the single final row.
    """
    #define prediction list
    keylst = ['Open','High','Low','Close','Volumn']

    # Feature columns shared by all five models.
    feature_cols = [
        'Month', 'Dayofwork', 'Season', 'Open1DayShift', 'High1DayShift',
        'Low1DayShift', 'Close1DayShift', 'Volumn1DayShift',
        'Close2DayShift', 'Fluctuation', 'YesterdayRange', 'RiseOrDown',
        'HistoryMax', 'HistoryMin', 'HistoryRange', 'HistoryRangeRise',
        'YesterdayRangeRise', 'Volumn2DayShift', 'VolumnFluctuation',
        'VolumnRise', 'Open1DayShiftMA5', 'Open1DayShiftMA10',
        'Open1DayShiftMA20', 'Open1DayShiftMA60', 'Open1DayShiftMA120',
        'High1DayShiftMA5', 'High1DayShiftMA10', 'High1DayShiftMA20',
        'High1DayShiftMA60', 'High1DayShiftMA120', 'Low1DayShiftMA5',
        'Low1DayShiftMA10', 'Low1DayShiftMA20', 'Low1DayShiftMA60',
        'Low1DayShiftMA120', 'Close1DayShiftMA5', 'Close1DayShiftMA10',
        'Close1DayShiftMA20', 'Close1DayShiftMA60', 'Close1DayShiftMA120',
        'Volumn1DayShiftMA5', 'Volumn1DayShiftMA10', 'Volumn1DayShiftMA20',
        'Volumn1DayShiftMA60', 'Volumn1DayShiftMA120', 'Volumn1DayShiftMS5',
        'Volumn1DayShiftMS10', 'Volumn1DayShiftMS20', 'Volumn1DayShiftMS60',
        'Volumn1DayShiftMS120',
    ]

    # One-hot encode the object-dtype columns. The encoded matrix is the same
    # for every target, so it is built once instead of once per loop pass.
    variable_dummy = pd.get_dummies(tetfp1[feature_cols])

    # All rows but the last are training candidates; the last is predicted.
    x = variable_dummy[:-1]
    px = variable_dummy[-1:]

    predictions = {}
    for key in keylst:
        responce = tetfp1[key][:-1]

        # NOTE(review): the held-out part of the split is never scored, so
        # the model is simply fitted on a random subset of the rows, and the
        # split is unseeded so results differ between runs. Kept as-is to
        # preserve the existing behaviour.
        xtrain, _, ytrain, _ = train_test_split(x, responce)

        gbr = GradientBoostingRegressor().fit(xtrain, ytrain)
        predictions[key] = gbr.predict(px)

    return (predictions['Open'], predictions['High'], predictions['Low'],
            predictions['Close'], predictions['Volumn'])

# Main Function

In [243]:
# Show the first five rows of the per-ETF frame `df`.
# NOTE(review): `df` is only assigned inside the forecasting loop further down,
# so this cell presumably shows the frame left over from an earlier run.
df.head()

Unnamed: 0,index,Code,Date,Company,Open,High,Low,Close,Volumn
0,16940,6208,20130102,富邦台50,30.78,31.16,30.77,31.02,79
1,16941,6208,20130103,富邦台50,31.29,31.3,31.21,31.28,61
2,16942,6208,20130104,富邦台50,31.25,31.25,31.03,31.03,48
3,16943,6208,20130107,富邦台50,30.85,30.95,30.78,30.95,17
4,16944,6208,20130108,富邦台50,30.75,30.77,30.6,30.73,15


In [247]:
# ETF codes (with their row counts) and the dates to forecast.
CodeData = tetfp['Code'].value_counts()
Codelst = [list(CodeData.index)]
Datelst =['20180430','20180502','20180503','20180504']

# Forecast each ETF independently.
for CodeKey in Codelst[0]:
    # History of this ETF, re-indexed 0..n-1 (the old index is kept as a column).
    df = tetfp[tetfp.Code == CodeKey].copy().reset_index()

    # Dates appended so far, and the volume values used from the second step on.
    Datelst2 = []
    Vol = []

    # Each list starts with the last observed value and grows by one forecast
    # per step. Indexlst starts at the ETF's row count, i.e. the first label
    # after the re-indexed history.
    last_row = df.index.max()
    Indexlst = [CodeData.loc[CodeKey]]
    Openlst = [df.loc[last_row,'Open']]
    Highlst = [df.loc[last_row,'High']]
    Lowlst = [df.loc[last_row,'Low']]
    Closelst = [df.loc[last_row,'Close']]
    Volumnlst = [str(df.loc[last_row,'Volumn'])]

    # Predict one date per step and feed the result into the next step.
    for i, date in enumerate(Datelst):
        Datelst2.append(date)

        # Rows to append: one per date so far. The single-element 'Code' and
        # 'Company' lists rely on pandas repeating them to the index length.
        # The first step uses the raw last volume; later steps use Vol, which
        # holds the parsed last volume followed by the forecasts.
        #
        # NOTE(review): row k pairs Datelst[k] with Openlst[k], and Openlst[0]
        # is the last observed value, so the forecast for Datelst[k] ends up
        # on the row dated Datelst[k+1]. This looks like an off-by-one in the
        # recursive feed - confirm before relying on the multi-step results.
        new = pd.DataFrame({'index':Indexlst,
                            'Code':[CodeKey],
                            'Date':Datelst2,
                            'Company':['predict'],
                            'Open':Openlst,
                            'High':Highlst,
                            'Low':Lowlst,
                            'Close':Closelst,
                            'Volumn':Volumnlst if i == 0 else Vol},
                           index=Indexlst)

        # DataFrame.append no longer exists in current pandas; concat is the
        # equivalent and also works on older versions.
        tetfp_run = pd.concat([df, new])

        #data preprocessing
        data = Preprocess(tetfp_run)
        if i == 0:
            # Seed Vol with the parsed volume of the appended row.
            Vol.append(str(data.loc[max(data.index),'Volumn']))

        #start learning
        O, H, L, C, V = learning(data)

        #store the output
        Indexlst.append(Indexlst[i]+1)
        Openlst.append(O[0])
        Highlst.append(H[0])
        Lowlst.append(L[0])
        Closelst.append(C[0])
        Vol.append(V[0])

    print('Code =',CodeKey)
    print('Close price is:',Closelst)

	Series.rolling(center=False,window=5).mean()
	Series.rolling(center=False,window=10).mean()
	Series.rolling(center=False,window=20).mean()
	Series.rolling(center=False,window=60).mean()
	Series.rolling(center=False,window=120).mean()
	Series.rolling(center=False,window=5).mean()
	Series.rolling(center=False,window=10).mean()
	Series.rolling(center=False,window=20).mean()
	Series.rolling(center=False,window=60).mean()
	Series.rolling(center=False,window=120).mean()
	Series.rolling(center=False,window=5).mean()
	Series.rolling(center=False,window=10).mean()
	Series.rolling(center=False,window=20).mean()
	Series.rolling(center=False,window=60).mean()
	Series.rolling(center=False,window=120).mean()
	Series.rolling(center=False,window=5).mean()
	Series.rolling(center=False,window=10).mean()
	Series.rolling(center=False,window=20).mean()
	Series.rolling(center=False,window=60).mean()
	Series.rolling(center=False,window=120).mean()
	Series.rolling(center=False,window=5).mean()
	Series.rollin

Code = 56
Close price is: [25.149999999999999, 25.197905531768576, 25.105450261746576, 25.157972571518854, 25.118924527116619]
Code = 6204
Close price is: [52.100000000000001, 52.16513082512899, 52.276534499656549, 52.216842881413967, 52.247126799774293]
Code = 51
Close price is: [32.109999999999999, 31.830856829046297, 32.085822053738788, 31.867167776633554, 31.968137555730401]
Code = 6208
Close price is: [46.0, 45.913084894532425, 46.080591697817695, 45.898887957994212, 46.156849075038501]
Code = 52
Close price is: [53.200000000000003, 53.601715536822802, 53.758953255389528, 53.823450675283937, 54.148436177470103]
Code = 53
Close price is: [34.200000000000003, 34.967027898807203, 34.182755872996523, 34.932740590145883, 34.435494641041082]
Code = 54
Close price is: [23.09, 23.12867931009686, 23.189181886393659, 23.192880481768562, 23.189874014988366]
Code = 55
Close price is: [17.039999999999999, 16.988064072621512, 16.958937122945226, 16.879376967618185, 16.944903898215927]
Code = 62