In [43]:
import pandas as pd
import numpy as np

In [44]:
# get high vertexs index and price, whose highest is higher than adjacent 4
# days highest price, whose lowest is higher than adjacent 4 days 
# lowest price

# get low vertexs index and price, whose low is loser than adjacent 4 days lowest
# price, whose high is lower than adjacent 4 days highest price 
def vertex(dates,lows,highs,closes):
    hvertexs = []
    lvertexs = []
    for i in range(5,len(highs)-5):
        if highs[i] == max(highs[i-4:i+4]):
            if lows[i] > min(lows[i-4:i]) and lows[i] > min(lows[i+1:i+5]):
                hvertexs.append([i,dates[i],highs[i]])
        if lows[i] == min(lows[i-4:i+4]):
            if highs[i] < max(highs[i-4:i]) and highs[i] < max(highs[i+1:i+5]):
                lvertexs.append([i,dates[i],lows[i]])
    return hvertexs,lvertexs

In [45]:
#define necessary indicators

# first time breakthrough short term wandering area 
def first_bt(data): #data is a dataframe including low, high, close prices
    rows_num = len(data)
    if rows_num < 30:   #first_bt indicator is only for record with min 30 days
        return data
    else:
        #get seperate lists of date, high, low and close prices        
        dates = data['date']
        lows = data['low']
        highs = data['high']
        closes = data['close']
        #add a column 'first breakthrough' for data and initilize value with False
        data['first breakthrough'] = 0
        hvertexs, lvertexs = vertex(dates,lows,highs,closes)
        #check each record, add indicator 'first breakthrough'
        for i in range(0,rows_num):
            today = dates[i]
        #this record must have highest high and close price in 10 days
            if i > 10 and highs[i] >= max(highs[i-10:i]) and closes[i] >= max(closes[i-10:i]):
                #find out previous index, previous 2nd high vertex must be 
                #exist and the price is higher than previous high vertex
                prevh_index = -1 
                for j in range(len(hvertexs)-1,-1,-1):
                    #strtoday = time.strptime(today,'%Y-%m-%d')
                    #if strtoday > time.strptime(hvertexs[j][0],'%Y-%m-%d'):
                    if today > hvertexs[j][1]:
                        if j > 0  and hvertexs[j][2] < hvertexs[j-1][2]:
                            prevh_date = hvertexs[j][1]
                            prevh_index = hvertexs[j][0]
                        break
                #high must be higher than all from previou high vertex to now
                if prevh_index > 10 and i > prevh_index + 3 and highs[i] >= max(highs[prevh_index:i]) and closes[i] >= max(closes[prevh_index:i]) and highs[prevh_index] > max(highs[prevh_index+1:i]):
                    data['first breakthrough'][i] = 1
    return data

In [46]:
# input rise percents, output the number of days price is rising within
# recent n days 
def maxpricevol(rise_percents,n,volumns):
    counts = []
    maxvol10 = []
    for i in range(n,len(volumns)):
        count = 0
        for j in range(i-n,i):
            if rise_percents[j] > 0:
                  count = count + 1
        counts.append(count)
        max10 = 0
        if volumns[i] > max(volumns[i-10:i]):
            max10=1
        maxvol10.append(max10)
    return counts, maxvol10

In [47]:
# for each day if buying with open price, whether the profit reaches 5% in 
# later 5 days and loss is less than 5%
def profit_and_fromlow(open, close, high, low):
    profits = [99999] * len(open)
    fromlow = [99999] * len(open)
    for i in range(0,len(open)-6):
        if i > 4:
            fromlow[i] = round((open[i]-min(low[i-4:i]))/open[i],5) 
        else:
            fromlow[i] = round((open[i]-min(low[:]))/open[i],5) 
                
        for j in range(i+1,len(open)):
            if low[j] < open[i] * 0.95:
                profits[i] = 0
                break
            else:
                if high[j] > open[i] * 1.05:
                    profits[i] = 1
                    break
        
    return profits,fromlow 

In [48]:
# combine all first break data from different stock files into 1 file

train_list = []
test_list = []

for stock_number in ['TSLA','APPL','GOOGL','FB','BABA']:
    
    
    #prepare input data for decision tree with necessary indicators
    strnum = str(stock_number)
    print ('for stock ' + strnum + ':')
    stocks = pd.read_csv(strnum + '.csv')
    stocks.rename(columns={'Unnamed: 0':'Date'},inplace=True)
    #print (stocks)
    #stockdata = stocks.ix[:,['Date', strnum +'_High',strnum +'_Low',strnum +'_Close',strnum +'_Open',strnum +'_Volumn']]
    
    # remove records with space value of price
    stockdata = stocks.dropna()
    if len(stockdata.index) < 800:
        continue
        
    #reset indexes
    stockdata.columns = ['date','open','high','low','close','volumn']
    stockdata.index = range(len(stockdata.index))

    #add first time up indicator
    stockdata = first_bt(stockdata)

    #add rise percentage indicator rise percent
    rise = stockdata.close.diff()
    stockdata['rise percent'] = round(rise/stockdata['close'],5)

    #add 20 recent days up indicator upcount_20
    if len(stockdata['rise percent']) > 21:
        upcount_20, maxvol_10 = maxpricevol(stockdata['rise percent'],20,stockdata['volumn'])
        stockdata['upcount 20'] = [0] * 20 + upcount_20
        stockdata['maxvol 10'] = [0] * 20 + maxvol_10
    # add profit percent indicator profit, open to previous lowest price percentage indicator from_low
    if len(stockdata['open']) > 6:
        profit_fromlow = profit_and_fromlow(stockdata['open'],stockdata['close'],stockdata['high'],stockdata['low'])
        stockdata['profit'],stockdata['fromlow'] = profit_fromlow
    # add breakprofit indicator profit
    stockdata['breakprofit'] = [0] * len(stockdata['profit'])
    stockdata['breakfromlow'] = [0] * len(stockdata['profit'])
    for i in range(0,len(stockdata['profit'])):
        if stockdata['first breakthrough'][i] == 1:
            if stockdata['profit'][i] == 1:
                stockdata['breakprofit'][i] = 1
            if stockdata['fromlow'][i] < 0.05:
                stockdata['breakfromlow'][i] = 1
    
    stockdata = stockdata.replace([' ', np.inf, -np.inf,99999], np.nan).ix[20:,:]
    stockdata.to_csv('new' + strnum + '.csv')
    data = pd.read_csv('new' + strnum + '.csv')
    data = data.dropna()

    firstbreakdata = data.loc[data['first breakthrough'].isin([1])]
    firstbreakdata['stocksymbol'] = stock_number

    if len(firstbreakdata) < 4:
        continue

    train_list.append(firstbreakdata.iloc[:-3,:])
    test_list.append(firstbreakdata.iloc[-3:,:])
   
total_train = pd.concat(train_list)
total_test = pd.concat(test_list)

total_train.to_csv('traindata_us.csv')
total_test.to_csv('testdata_us.csv')


for stock TSLA:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing

for stock APPL:
for stock GOOGL:
for stock FB:
for stock BABA:
