In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn import (metrics, cross_validation, linear_model, preprocessing)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
SEED = 42
def cv_loop_acc(X, y, model, N):
    mean_acc = 0.0
    for i in range(0,N,1):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=1.0/float(N), 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict(X_cv)
        acc = metrics.accuracy_score(y_cv, preds)
        #print acc
        mean_acc += acc
    return mean_acc/N

In [4]:
SEED = 42
def cv_loop_auc(X, y, model, N):
    mean_auc = 0.0
    for i in range(0,N,1):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                                       X, y, test_size=1.0/float(N), 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:,1]
        auc = metrics.roc_auc_score(y_cv, preds)
        #print auc
        mean_auc += auc
    return mean_auc/N

In [32]:
SEED = 42
def latest_day_loop_acc(X_train,y_train,X_test,y_test,model):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = metrics.accuracy_score(y_test, preds)
    return acc

In [5]:
def read_csv(day_trade):
    data_up = []
    data_down = []
    path = '/home/bigdatas16/SGX-OrderBook-Tick-Data-Trading-Strategy-/Train_Test_Data/ML_data_2014'
    for j,i in enumerate(day_trade):
        for k in range(0,len(i),1):
            path_up = path + '_' + str(j+1) + '_' + str(i[k]) + '_' + 'UP' + '.csv'
            path_down = path + '_' + str(j+1) + '_' + str(i[k]) + '_' + 'DOWN' + '.csv'
            data_up.append(pd.read_csv(path_up))
            data_down.append(pd.read_csv(path_down))
            #print path_down
    return data_up,data_down

In [6]:
day_trade = [[2,3,6,7,8,10,13,14,15,16,17,20,21,22,23,24,27,28,29,30],\
             [7,10,11,12,13,17,18,19,21,24,25,26,27],\
             [3,4,5,6,7,10,11,13,14,17,18,19,20,24,25,26,27,31]]

In [7]:
data_2014_up, data_2014_down = read_csv(day_trade)

In [39]:
mean_five_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-4,1):
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1],\
                           data_2014_up[i+2]],axis = 0).reset_index(drop=True)
    data_train = data_train
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = pd.concat([data_2014_up[i+3]],axis = 0).reset_index(drop=True)
    data_test = data_test
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_five_day.append(acc)
np.mean(mean_five_day),np.std(mean_five_day)

0.697888888889
0.649666666667
0.646666666667
0.487333333333
0.650555555556
0.517222222222
0.547333333333
0.555444444444
0.525777777778
0.473444444444
0.544888888889
0.505111111111
0.541111111111
0.434111111111
0.538111111111
0.428666666667
0.503777777778
0.486222222222
0.550222222222
0.511666666667
0.571222222222
0.511444444444
0.372555555556
0.467777777778
0.524888888889
0.617777777778
0.504
0.570777777778
0.446222222222
0.543888888889
0.5
0.471222222222
0.500555555556
0.601222222222
0.630777777778
0.445666666667
0.584444444444
0.482333333333
0.503555555556
0.431
0.486222222222
0.660666666667
0.520111111111
0.523111111111
0.479555555556
0.537777777778


(0.52791304347826085, 0.068121882942348999)

In [35]:
mean_four_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-3,1):
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]],\
                           axis = 0).reset_index(drop=True)
    data_train = data_train
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = pd.concat([data_2014_up[i+2]],axis = 0).reset_index(drop=True)
    data_test = data_test
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_four_day.append(acc)
np.mean(mean_four_day),np.std(mean_four_day)

(0.53753427895981087, 0.067884145649198563)

In [36]:
mean_three_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-2,1):
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i]],\
                           axis = 0).reset_index(drop=True)
    data_train = data_train
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = pd.concat([data_2014_up[i+1]],axis = 0).reset_index(drop=True)
    data_test = data_test
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_three_day.append(acc)
np.mean(mean_three_day),np.std(mean_three_day)

0.660111111111
0.59
0.707222222222
0.594333333333
0.644111111111
0.467111111111
0.628555555556
0.506333333333
0.552777777778
0.499111111111
0.564888888889
0.508888888889
0.468333333333
0.513444444444
0.515444444444
0.409333333333
0.479444444444
0.525222222222
0.558333333333
0.489444444444
0.579444444444
0.556222222222
0.560444444444
0.498333333333
0.472111111111
0.304
0.562444444444
0.525222222222
0.528222222222
0.528666666667
0.586222222222
0.596222222222
0.500666666667
0.569888888889
0.514777777778
0.562555555556
0.529333333333
0.443777777778
0.594333333333
0.574888888889
0.503666666667
0.549555555556
0.576222222222
0.492888888889
0.518222222222
0.531777777778
0.551
0.547444444444


(0.53627083333333336, 0.064448474914871962)

In [37]:
mean_two_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-1,1):
    data_train = pd.concat([data_2014_up[i-1]],axis = 0).reset_index(drop=True)
    data_train = data_train
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = pd.concat([data_2014_up[i]],axis = 0).reset_index(drop=True)
    data_test = data_test
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_two_day.append(acc)
np.mean(mean_two_day),np.std(mean_two_day)

0.649111111111
0.613888888889
0.580888888889
0.632222222222
0.595444444444
0.652333333333
0.466
0.609222222222
0.587111111111
0.560222222222
0.508
0.561444444444
0.481
0.546333333333
0.432888888889
0.546666666667
0.463555555556
0.482333333333
0.452777777778
0.527222222222
0.463666666667
0.744888888889
0.550888888889
0.482333333333
0.461111111111
0.570444444444
0.502222222222
0.456333333333
0.629222222222
0.5
0.400333333333
0.598111111111
0.551333333333
0.539666666667
0.531666666667
0.481111111111
0.574111111111
0.508777777778
0.432333333333
0.576
0.452
0.513888888889
0.541
0.522555555556
0.523111111111
0.536111111111
0.561888888889
0.498111111111
0.578111111111


(0.53530612244897957, 0.066883338491784516)

In [54]:
y_train

170    1.0
171    1.0
172    1.0
173    1.0
174    1.0
175    1.0
176    1.0
177    1.0
178    1.0
179    1.0
180    1.0
181    1.0
182    1.0
183    1.0
184    1.0
185    1.0
186    1.0
187    1.0
188    1.0
189    1.0
190    1.0
191    1.0
192    1.0
193    1.0
194    1.0
195    1.0
196    1.0
197    1.0
198    1.0
199    1.0
      ... 
440    1.0
441    1.0
442    1.0
443    1.0
444    1.0
445    1.0
446    1.0
447    1.0
448    1.0
449    1.0
450    1.0
451    1.0
452    1.0
453    1.0
454    1.0
455    1.0
456    1.0
457    1.0
458    1.0
459    1.0
460    1.0
461    1.0
462    1.0
463    1.0
464    1.0
465    1.0
466    1.0
467    1.0
468    1.0
469    1.0
Name: 0, dtype: float64

In [59]:
mean_min_day = []
i = 0
for i in range(0,7200,10):
    print i
    data_train = data_2014_up[0][i:i+1800]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[0][i+1800:i+1800+5]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_min_day.append(acc)
np.mean(mean_min_day),np.std(mean_min_day)

0
1.0
10
1.0
20
1.0
30
1.0
40
1.0
50
1.0
60
1.0
70
1.0
80
1.0
90
1.0
100
1.0
110
1.0
120
1.0
130
1.0
140
1.0
150
1.0
160
1.0
170
1.0
180
1.0
190
1.0
200
1.0
210
1.0
220
1.0
230
1.0
240
1.0
250
1.0
260
1.0
270
1.0
280
1.0
290
1.0
300
1.0
310
1.0
320
1.0
330
1.0
340
1.0
350
1.0
360
1.0
370
1.0
380
1.0
390
1.0
400
1.0
410
0.6
420
1.0
430
1.0
440
1.0
450
1.0
460
0.0
470
0.0
480
0.4
490
1.0
500
1.0
510
1.0
520
1.0
530
1.0
540
0.0
550
0.2
560
0.8
570
0.0
580
1.0
590
1.0
600
1.0
610
0.4
620
0.4
630
1.0
640
1.0
650
1.0
660
1.0
670
1.0
680
1.0
690
1.0
700
1.0
710
1.0
720
1.0
730
1.0
740
1.0
750
1.0
760
1.0
770
1.0
780
1.0
790
1.0
800
1.0
810
1.0
820
1.0
830
1.0
840
1.0
850
1.0
860
1.0
870
1.0
880
1.0
890
1.0
900
1.0
910
1.0
920
1.0
930
1.0
940
1.0
950
1.0
960
0.2
970
0.0
980
0.2
990
0.0
1000
0.0
1010
0.0
1020
0.0
1030
0.0
1040
0.0
1050
0.4
1060
0.0
1070
0.0
1080
0.0
1090
0.4
1100
1.0
1110
1.0
1120
0.0
1130
0.0
1140
0.0
1150
0.0
1160
0.0
1170
0.0
1180
0.0
1190
0.0
1200
0.0
1210
0.0
1220
0.0
1230

(0.79555555555555568, 0.37680266309247368)

In [62]:
mean_min_day = []
i = 0
for i in range(0,9000-3600-10,10):
    print i
    data_train = data_2014_up[0][i:i+3600]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[0][i+3600:i+3600+10]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_min_day.append(acc)
np.mean(mean_min_day),np.std(mean_min_day)

0
0.4
10
0.5
20
0.5
30
1.0
40
1.0
50
0.9
60
1.0
70
0.5
80
0.9
90
1.0
100
1.0
110
0.5
120
0.8
130
1.0
140
1.0
150
1.0
160
1.0
170
1.0
180
1.0
190
1.0
200
1.0
210
1.0
220
1.0
230
1.0
240
1.0
250
1.0
260
1.0
270
1.0
280
1.0
290
1.0
300
1.0
310
1.0
320
1.0
330
1.0
340
1.0
350
1.0
360
1.0
370
1.0
380
1.0
390
1.0
400
1.0
410
1.0
420
1.0
430
1.0
440
1.0
450
1.0
460
1.0
470
1.0
480
1.0
490
1.0
500
0.4
510
1.0
520
1.0
530
1.0
540
1.0
550
1.0
560
1.0
570
0.8
580
1.0
590
1.0
600
1.0
610
0.0
620
0.0
630
0.0
640
1.0
650
0.5
660
0.1
670
0.8
680
0.3
690
0.9
700
0.7
710
0.7
720
0.5
730
0.3
740
0.8
750
0.2
760
0.0
770
0.4
780
0.8
790
0.4
800
0.4
810
1.0
820
1.0
830
1.0
840
1.0
850
1.0
860
1.0
870
1.0
880
1.0
890
1.0
900
0.9
910
0.3
920
1.0
930
1.0
940
1.0
950
1.0
960
1.0
970
1.0
980
1.0
990
1.0
1000
0.9
1010
1.0
1020
1.0
1030
1.0
1040
1.0
1050
1.0
1060
1.0
1070
0.0
1080
0.4
1090
1.0
1100
0.5
1110
0.2
1120
0.0
1130
0.5
1140
0.7
1150
0.0
1160
0.0
1170
0.3
1180
0.0
1190
0.0
1200
0.0
1210
0.0
1220
0.0
1230

(0.68311688311688323, 0.42951490011187549)

In [63]:
mean_min_day = []
latest_min = 60 * 30
pred_sec = 10
for i in range(0,9000-latest_min-pred_sec,pred_sec):
    print i
    data_train = data_2014_up[0][i:i+latest_min]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[0][i+latest_min:i+latest_min+pred_sec]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_min_day.append(acc)
np.mean(mean_min_day),np.std(mean_min_day)

0
1.0
10
1.0
20
1.0
30
1.0
40
1.0
50
1.0
60
1.0
70
1.0
80
1.0
90
1.0
100
1.0
110
1.0
120
1.0
130
1.0
140
1.0
150
1.0
160
1.0
170
1.0
180
1.0
190
1.0
200
1.0
210
1.0
220
1.0
230
1.0
240
1.0
250
1.0
260
1.0
270
1.0
280
1.0
290
1.0
300
1.0
310
1.0
320
1.0
330
1.0
340
1.0
350
1.0
360
1.0
370
1.0
380
1.0
390
1.0
400
1.0
410
0.8
420
1.0
430
1.0
440
1.0
450
0.8
460
0.0
470
0.0
480
0.7
490
1.0
500
1.0
510
1.0
520
1.0
530
0.6
540
0.0
550
0.6
560
0.5
570
0.2
580
1.0
590
0.6
600
1.0
610
0.2
620
0.2
630
1.0
640
1.0
650
1.0
660
1.0
670
1.0
680
1.0
690
1.0
700
1.0
710
1.0
720
1.0
730
1.0
740
1.0
750
1.0
760
1.0
770
1.0
780
1.0
790
1.0
800
1.0
810
1.0
820
1.0
830
1.0
840
1.0
850
1.0
860
1.0
870
1.0
880
1.0
890
1.0
900
1.0
910
1.0
920
1.0
930
1.0
940
1.0
950
1.0
960
0.2
970
0.5
980
0.1
990
0.0
1000
0.0
1010
0.0
1020
0.0
1030
0.0
1040
0.1
1050
0.2
1060
0.0
1070
0.0
1080
0.0
1090
0.7
1100
1.0
1110
0.6
1120
0.0
1130
0.0
1140
0.0
1150
0.0
1160
0.0
1170
0.0
1180
0.0
1190
0.0
1200
0.0
1210
0.0
1220
0.0
1230

(0.78525730180806663, 0.36904516761898992)

In [64]:
mean_min_day = []
latest_min = 60 * 40
pred_sec = 20
for i in range(0,9000-latest_min-pred_sec,pred_sec):
    print i
    data_train = data_2014_up[0][i:i+latest_min]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[0][i+latest_min:i+latest_min+pred_sec]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_min_day.append(acc)
np.mean(mean_min_day),np.std(mean_min_day)

0
0.6
20
0.45
40
1.0
60
1.0
80
1.0
100
1.0
120
1.0
140
1.0
160
1.0
180
1.0
200
1.0
220
1.0
240
1.0
260
1.0
280
1.0
300
1.0
320
1.0
340
0.85
360
0.3
380
0.05
400
0.0
420
0.0
440
0.05
460
0.0
480
0.0
500
0.4
520
0.0
540
0.0
560
0.0
580
0.0
600
0.0
620
0.0
640
0.25
660
1.0
680
0.75
700
0.7
720
0.5
740
1.0
760
1.0
780
1.0
800
1.0
820
1.0
840
0.55
860
0.3
880
0.45
900
0.0
920
0.25
940
1.0
960
0.3
980
0.0
1000
0.85
1020
0.85
1040
0.8
1060
1.0
1080
0.4
1100
0.8
1120
1.0
1140
1.0
1160
1.0
1180
1.0
1200
1.0
1220
1.0
1240
1.0
1260
1.0
1280
1.0
1300
1.0
1320
1.0
1340
1.0
1360
1.0
1380
1.0
1400
1.0
1420
1.0
1440
1.0
1460
1.0
1480
1.0
1500
1.0
1520
1.0
1540
1.0
1560
1.0
1580
1.0
1600
1.0
1620
1.0
1640
1.0
1660
1.0
1680
1.0
1700
1.0
1720
1.0
1740
1.0
1760
1.0
1780
1.0
1800
0.9
1820
1.0
1840
0.3
1860
1.0
1880
1.0
1900
0.9
1920
0.95
1940
1.0
1960
0.8
1980
0.9
2000
0.5
2020
0.85
2040
0.5
2060
0.75
2080
1.0
2100
1.0
2120
0.8
2140
1.0
2160
1.0
2180
1.0
2200
0.95
2220
1.0
2240
1.0
2260
0.3
2280
0.6
2300
0

(0.71398176291793314, 0.39695146775457985)

In [67]:
mean_min_day = []
latest_min = 60 * 40
pred_sec = 5
for i in range(0,9000-latest_min-pred_sec,pred_sec):
    print i
    data_train = data_2014_up[0][i:i+latest_min]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[0][i+latest_min:i+latest_min+pred_sec]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_min_day.append(acc)
np.mean(mean_min_day),np.std(mean_min_day)

0
1.0
5
1.0
10
0.4
15
0.0
20
0.4
25
0.0
30
0.8
35
1.0
40
1.0
45
1.0
50
1.0
55
1.0
60
1.0
65
1.0
70
1.0
75
1.0
80
1.0
85
1.0
90
1.0
95
1.0
100
1.0
105
1.0
110
1.0
115
1.0
120
1.0
125
1.0
130
1.0
135
1.0
140
1.0
145
1.0
150
1.0
155
1.0
160
1.0
165
1.0
170
1.0
175
1.0
180
1.0
185
1.0
190
1.0
195
1.0
200
1.0
205
1.0
210
1.0
215
1.0
220
1.0
225
1.0
230
1.0
235
1.0
240
1.0
245
1.0
250
1.0
255
1.0
260
1.0
265
1.0
270
1.0
275
1.0
280
1.0
285
1.0
290
1.0
295
1.0
300
1.0
305
1.0
310
1.0
315
1.0
320
1.0
325
1.0
330
1.0
335
1.0
340
1.0
345
1.0
350
1.0
355
0.6
360
0.2
365
0.4
370
0.0
375
1.0
380
0.2
385
0.0
390
0.0
395
0.4
400
0.0
405
0.0
410
0.0
415
0.0
420
0.0
425
0.0
430
0.0
435
0.0
440
0.0
445
0.2
450
0.4
455
0.0
460
0.0
465
0.0
470
0.0
475
0.0
480
0.0
485
0.0
490
0.0
495
0.8
500
1.0
505
0.6
510
0.0
515
0.0
520
0.0
525
0.0
530
0.0
535
0.0
540
0.0
545
0.0
550
0.0
555
0.0
560
0.0
565
0.0
570
0.0
575
0.0
580
0.0
585
0.0
590
0.0
595
0.0
600
0.0
605
0.0
610
0.0
615
0.0
620
0.0
625
0.0
630
0.2
635
1.

(0.73631539044730865, 0.41708262635617926)

In [68]:
mean_min_day = []
latest_min = 60 * 30
pred_sec = 5
for i in range(0,9000-latest_min-pred_sec,pred_sec):
    print i
    data_train = data_2014_up[0][i:i+latest_min]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[0][i+latest_min:i+latest_min+pred_sec]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    model = linear_model.LogisticRegression()
    acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
    print acc
    mean_min_day.append(acc)
np.mean(mean_min_day),np.std(mean_min_day)

0
1.0
5
1.0
10
1.0
15
1.0
20
1.0
25
1.0
30
1.0
35
1.0
40
1.0
45
1.0
50
1.0
55
1.0
60
1.0
65
1.0
70
1.0
75
1.0
80
1.0
85
1.0
90
1.0
95
1.0
100
1.0
105
1.0
110
1.0
115
1.0
120
1.0
125
1.0
130
1.0
135
1.0
140
1.0
145
1.0
150
1.0
155
1.0
160
1.0
165
1.0
170
1.0
175
1.0
180
1.0
185
1.0
190
1.0
195
1.0
200
1.0
205
1.0
210
1.0
215
1.0
220
1.0
225
1.0
230
1.0
235
1.0
240
1.0
245
1.0
250
1.0
255
1.0
260
1.0
265
1.0
270
1.0
275
1.0
280
1.0
285
1.0
290
1.0
295
1.0
300
1.0
305
1.0
310
1.0
315
1.0
320
1.0
325
1.0
330
1.0
335
1.0
340
1.0
345
1.0
350
1.0
355
1.0
360
1.0
365
1.0
370
1.0
375
1.0
380
1.0
385
1.0
390
1.0
395
1.0
400
1.0
405
1.0
410
0.6
415
1.0
420
1.0
425
1.0
430
1.0
435
1.0
440
1.0
445
1.0
450
1.0
455
0.6
460
0.0
465
0.0
470
0.0
475
0.0
480
0.4
485
1.0
490
1.0
495
1.0
500
1.0
505
1.0
510
1.0
515
1.0
520
1.0
525
1.0
530
1.0
535
0.2
540
0.0
545
0.0
550
0.2
555
1.0
560
0.8
565
0.2
570
0.0
575
0.4
580
1.0
585
1.0
590
1.0
595
0.2
600
1.0
605
1.0
610
0.4
615
0.6
620
0.4
625
0.4
630
1.0
635
1.

(0.79624739402362754, 0.3757963385485863)

In [77]:
latest_min = 60 * 30
pred_sec = 5
#traded_day = 
for day in range(0,50,1):
    mean_min_day = []
    for i in range(0,9000-latest_min-pred_sec,pred_sec):
        #print i
        data_train = data_2014_up[day][i:i+latest_min]
        X_train = data_train.drop(["0"],axis=1)
        y_train = data_train['0']
        data_test = data_2014_up[day][i+latest_min:i+latest_min+pred_sec]
        X_test = data_test.drop(["0"],axis=1)
        y_test = data_test['0']
        model = linear_model.LogisticRegression()
        acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
        #print acc
        mean_min_day.append(acc)
    print 'mean = %s, std = %s'%(np.mean(mean_min_day),np.std(mean_min_day))

mean = 0.796247394024, std = 0.375796338549
mean = 0.809312022238, std = 0.363669067068
mean = 0.891730368311, std = 0.279644854915
mean = 0.763724808895, std = 0.392552661556
mean = 0.838776928423, std = 0.330724820891
mean = 0.783043780403, std = 0.382219723166
mean = 0.824600416956, std = 0.347540894789
mean = 0.78318276581, std = 0.376916481471
mean = 0.828353022933, std = 0.348413655751
mean = 0.728978457262, std = 0.41505845627
mean = 0.848922863099, std = 0.328235570288
mean = 0.793050729673, std = 0.372593512892
mean = 0.762473940236, std = 0.389627138428
mean = 0.673523280056, std = 0.425148657294
mean = 0.778874218207, std = 0.386209260551
mean = 0.739402362752, std = 0.407364353487
mean = 0.813203613621, std = 0.358001101925
mean = 0.776511466296, std = 0.392817806041
mean = 0.725503822099, std = 0.417223032538
mean = 0.75872133426, std = 0.406332815159
mean = 0.816539263377, std = 0.357240568606
mean = 0.709659485754, std = 0.408539605857
mean = 0.810701876303, std = 0.3507

51

In [26]:
# only rise ratio
mean_four_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-3,1):
    #print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]\
                           ,data_2014_up[i+2]],axis = 0).reset_index(drop=True)
    data_train = data_train#[['0','63']]
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_acc(Feature_data, Label, model, 4)
    print CV_AUC
    mean_four_day.append(CV_AUC)
np.mean(mean_four_day),np.std(mean_four_day)

0.692666666667
0.692027777778
0.691
0.676166666667
0.672138888889
0.663861111111
0.653527777778
0.635055555556
0.649611111111
0.634
0.60675
0.615222222222
0.617611111111
0.612861111111
0.592166666667
0.601361111111
0.594555555556
0.600444444444
0.590194444444
0.628555555556
0.654916666667
0.675194444444
0.641166666667
0.581194444444
0.6245
0.619083333333
0.644361111111
0.627222222222
0.637333333333
0.604111111111
0.628472222222
0.619472222222
0.598333333333
0.595694444444
0.632055555556
0.637416666667
0.60325
0.635083333333
0.675611111111
0.620861111111
0.623805555556
0.633916666667
0.659472222222
0.62925
0.625
0.609277777778
0.612388888889


(0.6312387706855791, 0.028371821573114121)

In [27]:
# only rise ratio
mean_three_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-2,1):
    #print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]],\
                           axis = 0).reset_index(drop=True)
    data_train = data_train#[['0','6']]
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_acc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_three_day.append(CV_AUC)
np.mean(mean_three_day),np.std(mean_three_day)

0.728148148148
0.690740740741
0.703740740741
0.677185185185
0.700037037037
0.66137037037
0.672
0.661740740741
0.665814814815
0.646296296296
0.628888888889
0.633777777778
0.608814814815
0.631444444444
0.632777777778
0.612185185185
0.618925925926
0.582037037037
0.605814814815
0.586962962963
0.668185185185
0.697111111111
0.672814814815
0.615296296296
0.589148148148
0.626037037037
0.663518518519
0.657888888889
0.648518518519
0.631740740741
0.636777777778
0.660185185185
0.630259259259
0.617962962963
0.615814814815
0.632592592593
0.659592592593
0.621962962963
0.656481481481
0.668703703704
0.639185185185
0.643740740741
0.65137037037
0.663518518519
0.655
0.620777777778
0.617148148148
0.624962962963


(0.64447916666666671, 0.030933719915200403)

In [30]:
# only rise ratio
mean_two_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])+len(day_trade[2])-1,1):
    print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i]],axis = 0).reset_index(drop=True)
    data_train = data_train
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 2)
    print CV_ACC
    mean_two_day.append(CV_ACC)
np.mean(mean_two_day),np.std(mean_two_day)

1
0.708777777778
2
0.747388888889
3
0.685944444444
4
0.691444444444
5
0.718722222222
6
0.693611111111
7
0.676944444444
8
0.700388888889
9
0.704277777778
10
0.675722222222
11
0.661722222222
12
0.666277777778
13
0.640555555556
14
0.647777777778
15
0.641222222222
16
0.675166666667
17
0.645277777778
18
0.622555555556
19
0.603333333333
20
0.626333333333
21
0.613166666667
22
0.747777777778
23
0.716055555556
24
0.672
25
0.5895
26
0.669888888889
27
0.695388888889
28
0.701722222222
29
0.703055555556
30
0.657388888889
31
0.628388888889
32
0.673888888889
33
0.670944444444
34
0.638722222222
35
0.631722222222
36
0.623666666667
37
0.648444444444
38
0.703333333333
39
0.642
40
0.663166666667
41
0.698111111111
42
0.670166666667
43
0.652222222222
44
0.699555555556
45
0.719666666667
46
0.651166666667
47
0.635944444444
48
0.635666666667
49
0.638944444444


(0.66785941043083918, 0.035640044708911564)

In [97]:
# only Depth
mean_four_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1]+len(day_trade[2])-3,1):
    #print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]\
                           ,data_2014_up[i+2]],axis = 0).reset_index(drop=True)
    data_train = data_train[['0','19']]
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 4)
    print CV_AUC
    mean_four_day.append(CV_AUC)
np.mean(mean_four_day),np.std(mean_four_day)

0.571798974985
0.577569020798
0.586861808077
0.611873297912
0.614223545204
0.637848497821
0.631198134123
0.587979301303
0.592348415243
0.577784307917
0.578350013682
0.595428310112
0.577624147813
0.572466343699
0.571395033304
0.555353892839
0.555709243857
0.536442703454
0.536958107611
0.543134282918
0.55132561886
0.573089862141
0.556103822793
0.468330162162
0.453868584308
0.465242814923
0.543699114503
0.580054912546
0.594955187296


(0.56548336076564998, 0.042991347954555344)

In [126]:
# only rise ratio
mean_four_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])-2,1):
    #print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]],axis = 0).reset_index(drop=True)
    data_train = data_train#[['0','6']]
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_four_day.append(CV_AUC)
np.mean(mean_four_day),np.std(mean_four_day)

0.67662676151
0.672647796252
0.697763880051
0.697559737842
0.696048168644
0.714019384577
0.728658961634
0.702018221394
0.682405480155
0.681555212583
0.692427603355
0.69545172047
0.666122747076
0.682898730968
0.685810653739
0.635254602904
0.654802484114
0.634649360295
0.656290588752
0.635270481531
0.679934747308
0.762707329983
0.710908885134
0.672490906954
0.621213581257
0.658965600004
0.73187592294
0.712129833339
0.664679269567
0.671117253994


(0.68247686361088311, 0.030796653504250093)

In [107]:
# only Depth
mean_four_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])-1,1):
    #print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i]],axis = 0).reset_index(drop=True)
    data_train = data_train[['0','6','21']]
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 4)
    print CV_AUC
    mean_four_day.append(CV_AUC)
np.mean(mean_four_day),np.std(mean_four_day)

0.662241639321
0.604646278684
0.612827493544
0.63447771378
0.614835591354
0.677633135501
0.675110734785
0.665476021923
0.625026540492
0.553040840709
0.604451971765
0.636904323475
0.597316907373
0.590800332427
0.608003674167
0.666288240067
0.595670152966
0.535329117793
0.585819272293
0.531895576848
0.58655373117
0.667107274371
0.626122500556
0.630126455909
0.513283952804
0.54708770225
0.576118437028
0.609411987712
0.592878496683
0.612564297232
0.645273950544


(0.60917175308151872, 0.042552156269468171)

In [102]:
mean_four_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])-3,1):
    print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]\
                           ,data_2014_up[i+2]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 4)
    print CV_AUC
    mean_four_day.append(CV_AUC)
np.mean(mean_four_day),np.std(mean_four_day)

1
0.661796149777
2
0.65896797605
3
0.656377116655
4
0.692617632446
5
0.669556707984
6
0.703189906266
7
0.680416709499
8
0.632925147269
9
0.644213735963
10
0.652471352805
11
0.649696372383
12
0.66341352857
13
0.656645992271
14
0.647053826417
15
0.609376970299
16
0.626373859826
17
0.609911759189
18
0.618564605115
19
0.617470814273
20
0.623081198204
21
0.645653434321
22
0.671750526776
23
0.638686942935
24
0.609493293946
25
0.620455762396
26
0.626034650069
27
0.66431447279
28
0.660343642736
29
0.659576439964


(0.64725622507565184, 0.024394034941287109)

In [103]:
mean_three_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])-2,1):
    print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i],data_2014_up[i+1]],
                           axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_three_day.append(CV_AUC)
np.mean(mean_three_day),np.std(mean_three_day)

1
0.659518466662
2
0.661424982142
3
0.667215201821
4
0.685159318813
5
0.680634851736
6
0.704792795532
7
0.719143760494
8
0.684300850517
9
0.642323442403
10
0.651659866889
11
0.668353513135
12
0.679707509178
13
0.656806578231
14
0.667332648194
15
0.662285975904
16
0.62696746069
17
0.626314786522
18
0.61993030269
19
0.634715806918
20
0.615733583688
21
0.642997260475
22
0.71069228925
23
0.658887347901
24
0.649286809992
25
0.601635517738
26
0.627177207163
27
0.682167767694
28
0.694424394243
29
0.647513464161
30
0.664411205118


(0.65978383219646564, 0.02819964880437361)

In [105]:
mean_two_day = []
for i in range(1,len(day_trade[0])+len(day_trade[1])-1,1):
    print i
    data_train = pd.concat([data_2014_up[i-1],data_2014_up[i]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_two_day.append(CV_AUC)
np.mean(mean_two_day),np.std(mean_two_day)

1
0.683517942246
2
0.649930451676
3
0.680837249057
4
0.699396590947
5
0.666877703866
6
0.706550898333
7
0.723177674344
8
0.752817256103
9
0.708492928389
10
0.686999857662
11
0.669971584613
12
0.722716845074
13
0.677885441002
14
0.689941651373
15
0.668652996115
16
0.719660419611
17
0.664285331788
18
0.648851996961
19
0.633952792674
20
0.648657512141
21
0.641612964481
22
0.723105577174
23
0.718668400694
24
0.703365064608
25
0.638796318995
26
0.587225776423
27
0.684657668395
28
0.722869549913
29
0.700656713161
30
0.662512245568
31
0.667483943522


(0.6823912692550661, 0.034312285040423419)

In [43]:
mean_three_day = []
for i in range(1,len(day_trade_2)-2,1):
    print i
    data_train = pd.concat([data_2014_2_up[i-1],data_2014_2_up[i],data_2014_2_up[i+1]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_three_day.append(CV_AUC)
np.mean(mean_three_day),np.std(mean_three_day)

1
0.642997260475
2
0.71069228925
3
0.658887347901
4
0.649286809992
5
0.601635517738
6
0.627177207163
7
0.682167767694
8
0.694424394243
9
0.647513464161
10
0.664411205118


(0.65791932637343109, 0.030441894518751095)

In [44]:
mean_two_day = []
for i in range(1,len(day_trade_2)-1,1):
    print i
    data_train = pd.concat([data_2014_2_up[i-1],data_2014_2_up[i]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_two_day.append(CV_AUC)
np.mean(mean_two_day),np.std(mean_two_day)

1
0.641612964481
2
0.723105577174
3
0.718668400694
4
0.703365064608
5
0.638796318995
6
0.587225776423
7
0.684657668395
8
0.722869549913
9
0.700656713161
10
0.662512245568
11
0.667483943522


(0.67735947481212133, 0.0408429316536204)

In [None]:
mean_two_day = []
for i in range(1,len(day_trade_1)+len(day_trade_2)-1,1):
    print i
    data_train = pd.concat([data_2014_2_up[i-1],data_2014_2_up[i]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_two_day.append(CV_AUC)
np.mean(mean_two_day),np.std(mean_two_day)

In [18]:
mean_four_day = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_up[i-1],data_2014_1_up[i],data_2014_1_up[i+1]\
                           ,data_2014_1_up[i+2]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 4)
    print CV_AUC
    mean_four_day.append(CV_AUC)
np.mean(mean_four_day),np.std(mean_four_day)

1
0.6482665898
2
0.649234993474
3
0.657458923811
4
0.695657786666
5
0.669368564252
6
0.69700164744
7
0.683451704848
8
0.636732387742
9
0.64659538748
10
0.657269752604
11
0.651986483333
12
0.659152011559
13
0.649951651681
14
0.641743600791
15
0.609619273847
16
0.619050797725


(0.65453384731589892, 0.023053038869505375)

In [20]:
mean_three_day = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_up[i-1],data_2014_1_up[i],data_2014_1_up[i+1]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 3)
    print CV_AUC
    mean_three_day.append(CV_AUC)
np.mean(mean_three_day),np.std(mean_three_day)

1
0.638519378179
2
0.646801742363
3
0.665250707983
4
0.686643502692
5
0.6894535405
6
0.697338086252
7
0.714962276135
8
0.688061732046
9
0.649769203417
10
0.65872728623
11
0.677555790136
12
0.672687172158
13
0.656915708696
14
0.658690195167
15
0.652191110562
16
0.632771010633


(0.66789615269685354, 0.022128192068738301)

In [22]:
mean_two_day = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_up[i-1],data_2014_1_up[i]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_AUC = cv_loop_auc(Feature_data, Label, model, 2)
    print CV_AUC
    mean_two_day.append(CV_AUC)
np.mean(mean_two_day),np.std(mean_two_day)

1
0.661421408394
2
0.627436084429
3
0.667353521554
4
0.703335048355
5
0.674535300572
6
0.705139482891
7
0.726671955058
8
0.746007567354
9
0.710333911879
10
0.68301892679
11
0.675363645467
12
0.730878772559
13
0.675839337698
14
0.688139193422
15
0.651607666172
16
0.708062203476


(0.68969650162931462, 0.030333494929333359)

In [25]:
mean_two_day_acc = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_up[i-1],data_2014_1_up[i]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 2)
    print CV_ACC
    mean_two_day_acc.append(CV_ACC)
np.mean(mean_two_day_acc),np.std(mean_two_day_acc)

1
0.6995
2
0.754722222222
3
0.690722222222
4
0.673
5
0.706388888889
6
0.686222222222
7
0.663444444444
8
0.690111111111
9
0.671833333333
10
0.633444444444
11
0.629166666667
12
0.670388888889
13
0.6145
14
0.6435
15
0.610944444444
16
0.645222222222


(0.66769444444444448, 0.036509749458953508)

In [26]:
mean_three_day_acc = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_up[i-1],data_2014_1_up[i],data_2014_1_up[i+1]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 3)
    print CV_ACC
    mean_three_day_acc.append(CV_ACC)
np.mean(mean_three_day_acc),np.std(mean_three_day_acc)

1
0.734185185185
2
0.69337037037
3
0.69762962963
4
0.668037037037
5
0.701814814815
6
0.658888888889
7
0.668074074074
8
0.644444444444
9
0.62637037037
10
0.617074074074
11
0.62637037037
12
0.623259259259
13
0.601185185185
14
0.61262962963
15
0.595222222222
16
0.599962962963


(0.64803240740740742, 0.040942852782018514)

In [27]:
mean_two_day_acc_down = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_down[i-1],data_2014_1_down[i]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 2)
    print CV_ACC
    mean_two_day_acc_down.append(CV_ACC)
np.mean(mean_two_day_acc_down),np.std(mean_two_day_acc_down)

1
0.674351851852
2
0.655138888889
3
0.70875
4
0.726342592593
5
0.67837962963
6
0.671990740741
7
0.744953703704
8
0.75662037037
9
0.739074074074
10
0.668101851852
11
0.654166666667
12
0.722916666667
13
0.706759259259
14
0.760555555556
15
0.681064814815
16
0.755833333333


(0.70656249999999998, 0.036733115367711823)

In [32]:
mean_three_day_acc_down = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_down[i-1],data_2014_1_down[i],\
                            data_2014_1_down[i+1]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 3)
    print CV_ACC
    mean_three_day_acc_down.append(CV_ACC)
np.mean(mean_three_day_acc_down),np.std(mean_three_day_acc_down)

1
0.691635802469
2
0.661913580247
3
0.676666666667
4
0.695555555556
5
0.660462962963
6
0.692561728395
7
0.722777777778
8
0.746913580247
9
0.67799382716
10
0.667901234568
11
0.658518518519
12
0.708055555556
13
0.73487654321
14
0.689660493827
15
0.72237654321
16
0.736327160494


(0.69651234567901232, 0.028142703237994723)

In [33]:
mean_four_day_acc_down = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_down[i-1],data_2014_1_down[i],\
                            data_2014_1_down[i+1],data_2014_1_down[i+2]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 4)
    print CV_ACC
    mean_four_day_acc_down.append(CV_ACC)
np.mean(mean_four_day_acc_down),np.std(mean_four_day_acc_down)

1
0.686759259259
2
0.656296296296
3
0.672384259259
4
0.676875
5
0.691018518519
6
0.693009259259
7
0.718935185185
8
0.686273148148
9
0.674884259259
10
0.662731481481
11
0.664398148148
12
0.723726851852
13
0.69099537037
14
0.729398148148
15
0.737453703704
16
0.766018518519


(0.69569733796296296, 0.029916322768312586)

In [37]:
mean_five_day_acc_down = []
for i in range(1,17,1):
    print i
    data_train = pd.concat([data_2014_1_down[i-1],data_2014_1_down[i],\
                            data_2014_1_down[i+1],data_2014_1_down[i+2],data_2014_1_down[i+3]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 5)
    print CV_ACC
    mean_five_day_acc_down.append(CV_ACC)
np.mean(mean_five_day_acc_down),np.std(mean_five_day_acc_down)

1
0.6755
2
0.652851851852
3
0.668296296296
4
0.693777777778
5
0.689111111111
6
0.689018518519
7
0.684018518519
8
0.686425925926
9
0.675055555556
10
0.68087037037
11
0.676796296296
12
0.694518518519
13
0.717611111111
14
0.739240740741
15
0.759018518519
16
0.729814814815


(0.69449537037037046, 0.027202676672366057)

In [36]:
mean_six_day_acc_down = []
for i in range(1,16,1):
    print i
    data_train = pd.concat([data_2014_1_down[i-1],data_2014_1_down[i],\
                            data_2014_1_down[i+1],data_2014_1_down[i+2],\
                            data_2014_1_down[i+3],data_2014_1_down[i+4]],axis = 0).reset_index(drop=True)
    Feature_data = data_train.drop(["0"],axis=1)
    Label = data_train['0']
    model = linear_model.LogisticRegression()
    CV_ACC = cv_loop_acc(Feature_data, Label, model, 6)
    print CV_ACC
    mean_six_day_acc_down.append(CV_ACC)
np.mean(mean_six_day_acc_down),np.std(mean_six_day_acc_down)

1
0.670262345679
2
0.643904320988
3
0.687222222222
4
0.697546296296
5
0.694089506173
6
0.669151234568
7
0.681882716049
8
0.686111111111
9
0.67912037037
10
0.688441358025
11
0.665077160494
12
0.716311728395
13
0.725848765432
14
0.75737654321
15
0.728981481481


(0.69275514403292182, 0.028019201699110586)

In [None]:
latest_min = 60 * 30
pred_sec = 5
#traded_day = 
for day in range(0,50,1):
    mean_min_day = []
    for i in range(0,9000-latest_min-pred_sec,pred_sec):
        #print i
        data_train = data_2014_up[day][i:i+latest_min]
        X_train = data_train.drop(["0"],axis=1)
        y_train = data_train['0']
        data_test = data_2014_up[day][i+latest_min:i+latest_min+pred_sec]
        X_test = data_test.drop(["0"],axis=1)
        y_test = data_test['0']
        model = linear_model.LogisticRegression()
        acc = latest_day_loop_acc(X_train,y_train,X_test,y_test,model)
        #print acc
        mean_min_day.append(acc)
    print 'mean = %s, std = %s'%(np.mean(mean_min_day),np.std(mean_min_day))

In [82]:
from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV)
from sklearn.ensemble import RandomForestClassifier

In [146]:
latest_min = 60 * 30
pred_sec = 5
day = 0

for i in range(0,20,pred_sec): #9000-latest_min-pred_sec,pred_sec):
    
    data_train = data_2014_up[day][i:i+latest_min]
    X_train = data_train.drop(["0"],axis=1)
    y_train = data_train['0']
    data_test = data_2014_up[day][i+latest_min:i+latest_min+pred_sec]
    X_test = data_test.drop(["0"],axis=1)
    y_test = data_test['0']
    
    # hyper-parameter
    num_trees = [3,5,7,9]
    max_depth = [5,10,15]
    criterion = ['entropy']
    min_samples_leaf = [5,10,15]
    min_samples_split = [2]
    max_features = [None]
    
    # cv
    model_grid = {'max_features':max_features,'n_estimators':num_trees,'max_depth':max_depth,\
                  'min_samples_split':min_samples_split,'criterion':criterion,\
                  'min_samples_leaf':min_samples_leaf}
    model = RandomForestClassifier(random_state = 0)
    Grid = GridSearchCV(model, model_grid, cv = 5)
    %time Grid.fit(Feature_data, Label) 
    
    # training & testing
    model.set_params(**Grid.best_params_)
    y = y_train
    X = X_train
    model.fit(X, y)
    predictions = model.predict(X_test)
    acc = metrics.accuracy_score(y_test, predictions)
    print acc
    

CPU times: user 15 s, sys: 35.4 ms, total: 15.1 s
Wall time: 15.8 s
CPU times: user 208 ms, sys: 13 µs, total: 208 ms
Wall time: 227 ms
1.0
CPU times: user 15.7 s, sys: 66.7 ms, total: 15.8 s
Wall time: 16.4 s
CPU times: user 206 ms, sys: 13 µs, total: 206 ms
Wall time: 217 ms
1.0
CPU times: user 15.6 s, sys: 51 ms, total: 15.7 s
Wall time: 16.2 s
CPU times: user 186 ms, sys: 3 µs, total: 186 ms
Wall time: 190 ms
1.0
CPU times: user 13.9 s, sys: 39.5 ms, total: 13.9 s
Wall time: 16.5 s
CPU times: user 177 ms, sys: 0 ns, total: 177 ms
Wall time: 181 ms
1.0


In [83]:
Feature_data = X_train
Label = y_train

In [137]:
num_trees = [3,5,7,9,11]
max_depth = [10,15,20]
criterion = ['entropy']
min_samples_leaf = [5,10,15]
min_samples_split = [2,3]
max_features = [None]

In [138]:
model_grid = {'max_features':max_features,'n_estimators':num_trees,'max_depth':max_depth,\
              'min_samples_split':min_samples_split,'criterion':criterion,\
              'min_samples_leaf':min_samples_leaf}
model1 = RandomForestClassifier(random_state = 0)
Grid = GridSearchCV(model1, model_grid, cv = 5)#, scoring = 'acc')
%time Grid.fit(Feature_data, Label) 
Grid.grid_scores_

CPU times: user 48 s, sys: 232 ms, total: 48.2 s
Wall time: 53.5 s


[mean: 0.86167, std: 0.14414, params: {'min_samples_leaf': 5, 'n_estimators': 3, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': None, 'max_depth': 10},
 mean: 0.86500, std: 0.14706, params: {'min_samples_leaf': 5, 'n_estimators': 5, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': None, 'max_depth': 10},
 mean: 0.86556, std: 0.14689, params: {'min_samples_leaf': 5, 'n_estimators': 7, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': None, 'max_depth': 10},
 mean: 0.87778, std: 0.15205, params: {'min_samples_leaf': 5, 'n_estimators': 9, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': None, 'max_depth': 10},
 mean: 0.90500, std: 0.15379, params: {'min_samples_leaf': 5, 'n_estimators': 11, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': None, 'max_depth': 10},
 mean: 0.86167, std: 0.14414, params: {'min_samples_leaf': 5, 'n_estimators': 3, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': None, 'ma

In [139]:
Grid.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 15,
 'min_samples_split': 2,
 'n_estimators': 11}

In [140]:
model1.set_params(**Grid.best_params_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=15,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [141]:
y = y_train
X = X_train
%time model1.fit(X, y)
predictions = model1.predict(X_test)
#model1.predict_proba(X_test)[:,1]

CPU times: user 246 ms, sys: 21 µs, total: 246 ms
Wall time: 315 ms


In [142]:
predictions,y_test

(array([ 1.,  1.,  1.,  1.,  1.]), 1800    1.0
 1801    1.0
 1802    1.0
 1803    1.0
 1804    1.0
 Name: 0, dtype: float64)

In [143]:
metrics.accuracy_score(y_test, predictions)

1.0

In [119]:
model1.feature_importances_

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.08346272,  0.        ,
        0.0808834 ,  0.        ,  0.        ,  0.        ,  0.15152391,
        0.06783048,  0.        ,  0.        ,  0.        ,  0.08116691,
        0.13626114,  0.        ,  0.        ,  0.14914777,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.00402056,
        0.00800838,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.00629133,  0.        ,  0.00836175,  0.01133431,
        0.00587051,  0.        ,  0.        ,  0.00780728,  0.        ,
        0.        ,  0.        ,  0.        ,  0.00183899,  0.01030185,
        0.01480221,  0.13082109,  0.0282843 ,  0.        ,  0.        ,
        0.00554158,  0.00095597,  0.00548354,  0.        ])

In [121]:
from sklearn.metrics import confusion_matrix

In [122]:
confusion_matrix(y_test,predictions)

array([[5]])

In [125]:
pd.crosstab(y_test,predictions,rownames = ['actual'],colnames=['preds'])

preds,1.0
actual,Unnamed: 1_level_1
1.0,5


In [92]:
from sklearn.grid_search import (GridSearchCV, RandomizedSearchCV)
from sklearn.ensemble import RandomForestClassifier

Feature_data = data_train.drop(["0"],axis=1)
Label = data_train['0']

num_trees = [10]
max_depth = [None]
criterion = ['entropy']
min_samples_leaf = [1,3,5,7]
min_samples_split = [3,5]
max_features = [None]

model_grid = {'max_features':max_features,'n_estimators':num_trees,'max_depth':max_depth,\
              'min_samples_split':min_samples_split,'criterion':criterion,\
              'min_samples_leaf':min_samples_leaf}

model1 = RandomForestClassifier(random_state = 0)
Grid = GridSearchCV(model1, model_grid, cv = 5, scoring = 'roc_auc')

%time Grid.fit(Feature_data, Label) 
Grid.grid_scores_

CPU times: user 5min 38s, sys: 595 ms, total: 5min 38s
Wall time: 6min 21s


[mean: 0.52566, std: 0.08123, params: {'min_samples_leaf': 1, 'n_estimators': 10, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': None, 'max_depth': None},
 mean: 0.52435, std: 0.08493, params: {'min_samples_leaf': 1, 'n_estimators': 10, 'min_samples_split': 5, 'criterion': 'entropy', 'max_features': None, 'max_depth': None},
 mean: 0.52158, std: 0.08665, params: {'min_samples_leaf': 3, 'n_estimators': 10, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': None, 'max_depth': None},
 mean: 0.52158, std: 0.08665, params: {'min_samples_leaf': 3, 'n_estimators': 10, 'min_samples_split': 5, 'criterion': 'entropy', 'max_features': None, 'max_depth': None},
 mean: 0.52830, std: 0.08738, params: {'min_samples_leaf': 5, 'n_estimators': 10, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': None, 'max_depth': None},
 mean: 0.52830, std: 0.08738, params: {'min_samples_leaf': 5, 'n_estimators': 10, 'min_samples_split': 5, 'criterion': 'entropy', 'max_featu

In [93]:
Grid.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 5,
 'min_samples_split': 3,
 'n_estimators': 10}