In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold   
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv("test_set.csv")
test = pd.read_csv("training_set.csv")

In [3]:
train.head(5)

Unnamed: 0,Date,Open Price,Close Price,High Price,Low Price,Volume,Rise,Diff of Close Price
0,2-Jan-18,2683.73,2695.81,2695.89,2682.36,1846463232,,
1,3-Jan-18,2697.85,2713.06,2714.37,2697.77,2090595328,1.0,17.25
2,4-Jan-18,2719.31,2723.99,2729.29,2719.07,2100767744,1.0,10.93
3,5-Jan-18,2731.33,2743.15,2743.45,2727.92,1918869120,1.0,19.16
4,8-Jan-18,2742.67,2747.71,2748.51,2737.6,1894823936,1.0,4.56


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 8 columns):
Date                   252 non-null object
Open Price             252 non-null float64
Close Price            252 non-null float64
High Price             252 non-null float64
Low Price              252 non-null float64
Volume                 252 non-null int64
Rise                   251 non-null float64
Diff of Close Price    251 non-null float64
dtypes: float64(6), int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# 填補缺失值
from sklearn.impute import SimpleImputer 

imputer = SimpleImputer(strategy='most_frequent') 
rise = train['Rise'].to_numpy().reshape(-1, 1)
imputer.fit(rise)                            
train['Rise'] = imputer.transform(rise)       

In [6]:
train['Diff1'] = train['Open Price'] - train['Close Price']
train['Diff2'] = train['High Price'] - train['Low Price']

In [7]:
#train_x = train[['Open Price','Close Price','High Price','Low Price','Volume','Diff1','Diff2']]
train_x = train[['Volume','Diff1','Diff2']]

train_y = train['Rise']

In [8]:
test.head(5)

Unnamed: 0,Date,Open Price,Close Price,High Price,Low Price,Volume,Rise,Diff of Close Price
0,2-Jan-09,902.99,931.8,934.73,899.35,4048270080,,
1,5-Jan-09,929.17,927.45,936.63,919.53,5413910016,-1.0,-4.35
2,6-Jan-09,931.17,934.7,943.85,927.28,5392620032,1.0,7.25
3,7-Jan-09,927.45,906.65,927.45,902.37,4704940032,-1.0,-28.05
4,8-Jan-09,905.73,909.73,910.0,896.81,4991549952,1.0,3.08


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 8 columns):
Date                   2264 non-null object
Open Price             2264 non-null float64
Close Price            2264 non-null float64
High Price             2264 non-null float64
Low Price              2264 non-null float64
Volume                 2264 non-null int64
Rise                   2263 non-null float64
Diff of Close Price    2263 non-null float64
dtypes: float64(6), int64(1), object(1)
memory usage: 141.6+ KB


In [10]:
imputer = SimpleImputer(strategy='most_frequent') 
rise2 = test['Rise'].to_numpy().reshape(-1, 1)
imputer.fit(rise2)                            
test['Rise'] = imputer.transform(rise2)  

In [11]:
test['Diff1'] = test['Open Price'] - test['Close Price']
test['Diff2'] = test['High Price'] - test['Low Price']

In [12]:
#test_x = test[['Open Price','Close Price','High Price','Low Price','Volume','Diff1','Diff2']]
test_x = test[['Volume','Diff1','Diff2']]

test_y = test['Rise']

# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

kf = KFold(n_splits=5, random_state=1012, shuffle=True)
kf.get_n_splits(train_x)                            

train_acc_list = []                                
valid_acc_list = []                                 

for train_index, valid_index in kf.split(train_x):   
    train_x_split = train_x.iloc[train_index]        
    train_y_split = train_y.iloc[train_index]       
    valid_x_split = train_x.iloc[valid_index]        
    valid_y_split = train_y.iloc[valid_index]        
    
    clf_model = LogisticRegression(C=10, random_state=1012, solver='saga', max_iter=5000, multi_class='multinomial')
    ###
    clf_model.fit(train_x_split, train_y_split)                  ###
    
    train_pred_y = clf_model.predict(train_x_split)              ###
    train_acc = accuracy_score(train_y_split,train_pred_y)
    valid_pred_y = clf_model.predict(valid_x_split)              ###
    valid_acc = accuracy_score(valid_y_split,valid_pred_y)
    
    train_acc_list.append(train_acc)
    valid_acc_list.append(valid_acc)

print(('average train accuracy: {}\n' +'    min train accuracy: {}\n' +'    max train accuracy: {}\n' +'average valid accuracy: {}\n' +'    min valid accuracy: {}\n' +'    max valid accuracy: {}')
     .format(np.mean(train_acc_list),np.min(train_acc_list),np.max(train_acc_list),np.mean(valid_acc_list),np.min(valid_acc_list),np.max(valid_acc_list) ))

average train accuracy: 0.5168612383626423
    min train accuracy: 0.4801980198019802
    max train accuracy: 0.5544554455445545
average valid accuracy: 0.4485490196078431
    min valid accuracy: 0.37254901960784315
    max valid accuracy: 0.54


In [14]:
clf_model.fit(train_x, train_y)        ###                
pred_y = clf_model.predict(train_x)    ###

clf_acc = accuracy_score(train_y, pred_y)           
print('accuracy: {}'.format(clf_acc))    

accuracy: 0.5238095238095238


In [15]:
test_pred_y = clf_model.predict(test_x)        ###
test_acc = accuracy_score(test_y, test_pred_y)           
print('test_accuracy: {}'.format(test_acc)) 

test_accuracy: 0.5463780918727915


# Neural Network

In [16]:
from keras import models, layers, optimizers, regularizers
from keras.utils import np_utils

nn_train_x = np.array(train_x)
nn_train_y = np_utils.to_categorical(np.array(train_y))

nn_test_x = np.array(test_x)
nn_test_y = np_utils.to_categorical(np.array(test_y))

nn_model = models.Sequential()

nn_model.add(layers.Dense(input_dim=3,units=256,activation='relu'))
nn_model.add(layers.Dense(units=2,activation='softmax'))

nn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 

history = nn_model.fit(nn_train_x, nn_train_y, epochs=10, batch_size=800, verbose=2)

train_acc = nn_model.evaluate(nn_train_x, nn_train_y, batch_size=32)[1]
print('train_accuracy: {}'.format(train_acc)) 

test_acc = nn_model.evaluate(nn_test_x, nn_test_y, batch_size=32)[1]
print('test_accuracy: {}'.format(test_acc)) 

Using TensorFlow backend.
W0523 21:26:42.003241 22716 deprecation_wrapper.py:119] From C:\Users\VrainsHacker\Anaconda3\envs\python3_5\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0523 21:26:42.049495 22716 deprecation_wrapper.py:119] From C:\Users\VrainsHacker\Anaconda3\envs\python3_5\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0523 21:26:42.063122 22716 deprecation_wrapper.py:119] From C:\Users\VrainsHacker\Anaconda3\envs\python3_5\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0523 21:26:42.139088 22716 deprecation_wrapper.py:119] From C:\Users\VrainsHacker\Anaconda3\envs\python3_5\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.co

Epoch 1/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 2/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 3/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 4/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 5/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 6/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 7/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 8/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 9/10
 - 0s - loss: 16.0541 - acc: 0.0040
Epoch 10/10
 - 0s - loss: 16.0541 - acc: 0.0040
train_accuracy: 0.003968254145648744
test_accuracy: 0.001325088339222615


# Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

kf = KFold(n_splits=5, random_state=1012, shuffle=True)
kf.get_n_splits(train_x)                            

train_acc_list = []                                
valid_acc_list = []                                 

for train_index, valid_index in kf.split(train_x):   
    train_x_split = train_x.iloc[train_index]        
    train_y_split = train_y.iloc[train_index]       
    valid_x_split = train_x.iloc[valid_index]        
    valid_y_split = train_y.iloc[valid_index]        
    
    rfc_model = RandomForestClassifier( n_estimators = 1000,min_samples_split = 20,min_samples_leaf = 1,oob_score = True,random_state = 1,n_jobs = -1 ) 
    ###
    rfc_model.fit(train_x_split, train_y_split)                  ###
    
    train_pred_y = rfc_model.predict(train_x_split)              ###
    train_acc = accuracy_score(train_y_split,train_pred_y)
    valid_pred_y = rfc_model.predict(valid_x_split)              ###
    valid_acc = accuracy_score(valid_y_split,valid_pred_y)
    
    train_acc_list.append(train_acc)
    valid_acc_list.append(valid_acc)

print(('average train accuracy: {}\n' +'    min train accuracy: {}\n' +'    max train accuracy: {}\n' +'average valid accuracy: {}\n' +'    min valid accuracy: {}\n' +'    max valid accuracy: {}')
     .format(np.mean(train_acc_list),np.min(train_acc_list),np.max(train_acc_list),np.mean(valid_acc_list),np.min(valid_acc_list),np.max(valid_acc_list) ))

average train accuracy: 0.8809615289887198
    min train accuracy: 0.8706467661691543
    max train accuracy: 0.900497512437811
average valid accuracy: 0.7970980392156862
    min valid accuracy: 0.72
    max valid accuracy: 0.9019607843137255


In [18]:
rfc_model.fit(train_x, train_y)        ###                
pred_y = rfc_model.predict(train_x)    ###

acc = accuracy_score(train_y, pred_y)           
print('accuracy: {}'.format(acc))             

accuracy: 0.8809523809523809


In [19]:
test_pred_y = rfc_model.predict(test_x)        ###
test_acc = accuracy_score(test_y, test_pred_y)           
print('test_accuracy: {}'.format(test_acc)) 

test_accuracy: 0.8811837455830389
