## 匯入所需檔案

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD 
from torch.autograd import Variable
SEED = 10

## 讀入train和test檔

In [2]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

## 確認該資料有哪些欄位
* head看一下
* 這份資料有Date(日期）、Open Price（開盤價）、Close Price（收盤價）、High Price（當日最高點）、Low Price（當日最低點）、Volume（交易量）

In [3]:
train_df.head()

Unnamed: 0,Date,Open Price,Close Price,High Price,Low Price,Volume
0,02-Jan-2009,902.99,931.8,934.73,899.35,4048270080
1,05-Jan-2009,929.17,927.45,936.63,919.53,5413910016
2,06-Jan-2009,931.17,934.7,943.85,927.28,5392620032
3,07-Jan-2009,927.45,906.65,927.45,902.37,4704940032
4,08-Jan-2009,905.73,909.73,910.0,896.81,4991549952


## 確認資料是否有缺失值
* train set 有2264筆data，無缺失值
* test set 有252筆data，無缺失值

In [4]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         2264 non-null   object 
 1   Open Price   2264 non-null   float64
 2   Close Price  2264 non-null   float64
 3   High Price   2264 non-null   float64
 4   Low Price    2264 non-null   float64
 5   Volume       2264 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 106.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         252 non-null    object 
 1   Open Price   252 non-null    float64
 2   Close Price  252 non-null    float64
 3   High Price   252 non-null    float64
 4   Low Price    252 non-null    float64
 5   Volume       252 non-null    int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 

## 預測目標 隔天的股價收盤價漲跌
* 將0設定為跌
* 將1設定為漲
* 為資料新增新的欄位movement，若是收盤價較前一天高則漲，反之則跌。

In [5]:
train_df['Movement'] = 0
test_df['Movement'] = 0

def compare(df):
    for i in range(df.shape[0]-1):
        if df['Close Price'][i] < df['Close Price'][i+1]:
            df.at[i+1,'Movement'] = 1
           
compare(train_df)
compare(test_df)
train_df.head()

Unnamed: 0,Date,Open Price,Close Price,High Price,Low Price,Volume,Movement
0,02-Jan-2009,902.99,931.8,934.73,899.35,4048270080,0
1,05-Jan-2009,929.17,927.45,936.63,919.53,5413910016,0
2,06-Jan-2009,931.17,934.7,943.85,927.28,5392620032,1
3,07-Jan-2009,927.45,906.65,927.45,902.37,4704940032,0
4,08-Jan-2009,905.73,909.73,910.0,896.81,4991549952,1


## 切出特徵和答案

In [6]:
trainX = train_df.drop(columns=['Date','Movement'])
testX = test_df.drop(columns=['Date','Movement'])
trainY = train_df[['Movement']].copy()
testY = test_df[['Movement']].copy()
trainX.head()


Unnamed: 0,Open Price,Close Price,High Price,Low Price,Volume
0,902.99,931.8,934.73,899.35,4048270080
1,929.17,927.45,936.63,919.53,5413910016
2,931.17,934.7,943.85,927.28,5392620032
3,927.45,906.65,927.45,902.37,4704940032
4,905.73,909.73,910.0,896.81,4991549952


### 嘗試以前五天的收盤價作為特徵

In [7]:
pastD = trainX.copy()
pastD = pastD.append(testX,ignore_index=True)
pastD['One'],pastD['Two'],pastD['Three'],pastD['Four'],pastD['Five']= 0,0,0,0,0

for i in range(5):
    for j in range(pastD.shape[0]):
        if j < i+1:
            pastD.iat[j,i+5] = pastD.at[j,'Close Price']
        else:
            pastD.iat[j,i+5] = pastD.at[j-i-1,'Close Price']

pastD_train = pastD[0:2264].copy()
pastD_test = pastD[2264:2516].copy()
pastD_test = pastD_test.reset_index(drop=True)
pastD_trainX = pastD_train.iloc[:,5:10]
pastD_testX = pastD_test.iloc[:,5:10]


## 將數據做前處理及標準化

In [8]:
#將數據作前處理，避免不同尺度的量值相差太多
scaler = StandardScaler()
scaler.fit(trainX)
xtr_std = scaler.transform(trainX)
xte_std = scaler.transform(testX)
scaler1 = StandardScaler()
scaler1.fit(pastD_trainX)
pxtr_std = scaler.transform(pastD_trainX)
pxte_std = scaler.transform(pastD_testX)

## 使用Logistic Regression

In [9]:
model_LR = LogisticRegression(random_state=SEED)
model_LR.fit(xtr_std,trainY)
result_lr = model_LR.predict(xte_std)
test_acc_lr = accuracy_score(testY,result_lr)
print('Test Accuracy: {}'.format(test_acc_lr))
print(confusion_matrix(testY, result_lr))#混淆矩陣


Test Accuracy: 0.8214285714285714
[[ 98  23]
 [ 22 109]]


  y = column_or_1d(y, warn=True)


## 使用RF

In [10]:
model_RF = RandomForestClassifier(random_state=SEED)
model_RF.fit(xtr_std,trainY)
result_rf = model_RF.predict(xte_std)
test_acc_rf = accuracy_score(testY,result_rf)
print('Test Accuracy: {}'.format(test_acc_rf))
print(confusion_matrix(testY, result_rf))#混淆矩陣

  


Test Accuracy: 0.5476190476190477
[[ 21 100]
 [ 14 117]]


## 使用Neural Network
### 將數據修改成適合丟進模型的格式

In [11]:
Y = trainY[:].values
X = xtr_std.reshape(-1,5)
Y = Y.reshape(-1,1)
X = torch.from_numpy(xtr_std).type(torch.FloatTensor)
Y = torch.from_numpy(Y).type(torch.FloatTensor)
NtestX = xte_std.reshape(-1,5)
NtestX = np.array(NtestX)
NtestX = torch.from_numpy(NtestX).type(torch.FloatTensor)

torch_dataset = TensorDataset(X,Y)

loader = DataLoader(
    dataset = torch_dataset,
    batch_size = 5,
    shuffle=True,
    num_workers=2
)
print('Done')

Done


### 定義模型

In [12]:
class MyModel(nn.Module):
        def __init__(self):
            super(MyModel,self).__init__()
            
            self.L1 = nn.Linear(
                in_features=5,
                out_features=14
            )

            self.L5 = nn.Linear(
                in_features=14,
                out_features=1
            )
        
        def forward(self, batch_x):
            y = self.L1(batch_x)
            y = F.relu(y)
            y = self.L5(y)
            y = F.sigmoid(y)
            return y
        def predict(self,x):
            pred = self.forward(x)
            ans = []
            for t in pred:
                if t[0]>0.5:
                    ans.append(1)
                else:
                    ans.append(0)

            return torch.tensor(ans)

print('Done')

Done


### 訓練模型

In [13]:
model = MyModel()
optimizer = SGD(model.parameters(),lr=0.001)
W= torch.tensor(0.4)#給予權重
criterion = nn.BCELoss(weight=W)

epochs = 16
losses = []

for i in range(epochs):
    for step,(batch_x,batch_y) in enumerate(loader):
        y_pred = model(batch_x)  
        loss = criterion(y_pred,batch_y)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    if (i + 1) % 2== 0:
        print('Epoch: {},Step:{}, Loss: {}'.format(i + 1,step, loss))

model_predict = model.eval()
predict = model_predict.predict(NtestX)
predict = predict.numpy()
train_acc = accuracy_score(testY,predict)
print('Test Accuracy: {}'.format(train_acc))
print(confusion_matrix(testY, predict))#混淆矩陣




Epoch: 2,Step:452, Loss: 0.2696243226528168
Epoch: 4,Step:452, Loss: 0.283792644739151
Epoch: 6,Step:452, Loss: 0.22522199153900146
Epoch: 8,Step:452, Loss: 0.278341144323349
Epoch: 10,Step:452, Loss: 0.2830463945865631
Epoch: 12,Step:452, Loss: 0.2612370550632477
Epoch: 14,Step:452, Loss: 0.35098573565483093
Epoch: 16,Step:452, Loss: 0.29331305623054504
Test Accuracy: 0.5833333333333334
[[ 31  90]
 [ 15 116]]


### 讀取已經train好的模型，輸出預測的結果以及準確率和混淆矩陣

In [14]:
#不使用五天的數據，標準化的scale也要換過
saveM = MyModel()
saveM.load_state_dict(torch.load('./best_model.ckpt'))
pred = saveM.predict(NtestX)
pd = pred.numpy()
print(pd)
test_acc = accuracy_score(testY,pd)
print('Test Accuracy: {}'.format(test_acc))
print(confusion_matrix(testY, pd))#混淆矩陣

[1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1
 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 0 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1]
Test Accuracy: 0.5753968253968254
[[50 71]
 [36 95]]
