### 專案名稱: 機器學習練習
### 功能描述: 支票領票時間預測
### 版權所有: Dunk  
### 程式撰寫: Dunk  
### 撰寫日期：2020/04/15
### 改版日期:  
### 改版備註: 2020/04/20 增加特徵值

#### 呼叫所需套件

In [30]:
import os
import numpy as np
import pandas as pd
import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn import linear_model

In [31]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))

#### 設定資料來源

In [32]:
# 設定 data_path
dir_data = './data/'
app_train = pd.read_csv(os.path.join(dir_data, 'TicketTrade_train.csv'))
app_test = app_train['目標值']

app_train['開庫月份'] = app_train['開庫日期'].apply(lambda x: int(str(x)[3:5]))
app_train['開庫日份'] = app_train['開庫日期'].apply(lambda x: int(str(x)[5:7]))

print(app_train.shape)
print(app_test.shape)
print(na_check(app_train))

(3865, 12)
(3865,)


Unnamed: 0,Missing Ratio


None


In [33]:
app_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865 entries, 0 to 3864
Data columns (total 12 columns):
契約編號        3865 non-null object
庫存支票本流水號    3865 non-null int64
基金別頁編碼      3865 non-null object
開庫日期        3865 non-null int64
摘要          3865 non-null object
基金別         3865 non-null object
異動支票帳號      3865 non-null object
異動支票號碼      3865 non-null object
基金保管人簽收     3865 non-null object
目標值         3865 non-null float64
開庫月份        3865 non-null int64
開庫日份        3865 non-null int64
dtypes: float64(1), int64(4), object(7)
memory usage: 362.5+ KB


In [34]:
app_train.describe()

Unnamed: 0,庫存支票本流水號,開庫日期,目標值,開庫月份,開庫日份
count,3865.0,3865.0,3865.0,3865.0,3865.0
mean,13.285899,1082663.0,11.875286,7.306856,15.490298
std,7.412448,3658.992,1.458522,3.486148,8.809754
min,1.0,1080508.0,9.12,1.0,1.0
25%,7.0,1080724.0,10.78,5.0,8.0
50%,12.0,1081003.0,11.55,8.0,16.0
75%,18.0,1081217.0,13.0,10.0,22.0
max,32.0,1090226.0,17.72,12.0,31.0


In [35]:
# 欄位資料型態
app_train.dtypes

契約編號         object
庫存支票本流水號      int64
基金別頁編碼       object
開庫日期          int64
摘要           object
基金別          object
異動支票帳號       object
異動支票號碼       object
基金保管人簽收      object
目標值         float64
開庫月份          int64
開庫日份          int64
dtype: object

In [36]:
# 種類類別型欄位轉標籤編碼 (Label Encoding)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# le_count = app_train['基金保管人簽收'].nunique()
num_count = 0
# 檢查每一個 column
for col in app_train:
    if app_train[col].dtype == 'object':        
        # 就做 Label Encoder
        le.fit(app_train[col])
        app_train[col] = le.transform(app_train[col])                        
        # 紀錄有多少個 columns 被標籤編碼過
        num_count += 1
        
print(num_count)
print(app_train.corr()['目標值'].sort_values())
app_train = app_train.drop(['目標值','異動支票號碼'],axis=1)
app_train

7
開庫日期       -0.243525
契約編號       -0.135590
異動支票帳號     -0.087143
摘要         -0.082667
基金別頁編碼     -0.076831
開庫日份       -0.038248
基金別        -0.034756
庫存支票本流水號   -0.031390
基金保管人簽收    -0.030456
異動支票號碼      0.093561
開庫月份        0.197444
目標值         1.000000
Name: 目標值, dtype: float64


Unnamed: 0,契約編號,庫存支票本流水號,基金別頁編碼,開庫日期,摘要,基金別,異動支票帳號,基金保管人簽收,開庫月份,開庫日份
0,4,6,9,1080508,36,33,46,1,5,8
1,9,8,5,1080508,27,11,95,0,5,8
2,26,25,36,1080508,163,36,73,8,5,8
3,26,20,36,1080508,162,36,71,8,5,8
4,26,13,36,1080508,158,36,72,8,5,8
...,...,...,...,...,...,...,...,...,...,...
3860,2,10,10,1090226,37,20,31,2,2,26
3861,5,26,31,1090226,136,19,56,3,2,26
3862,14,21,18,1090226,82,10,7,10,2,26
3863,20,16,21,1090226,96,25,55,10,2,26


In [37]:
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(app_train, app_test, test_size=0.2, random_state=4)

# 建立一個線性回歸模型
regr = linear_model.LinearRegression()

# 將訓練資料丟進去模型訓練
regr.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

In [38]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 1.95


#### 套用其他模型

#### 使用決策樹回歸

[函式用法](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html "函式用法")

In [39]:
from sklearn.tree import DecisionTreeRegressor

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(app_train, app_test, test_size=0.2, random_state=4)

# 建立模型
dtr = DecisionTreeRegressor()

# 訓練模型
dtr.fit(x_train, y_train)

# 預測測試集
y_pred = dtr.predict(x_test)

In [40]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 1.10


##### 使用隨機森林

[函式用法](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html "函式用法")

In [41]:
from sklearn.ensemble import RandomForestRegressor

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(app_train, app_test, test_size=0.2, random_state=4)

# 建立模型
clf = RandomForestRegressor()

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred = clf.predict(x_test)

In [42]:
# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Mean squared error: 0.67
