# 使用 Neural Network 來預測
---

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random

In [2]:
# 銀行的檔案
trainFile = open("./Data/bank-full.csv","r") 
testFile = open("./Data/bank.csv","r")

# 吐掉第一行
trainFile.readline()
testFile.readline()


# 資料
trainOrgDataArray = []
testOrgDataArray = []

# 讀 Train Data
for line in trainFile:
    lineData = line.split(";")
    
    tempArray = []
    for data in lineData:
        tempArray.append(data.replace("\n",""))
    trainOrgDataArray.append(tempArray)

# 讀 Test Data
for line in testFile:
    lineData = line.split(";")
    
    tempArray = []
    for data in lineData:
        tempArray.append(data.replace("\n",""))
    testOrgDataArray.append(tempArray)
    

trainFile.close()
testFile.close()

# 確定資料的大小
print("Train Data Size: " + format(len(trainOrgDataArray)))
print("Test Data Size: "  + format(len(testOrgDataArray)))

Train Data Size: 45211
Test Data Size: 4521


## 資料愈處理的函數
---

In [3]:
def job(x):
    return {
        "\"admin.\"":         [1,0,0,0,0,0,0,0,0,0,0,0],
        "\"unknown\"":        [0,1,0,0,0,0,0,0,0,0,0,0],
        "\"unemployed\"":     [0,0,1,0,0,0,0,0,0,0,0,0],
        "\"management\"":     [0,0,0,1,0,0,0,0,0,0,0,0],
        "\"housemaid\"":      [0,0,0,0,1,0,0,0,0,0,0,0],
        "\"entrepreneur\"":   [0,0,0,0,0,1,0,0,0,0,0,0],
        "\"student\"":        [0,0,0,0,0,0,1,0,0,0,0,0],
        "\"blue-collar\"":    [0,0,0,0,0,0,0,1,0,0,0,0],
        "\"self-employed\"":  [0,0,0,0,0,0,0,0,1,0,0,0],
        "\"retired\"":        [0,0,0,0,0,0,0,0,0,1,0,0],
        "\"technician\"":     [0,0,0,0,0,0,0,0,0,0,1,0],
        "\"services\"":       [0,0,0,0,0,0,0,0,0,0,0,1]
    }.get(x,                  [1,0,0,0,0,0,0,0,0,0,0,0])

In [4]:
def marital(x):
    return {
        "\"married.\"":       [1,0,0],
        "\"divorced\"":       [0,1,0],
        "\"single\"":         [0,0,1]
    }.get(x,                  [1,0,0])

In [5]:
def education(x):
    return {
        "\"unknown\"":        [1,0,0,0],
        "\"secondary\"":      [0,1,0,0],
        "\"primary\"":        [0,0,1,0],
        "\"tertiary\"":       [0,0,0,1]
    }.get(x,                  [1,0,0,0])

In [6]:
def default(x):
    return {
        "\"yes\"":            [1,0],
        "\"no\"":             [0,1]
    }.get(x,                  [1,0])

In [7]:
def housing(x):
    return {
        "\"yes\"":            [1,0],
        "\"no\"":             [0,1]
    }.get(x,                  [1,0])

In [8]:
def loan(x):
    return {
        "\"yes\"":            [1,0],
        "\"no\"":             [0,1]
    }.get(x,                  [1,0])

In [9]:
def contact(x):
    return {
        "\"unknown\"":        [1,0,0],
        "\"telephone\"":      [0,1,0],
        "\"cellular\"":       [0,0,1]
    }.get(x,                  [1,0,0])

In [10]:
def month(x):
    return {
        "\"jan\"":            [1,0,0,0,0,0,0,0,0,0,0,0],
        "\"feb\"":            [0,1,0,0,0,0,0,0,0,0,0,0],
        "\"mar\"":            [0,0,1,0,0,0,0,0,0,0,0,0],
        "\"apr\"":            [0,0,0,1,0,0,0,0,0,0,0,0],
        "\"may\"":            [0,0,0,0,1,0,0,0,0,0,0,0],
        "\"jun\"":            [0,0,0,0,0,1,0,0,0,0,0,0],
        "\"jul\"":            [0,0,0,0,0,0,1,0,0,0,0,0],
        "\"aug\"":            [0,0,0,0,0,0,0,1,0,0,0,0],
        "\"sep\"":            [0,0,0,0,0,0,0,0,1,0,0,0],
        "\"oct\"":            [0,0,0,0,0,0,0,0,0,1,0,0],
        "\"nov\"":            [0,0,0,0,0,0,0,0,0,0,1,0],
        "\"dec\"":            [0,0,0,0,0,0,0,0,0,0,0,1]
    }.get(x,                  [1,0,0,0,0,0,0,0,0,0,0,0])

In [11]:
def poutcome(x):
    return {
        "\"unknown\"":        [1,0,0,0],
        "\"other\"":          [0,1,0,0],
        "\"failure\"":        [0,0,1,0],
        "\"success\"":        [0,0,0,1]
    }.get(x,                  [1,0,0,0])

In [12]:
def termDeposit(x):
    return {
        "\"yes\"":            1,
        "\"no\"":             0
    }.get(x,                  1)

## 轉乘資料形式
---

In [13]:
# 將某一筆資料，全部轉乘 Training 的資料形式
def TransformDataToTraining(DataList):
    OutputArray = []
    
    OutputArray.extend([int(DataList[0])])
    OutputArray.extend(job(DataList[1]))
    OutputArray.extend(marital(DataList[2]))
    OutputArray.extend(education(DataList[3]))
    OutputArray.extend(default(DataList[4]))
    OutputArray.extend([int(DataList[5])])
    OutputArray.extend(housing(DataList[6]))
    OutputArray.extend(loan(DataList[7]))
    OutputArray.extend(contact(DataList[8]))
    OutputArray.extend([int(DataList[9])])
    OutputArray.extend(month(DataList[10]))
    OutputArray.extend([int(DataList[11])])
    OutputArray.extend([int(DataList[12])])
    OutputArray.extend([int(DataList[13])])
    OutputArray.extend([int(DataList[14])])
    OutputArray.extend(poutcome(DataList[15]))
    
    Ans = termDeposit(DataList[16])
    return OutputArray, Ans

In [14]:
# 要找出最小跟最大的陣列
def FindMinAndMax(DataList):
    OutputArray = []
    
    # 先把每一個的值拿進來
    for i in range(0, len(DataList[0])):
        # 產生暫存的 Array
        tempArray = []
        
        for dataNum in range(0, len(DataList)):
            tempArray.append(DataList[dataNum][i])
        OutputArray.append([min(tempArray), max(tempArray)])
    return OutputArray

In [15]:
# Feature 資料
FeatureDataList = []

# Ans
AnsDataList = []

for i in range(0, len(trainOrgDataArray)):
    FeatureData, AnsData = TransformDataToTraining(trainOrgDataArray[i])
    FeatureDataList.append(FeatureData)
    AnsDataList.append([0, 1] if AnsData == 1 else [1, 0])
    
# 找出最大最小值，為了要 Normalize
MinMaxArrayList = FindMinAndMax(FeatureDataList)
for i in range(0, len(FeatureDataList)):
    for j in range(0, len(FeatureDataList[0])):
        Min = MinMaxArrayList[j][0]
        Max = MinMaxArrayList[j][1]
        FeatureDataList[i][j] = (FeatureDataList[i][j] - Min) / (Max - Min)

In [16]:
# Test Feature 資料
TestFeatureDataList = []

# Ans
TestAnsDataList = []

for i in range(0, len(testOrgDataArray)):
    TestFeatureData, TestAnsData = TransformDataToTraining(testOrgDataArray[i])
    TestFeatureDataList.append(TestFeatureData)
    TestAnsDataList.append([0, 1] if TestAnsData == 1 else [1, 0])
    
# 找出最大最小值，為了要 Normalize
TestMinMaxArrayList = FindMinAndMax(TestFeatureDataList)
for i in range(0, len(TestFeatureDataList)):
    for j in range(0, len(TestFeatureDataList[0])):
        TestMin = TestMinMaxArrayList[j][0]
        TestMax = TestMinMaxArrayList[j][1]
        TestFeatureDataList[i][j] = (TestFeatureDataList[i][j] - TestMin) / (TestMax - TestMin)

In [17]:
# 測試輸入
print("Features")
print(FeatureDataList[0])
print("Label Ans")
print(AnsDataList[0])

# Test 測試
print("Test Features")
print(TestFeatureDataList[0])
print("Test Label Ans")
print(TestAnsDataList[0])

Features
[0.5194805194805194, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.09225936484302653, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.13333333333333333, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05307035380235868, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
Label Ans
[1, 0]
Test Features
[0.16176470588235295, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0684554569737319, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.024826216484607744, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
Test Label Ans
[1, 0]


## 建構 Tensorflow
---

In [18]:
# Size 設定
input_size = len(FeatureDataList[0])
hidden_size = 100
output_size = 2
learning_rate = 1e-6

print(input_size)

51


In [19]:
# Input Size => 51
x = tf.placeholder(tf.float32, [None, input_size], name="Input")

# Output Size => 2 (0, 1) 的機率
y = tf.placeholder(tf.float32, [None, output_size], name="Output")

# 設定中間的  Weight & Bias
with tf.name_scope("Hidden_Layer"):
    hidden_weights = tf.Variable(tf.random_normal([input_size, hidden_size], seed=1))
    hidden_biases = tf.Variable(tf.random_normal([hidden_size], seed = 1))
    hidden_layer = tf.matmul(x, hidden_weights) + hidden_biases

with tf.name_scope("Output_Layer"):
    output_weights = tf.Variable(tf.random_normal([hidden_size, output_size], seed = 1))
    output_biases = tf.Variable(tf.random_normal([output_size], seed = 1))
    output_layer = tf.matmul(hidden_layer, output_weights) + output_biases

In [20]:
with tf.name_scope("Optimize"):
    cost = tf.losses.softmax_cross_entropy(
        logits = output_layer,
        onehot_labels = y)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    
    tf.summary.scalar("Cost", cost)

In [21]:
with tf.name_scope("Accuracy"):
    label_data = tf.argmax(y, axis=1)
    predict_data = tf.argmax(output_layer, axis=1)

    correct_predict = tf.equal(label_data, predict_data)
    accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
    
    tf.summary.scalar("Accuary", accuracy)

In [22]:
session = tf.Session()
session.run(tf.global_variables_initializer())

# 寫檔的部分
logFile = tf.summary.FileWriter("./logs")
#logFile.add_graph(session.graph)

## 抓 Batch 來訓練
---

In [23]:
# 每次抓 100 來訓練
batch_size = 100

In [24]:
def batch_data(batch_size):
    batch_temp_x = []
    batch_temp_y = []
    for i in range(0, batch_size):
        randomSize = len(FeatureDataList)
        
        # 輸出的範圍是有包含頭跟尾
        randomIndex = random.randint(0, randomSize - 1)
        batch_temp_x.append(FeatureDataList[randomIndex])
        batch_temp_y.append(AnsDataList[randomIndex])
        
        # 一定要是 numpy 的陣列!! 並非 List!!
        batch_x = np.array(batch_temp_x, dtype=np.float32)
        batch_y = np.array(batch_temp_y, dtype=np.float32)
    return batch_x, batch_y

In [25]:
def print_accuracy():
    acc = session.run(accuracy, feed_dict={x: FeatureDataList, y: AnsDataList})
    print("Accuracy on Train Data: {0:.1%}".format(acc))
    return acc

In [26]:
def print_test_accuracy():
    acc = session.run(accuracy, feed_dict={x: TestFeatureDataList, y: TestAnsDataList})
    print("Accuracy on Test Data: {0:.1%}".format(acc))
    return acc

In [27]:
total_iteration = 0
def optimize(num_iterations):
    global total_iteration
    
    # 驗證時會用到的參數
    feed_dict_train = {x: FeatureDataList, y: AnsDataList}
    feed_dict_test = {x: TestFeatureDataList, y: TestAnsDataList}
    
    # 寫檔案要分成兩條來做
    logTrain = tf.summary.FileWriter("./logs/100_1e-6/Train_Set/")
    logTest = tf.summary.FileWriter("./logs/100_1e-6/Test_Set/")
    
    for i in range(0, num_iterations):
        # 抓一個部分來做 Train (Stochastic gradient descent)
        data_x, data_y = batch_data(batch_size)
        session.run(optimizer, feed_dict={x: data_x, y: data_y})
        
        if((total_iteration + i) % 50 == 0):
            # 拿所有的 summary
            merge = tf.summary.merge_all()
            print("Train "+ format(i))
            
            # 寫入 Train 的 Data
            merge_train = session.run(merge, feed_dict=feed_dict_train)
            logTrain.add_summary(merge_train, total_iteration + i)
            print_accuracy()
            
            # 寫入 Test 的 Data
            merge_test = session.run(merge, feed_dict=feed_dict_test)
            logTest.add_summary(merge_test, total_iteration + i)
            print_test_accuracy()
            
            # 換行
            print("")
           
    total_iteration += num_iterations

## 訓練結果
---

In [28]:
optimize(10000)

Train 0
Accuracy on Train Data: 30.1%
Accuracy on Test Data: 30.3%

Train 50
Accuracy on Train Data: 30.2%
Accuracy on Test Data: 30.3%

Train 100
Accuracy on Train Data: 30.3%
Accuracy on Test Data: 30.4%

Train 150
Accuracy on Train Data: 30.3%
Accuracy on Test Data: 30.3%

Train 200
Accuracy on Train Data: 30.3%
Accuracy on Test Data: 30.4%

Train 250
Accuracy on Train Data: 30.4%
Accuracy on Test Data: 30.5%

Train 300
Accuracy on Train Data: 30.5%
Accuracy on Test Data: 30.5%

Train 350
Accuracy on Train Data: 30.5%
Accuracy on Test Data: 30.6%

Train 400
Accuracy on Train Data: 30.5%
Accuracy on Test Data: 30.6%

Train 450
Accuracy on Train Data: 30.6%
Accuracy on Test Data: 30.6%

Train 500
Accuracy on Train Data: 30.6%
Accuracy on Test Data: 30.6%

Train 550
Accuracy on Train Data: 30.7%
Accuracy on Test Data: 30.7%

Train 600
Accuracy on Train Data: 30.7%
Accuracy on Test Data: 30.7%

Train 650
Accuracy on Train Data: 30.7%
Accuracy on Test Data: 30.7%

Train 700
Accuracy on T

Train 5800
Accuracy on Train Data: 36.0%
Accuracy on Test Data: 36.4%

Train 5850
Accuracy on Train Data: 36.1%
Accuracy on Test Data: 36.5%

Train 5900
Accuracy on Train Data: 36.1%
Accuracy on Test Data: 36.5%

Train 5950
Accuracy on Train Data: 36.2%
Accuracy on Test Data: 36.6%

Train 6000
Accuracy on Train Data: 36.2%
Accuracy on Test Data: 36.7%

Train 6050
Accuracy on Train Data: 36.3%
Accuracy on Test Data: 36.8%

Train 6100
Accuracy on Train Data: 36.3%
Accuracy on Test Data: 36.8%

Train 6150
Accuracy on Train Data: 36.3%
Accuracy on Test Data: 36.8%

Train 6200
Accuracy on Train Data: 36.4%
Accuracy on Test Data: 36.9%

Train 6250
Accuracy on Train Data: 36.4%
Accuracy on Test Data: 37.0%

Train 6300
Accuracy on Train Data: 36.5%
Accuracy on Test Data: 37.0%

Train 6350
Accuracy on Train Data: 36.6%
Accuracy on Test Data: 37.1%

Train 6400
Accuracy on Train Data: 36.6%
Accuracy on Test Data: 37.2%

Train 6450
Accuracy on Train Data: 36.7%
Accuracy on Test Data: 37.3%

Train 

In [29]:
logFile.close()
session.close()