# Spooky Author Identification
[連結](https://www.kaggle.com/c/spooky-author-identification/submissions?sortBy=date&group=all&page=1)
---

In [1]:
import tensorflow as tf
import numpy as np
import random
import os.path

In [2]:
# 為了讓設定是可以 重複製造
random.seed(1)
tf.set_random_seed(1)
np.random.seed(1)

In [3]:
# 字典檔
WordDictionary = []
PeopleDictionary = ["EAP", "HPL", "MWS"]

In [4]:
# 把垃圾字元去掉
def deleteNotWord(line):
    line = line.replace(",", "")
    line = line.replace(";", "")
    line = line.replace(":", "")
    line = line.replace(".", "")
    line = line.replace("?", "")
    line = line.replace("\'", "")
    line = line.replace("\"", "")
    return line

In [5]:
# 根據單字，找在字典裏面的哪裡
def findIndexInWordDictionary(word):
    word = word.lower()
    for i in range(0, len(WordDictionary)):
        # 如果有找到的話
        if(word == WordDictionary[i]):
            return i
        
    # 沒找到
    return -1

In [6]:
# 找 People Dictionary 裡面，名字在哪個 index
def findIndexInPeopleDictionary(name):
    outputArray = [0] * len(PeopleDictionary)
    for i in range(len(PeopleDictionary)):
        if(PeopleDictionary[i] == name):
            outputArray[i] = 1
    return outputArray

In [7]:
def passAndAddWord(line):
    words = line.split(" ")
    for i in range(0, len(words)):
        if(findIndexInWordDictionary(words[i]) == -1):
            WordDictionary.append(words[i].lower())

In [8]:
# 讀檔
trainData = open("./Data/train.csv", "r", encoding="utf-8")
trainDataList = trainData.read().splitlines()

# 略調第一行
trainDataList = trainDataList[1: len(trainDataList)]

# 要存出去的部分
TextList = []
AuthorList = []

# 判斷有沒有 Dictionary 檔，先讀進來
IsDictionaryExists = os.path.exists("./dictionary.txt")
if(IsDictionaryExists):
    dictionaryFile = open("./dictionary.txt", "r", encoding="utf-8")
    tempDictionaryList = dictionaryFile.read().splitlines()
        
    # 把每一個讀進來
    for line in tempDictionaryList:
        WordDictionary.append(line)
    dictionaryFile.close()
    

# 將檔案做處理
for line in trainDataList:
    # 依照 "," 分，並把最前面的雙引號及後面去掉
    lineData = line.split("\",\"")
    lineData[0] = lineData[0][1:]
    lineData[2] = lineData[2][0:len(lineData[2]) - 1]
    
    lineData[1] = deleteNotWord(lineData[1])
    
    if(not IsDictionaryExists):
        passAndAddWord(lineData[1])
    
    # 加進 List 供以後存取
    TextList.append(lineData[1])
    AuthorList.append(lineData[2])
    
# 讀完資料之後要經過處理，因為怕每次都處理很久，所以存檔起來
if(not IsDictionaryExists):        
    dictionaryFile = open("./dictionary.txt", "w", encoding="utf-8")
    
    # 寫檔
    for i in range(0, len(WordDictionary)):
        dictionaryFile.write(WordDictionary[i] + "\n")
    dictionaryFile.close()
    
# 測試字典檔大小
DictionarySize = len(WordDictionary)
print(DictionarySize)
print(WordDictionary[-1])

25412
agir


In [9]:
# 關閉檔案
trainData.close()

In [10]:
totalFileSize = 19579
testFileSize = int(totalFileSize * 0.01)
trainFileSize = totalFileSize - testFileSize
print("Total File Size: " + format(totalFileSize))
print("Train File Size: " + format(trainFileSize))
print("Test File Size: " + format(testFileSize))

Total File Size: 19579
Train File Size: 19384
Test File Size: 195


In [11]:
# 取出 Test 的 Index
TestIndex = random.sample(range(totalFileSize), testFileSize)

In [12]:
# 判斷是不是在 Test Index 裡面
def findIndexInTestIndex(index):
    for i in range(0, len(TestIndex)):
        if(TestIndex[i] == index):
            return True
    return False

In [13]:
# 判斷字在不在字典裏面，並 Output 有的數目
def CountWordInDictionary(line):
    outputArray = [0] * DictionarySize
    
    lineData = line.split(" ")
    
    # 把在裡面的值 ++
    countInDictionary = 0
    for word in lineData:
        index = findIndexInWordDictionary(word)
        if(index != -1):
            outputArray[index] += 1
            countInDictionary += 1
    
    for i in range(0, DictionarySize):
        outputArray[i] /= countInDictionary
    return outputArray

In [14]:
# 要丟進 Tensorflow 學習的參數
# Train
TrainInputList = []
TrainOutputList = []
# Test
TestInputList = []
TestOutputList = []

# 假設之前沒有做過的話
for i in range(0, len(TextList)):
    # 先處理完第 i 筆資料
    TempInput = CountWordInDictionary(TextList[i])
    TempOutput = findIndexInPeopleDictionary(AuthorList[i])
    
    # 判斷要加到哪裡
    if(not findIndexInTestIndex(i)):
        TrainInputList.append(TempInput)
        TrainOutputList.append(TempOutput)
    else:
        TestInputList.append(TempInput)
        TestOutputList.append(TempOutput)

## 開始建構 Tensorflow

---

In [15]:
layer1_size = 100
layer2_size = 30

In [16]:
# 輸入
inputWordProb = tf.placeholder(tf.float32, [None, len(WordDictionary)], name = "InputProb")

# 輸出
labelProb = tf.placeholder(tf.float32,[None, len(PeopleDictionary)], name = "LabelProb")
  
# 初始化
weghtInit = tf.random_normal_initializer(mean = 0, stddev = 0.3)
biasInit = tf.random_normal_initializer(mean = 0, stddev = 0.1)

# Layer 1
layer1 = tf.layers.dense(
    inputs = inputWordProb,
    units = layer1_size,
    activation = tf.nn.relu,
    kernel_initializer = weghtInit,
    bias_initializer = biasInit,
    name = "Layer1"
)

# Layer 2
layer2 = tf.layers.dense(
    inputs = layer1,
    units = layer1_size,
    activation = tf.nn.relu,
    kernel_initializer = weghtInit,
    bias_initializer = biasInit,
    name = "Layer2"
)

    # Output Layer
layer3 = tf.layers.dense(
    inputs = layer2,
    units = len(PeopleDictionary),
    kernel_initializer = weghtInit,
    bias_initializer = biasInit,
    name = "Layer3"
)

with tf.name_scope("OutputProb"):
    # 最後輸出轉成機率
    outputProb = tf.nn.softmax(layer3, name = "OutputProb")
    predictIndex = tf.argmax(outputProb, 1)
    trueIndex = tf.argmax(labelProb, 1)

In [17]:
# Cost
with tf.name_scope("Cost"):
    cost = tf.nn.softmax_cross_entropy_with_logits(
        logits = outputProb,
        labels = labelProb,
        name = "Cost"
    )
    cost_mean = tf.reduce_mean(cost)
    tf.summary.scalar("Cost", cost_mean)

# Optimize
optimizer = tf.train.AdamOptimizer(learning_rate = 1e-2 ).minimize(cost_mean)

# 算準確率
with tf.name_scope("Accuary"):
    correctList = tf.equal(predictIndex, trueIndex)              # 出來會是一個 boolean 的陣列
    accuracyRate = tf.reduce_mean(tf.cast(correctList, tf.float32))
    
    tf.summary.scalar("Accuary", accuracyRate)

In [18]:
# 建立 Session
session = tf.Session()
session.run(tf.global_variables_initializer())
logs = tf.summary.FileWriter("./logs")

In [19]:
logs.add_graph(session.graph)

In [20]:
batchSize = 100
def batchData():
    # 隨機拿 n 筆資料
    indexList = np.random.choice(len(TrainInputList), batchSize)
    
    textList = []
    authorList = []
    for i in range(0, batchSize):
        textList.append(TrainInputList[indexList[i]])
        authorList.append(TrainOutputList[indexList[i]])
        
    return textList, authorList

In [21]:
totalIter = 0
def train(num_iter):
    global totalIter
    
    for i in range(0, num_iter):
        inputData, outputData = batchData()
        
        feedData = {inputWordProb: inputData, labelProb: outputData}
        session.run(optimizer, feed_dict = feedData)
        
        # 每 5 次寫進 Tensorboard 裡
        if(i % 5 == 0):
            merge = tf.summary.merge_all()
            result = session.run(merge, feed_dict = feedData)
            logs.add_summary(result, i + totalIter)
            
    totalIter += num_iter

In [22]:
# 訓練 1k 次
train(1000)

In [23]:
def print_accuracy():
    feedData = {inputWordProb: TestInputList, labelProb: TestOutputList}
    acc = session.run(accuracyRate, feed_dict= feedData)
    
    # 算準確度
    print("Accuracy: " + format(acc * 100))

In [24]:
print_accuracy()

Accuracy: 56.41025900840759


## 輸出 Test 檔案
---

In [25]:
testFile = open("./Data/test.csv", "r", encoding = "utf-8")
tempTestFile = testFile.read().splitlines()
tempTestFile = tempTestFile[1:]                                    # 把第一行垃圾訊息去掉

testOutputFile = open("./Data/test_pred.csv", "w", encoding = "utf-8")
testOutputFile.write("\"id\",\"EAP\",\"HPL\",\"MWS\"\n")

23

In [26]:
# 處理資料，並預測
testData = []
idList = []
for line in tempTestFile:
    # 依照 "," 分，並把最前面的雙引號及後面去掉
    lineData = line.split("\",\"")
    lineData[0] = lineData[0][1:]
    lineData[1] = deleteNotWord(lineData[1])
    
    idList.append(lineData[0])
    testData.append(CountWordInDictionary(lineData[1]))
    
# 丟進去跑結過
feedData = {inputWordProb: testData}
predResult = session.run(outputProb, feed_dict=feedData)

In [27]:
# 寫出檔案
for i in range(0, len(predResult)):
    testOutputFile.write("\"" +  idList[i] + "\"," + 
                         format(predResult[i][0]) + "," + 
                         format(predResult[i][1]) + "," + 
                         format(predResult[i][2]) + "\n")

In [28]:
testFile.close()
testOutputFile.close()