# 1. Import libraries

In [1]:
import tensorflow as tf
from konlpy.tag import Mecab
import csv
from collections import Counter
import numpy as np

# 2. Read input data

In [2]:
"""
    row[0] : no
    row[1] : category
    row[2] : title
    row[3] : body
"""
numbers = []
categories = []
titles = []
contents = []
with open('input_11829.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        numbers.append(row[0])
        categories.append(row[1])
        titles.append(row[2])
        contents.append(row[3])

# 3. Extract nouns through Mecab

In [3]:
mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic/')

In [4]:
for i, title in enumerate(titles):
    titles[i] = mecab.nouns(title)
    
for i, content in enumerate(contents):
    contents[i] = mecab.nouns(content)

In [5]:
words_counts = Counter()

In [6]:
for title in titles:
    for word in title:
        words_counts[word] += 1
for content in contents:
    for word in content:
        words_counts[word] += 1 

In [7]:
words = set(words_counts.keys())
words_size = len(words)
print(words_size)

29861


In [8]:
word2index = {}

for i, word in enumerate(words):
    word2index[word] = i
word2index

{'캠코더': 0,
 '난봉': 1,
 '市': 2,
 '란트': 3,
 '철광석': 4,
 '즉석': 5,
 '사신': 6,
 '전작': 7,
 '아군': 8,
 '사별': 9,
 '지아비': 10,
 '련지': 11,
 '허주원': 12,
 '전대미문': 13,
 '설치목': 14,
 '치욕': 15,
 '비비에스': 16,
 '쿠키': 17,
 '그레': 18,
 '여알': 19,
 '왠': 20,
 '작가': 21,
 '은퇴': 22,
 '해골바가지': 23,
 '돈장사': 24,
 '강도지': 25,
 '연근': 26,
 '경각심': 27,
 '신세계': 28,
 '융자': 29,
 '침투': 30,
 '성폭행': 31,
 '옵션': 32,
 '청도군': 33,
 '기아차': 34,
 '과중': 35,
 '반납': 36,
 '양법': 37,
 '저수지': 38,
 '인지': 39,
 '時': 40,
 '품귀': 41,
 '甲': 42,
 '김태현': 43,
 '미어': 44,
 '등반': 45,
 '보릿자루': 46,
 '단두대': 47,
 '말벌': 48,
 '가름길': 49,
 '취미': 50,
 '나경원': 51,
 '공세동': 52,
 '지주': 53,
 '강단': 54,
 '피해자': 55,
 '밑줄': 56,
 '동포': 57,
 '갈림길': 58,
 '기하급수': 59,
 '상부상조': 60,
 '조혜련': 61,
 '철창': 62,
 '편차': 63,
 '산품': 64,
 '남일': 65,
 '으': 66,
 '네다섯': 67,
 '달팽이': 68,
 '맏사위': 69,
 '후배': 70,
 '저층': 71,
 '주택기금': 72,
 '한상': 73,
 '폐경기': 74,
 '센텀': 75,
 '나름': 76,
 '자기계': 77,
 '병충': 78,
 '구전': 79,
 '화요일': 80,
 '킹': 81,
 '이분': 82,
 '국민학생': 83,
 '현물': 84,
 '아이스': 85,
 '전처': 86,
 '레토': 87,
 '손

In [9]:
targets = set(categories)
targets2index = {}

for i, target in enumerate(targets):
    targets2index[target] = i
targets2index

{'미래': 0,
 '행정': 1,
 '보건복지': 2,
 '정치개혁': 3,
 '안전/환경': 4,
 '저출산/고령화대책': 5,
 '일자리': 6,
 '농산어촌': 7,
 '외교/통일/국방': 8,
 '성장동력': 9,
 '육아/교육': 10}

In [21]:
class NeuralNetwork:
    def __init__(self, input_size, output_size, learning_rate):
        with tf.variable_scope("main"):
            self.input_size = input_size
            self.output_size = output_size
                
            self.inputs = tf.placeholder(tf.float32, [None, input_size], name='inputs')
            self.targets = tf.placeholder(tf.int32, [None], name='targets')
            self.one_hot_targets = tf.one_hot(self.targets, output_size)

            self.hidden1 = tf.layers.dense(inputs=self.inputs, units=10000, activation=tf.nn.relu, name='hidden1')
            self.hidden2 = tf.layers.dense(inputs=self.hidden1, units=1000, activation=tf.nn.relu, name='hidden2')        
            self.hidden3 = tf.layers.dense(inputs=self.hidden2, units=100, activation=tf.nn.relu, name='hidden3')
            self.output = tf.layers.dense(inputs=self.hidden3, units=output_size, activation=tf.nn.softmax, name='output')        
            
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.one_hot_targets, logits=self.output))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [22]:
tf.reset_default_graph()

NN = NeuralNetwork(len(word2index), len(targets2index), 0.001)

In [23]:
init_op = tf.global_variables_initializer()

save_file = './train_1000.ckpt'
saver = tf.train.Saver()

# 4. Training

In [24]:
mini_batch = 16
training_episodes = 1000

with tf.Session() as sess:
    sess.run(init_op)
    
    for step in range(training_episodes):
        idx = np.random.randint(len(numbers)-100, size=mini_batch)
        
        #word to one_hot_encoding
        inputs = np.zeros((mini_batch, len(word2index)))
        for i in range(mini_batch):
            for word in contents[idx[i]]:
                inputs[i][word2index[word]] = 1.0

        #label to one_hot_encoding
        index = []
        for i in range(mini_batch):
            index.append(targets2index[categories[idx[i]]])
        index = np.array(index)
        
        loss, opt, one_hot, output = sess.run([NN.loss, NN.opt, NN.one_hot_targets, NN.output], feed_dict={NN.inputs: inputs.reshape(-1, len(word2index)), NN.targets: index})
        print('step: {}, loss: {:0.3f}'.format(step, loss))

    save_path = saver.save(sess, save_file)
    print("Model saved in path: %s" % save_path)

step: 0, loss: 2.398
step: 1, loss: 2.397
step: 2, loss: 2.392
step: 3, loss: 2.379
step: 4, loss: 2.366
step: 5, loss: 2.442
step: 6, loss: 2.349
step: 7, loss: 2.252
step: 8, loss: 2.401
step: 9, loss: 2.300
step: 10, loss: 2.453
step: 11, loss: 2.341
step: 12, loss: 2.219
step: 13, loss: 2.305
step: 14, loss: 2.506
step: 15, loss: 2.336
step: 16, loss: 2.378
step: 17, loss: 2.268
step: 18, loss: 2.204
step: 19, loss: 2.338
step: 20, loss: 2.211
step: 21, loss: 2.281
step: 22, loss: 2.173
step: 23, loss: 2.206
step: 24, loss: 2.136
step: 25, loss: 2.296
step: 26, loss: 2.257
step: 27, loss: 2.375
step: 28, loss: 2.303
step: 29, loss: 2.338
step: 30, loss: 2.340
step: 31, loss: 2.389
step: 32, loss: 2.137
step: 33, loss: 2.150
step: 34, loss: 2.288
step: 35, loss: 2.285
step: 36, loss: 2.369
step: 37, loss: 2.132
step: 38, loss: 2.234
step: 39, loss: 2.368
step: 40, loss: 2.208
step: 41, loss: 2.225
step: 42, loss: 2.379
step: 43, loss: 2.388
step: 44, loss: 2.287
step: 45, loss: 2.15

step: 361, loss: 2.109
step: 362, loss: 2.100
step: 363, loss: 2.144
step: 364, loss: 2.086
step: 365, loss: 2.024
step: 366, loss: 2.166
step: 367, loss: 2.035
step: 368, loss: 2.113
step: 369, loss: 2.102
step: 370, loss: 2.167
step: 371, loss: 2.224
step: 372, loss: 2.291
step: 373, loss: 2.063
step: 374, loss: 1.856
step: 375, loss: 1.977
step: 376, loss: 2.227
step: 377, loss: 2.166
step: 378, loss: 2.285
step: 379, loss: 2.000
step: 380, loss: 2.088
step: 381, loss: 1.950
step: 382, loss: 2.223
step: 383, loss: 1.667
step: 384, loss: 1.997
step: 385, loss: 2.043
step: 386, loss: 1.912
step: 387, loss: 1.741
step: 388, loss: 1.917
step: 389, loss: 1.964
step: 390, loss: 2.043
step: 391, loss: 2.137
step: 392, loss: 2.070
step: 393, loss: 2.041
step: 394, loss: 1.973
step: 395, loss: 2.043
step: 396, loss: 1.978
step: 397, loss: 2.059
step: 398, loss: 1.972
step: 399, loss: 2.104
step: 400, loss: 1.946
step: 401, loss: 1.980
step: 402, loss: 1.953
step: 403, loss: 1.975
step: 404, 

step: 718, loss: 2.040
step: 719, loss: 1.980
step: 720, loss: 2.035
step: 721, loss: 2.038
step: 722, loss: 2.041
step: 723, loss: 2.166
step: 724, loss: 1.918
step: 725, loss: 2.041
step: 726, loss: 2.037
step: 727, loss: 1.856
step: 728, loss: 1.981
step: 729, loss: 1.979
step: 730, loss: 2.157
step: 731, loss: 1.933
step: 732, loss: 2.056
step: 733, loss: 2.045
step: 734, loss: 2.110
step: 735, loss: 1.872
step: 736, loss: 1.731
step: 737, loss: 1.918
step: 738, loss: 1.855
step: 739, loss: 2.105
step: 740, loss: 1.824
step: 741, loss: 2.233
step: 742, loss: 1.856
step: 743, loss: 1.972
step: 744, loss: 2.102
step: 745, loss: 2.042
step: 746, loss: 2.042
step: 747, loss: 2.103
step: 748, loss: 1.984
step: 749, loss: 1.981
step: 750, loss: 1.938
step: 751, loss: 2.227
step: 752, loss: 2.355
step: 753, loss: 2.030
step: 754, loss: 1.980
step: 755, loss: 2.042
step: 756, loss: 1.856
step: 757, loss: 2.047
step: 758, loss: 2.126
step: 759, loss: 1.981
step: 760, loss: 2.168
step: 761, 

# 5. Validation

In [31]:
tf.reset_default_graph()
trainedNN = NeuralNetwork(len(word2index), len(targets2index), 0.001)

saver = tf.train.Saver()


with tf.Session() as sess:
    saver.restore(sess, './train_1000.ckpt')

    cnt = 0
    print("Validation")
    for i in range(len(numbers)-100, len(numbers)):
        inputs = np.zeros(len(word2index))
        for word in contents[i]:
            inputs[word2index[word]] = 1.0
            
        output = sess.run([trainedNN.output], feed_dict={trainedNN.inputs: inputs.reshape(-1, len(word2index))})
        index = np.argmax(output)
        #print('output: {}, index: {}'.format(output, index))
        if list(targets2index)[index] == categories[i]:
            cnt += 1
        
        print("logits: {}, label: {}".format(list(targets2index)[index], categories[i]))
    print("Result: {}%".format(cnt))

INFO:tensorflow:Restoring parameters from ./train_1000.ckpt
Validation
logits: 보건복지, label: 행정
logits: 육아/교육, label: 성장동력
logits: 일자리, label: 일자리
logits: 정치개혁, label: 정치개혁
logits: 정치개혁, label: 정치개혁
logits: 정치개혁, label: 안전/환경
logits: 정치개혁, label: 외교/통일/국방
logits: 일자리, label: 일자리
logits: 안전/환경, label: 안전/환경
logits: 보건복지, label: 보건복지
logits: 보건복지, label: 보건복지
logits: 외교/통일/국방, label: 정치개혁
logits: 안전/환경, label: 안전/환경
logits: 정치개혁, label: 미래
logits: 일자리, label: 일자리
logits: 보건복지, label: 보건복지
logits: 정치개혁, label: 안전/환경
logits: 정치개혁, label: 일자리
logits: 육아/교육, label: 행정
logits: 정치개혁, label: 안전/환경
logits: 정치개혁, label: 행정
logits: 외교/통일/국방, label: 외교/통일/국방
logits: 정치개혁, label: 미래
logits: 정치개혁, label: 일자리
logits: 육아/교육, label: 육아/교육
logits: 정치개혁, label: 보건복지
logits: 정치개혁, label: 보건복지
logits: 외교/통일/국방, label: 안전/환경
logits: 정치개혁, label: 일자리
logits: 일자리, label: 일자리
logits: 정치개혁, label: 일자리
logits: 보건복지, label: 안전/환경
logits: 정치개혁, label: 정치개혁
logits: 보건복지, label: 저출산/고령화대책
logits: 안전/환경, label: 안전/환경
l