In [1]:
# unset PYTHONPATH first
from ko_text import *
from ko_crawler import *

import ast

In [2]:
# Create the NLP helper object used throughout the notebook (model loading,
# batch extraction, vector inference). NLP is not defined in this notebook;
# presumably it comes from the star-imports above — TODO confirm which module.
nlp = NLP()

In [3]:
# Read the train / test tables from disk.
train_df = pd.read_csv('Data/Train_final.csv', encoding = 'utf-8')
test_df = pd.read_csv('Data/Test_final.csv', encoding = 'utf-8')

# To save space the tokens were stored as one whitespace-joined string
# ('word word'); expand every row back into a list (['word', 'word']).
train_df['Token'] = train_df['Token'].map(lambda joined_tokens: joined_tokens.split())
test_df['Token'] = test_df['Token'].map(lambda joined_tokens: joined_tokens.split())

In [4]:
# Sanity check: (number of rows, number of columns) of the training table.
print(train_df.shape)

(87899, 2)


In [5]:
# Class balance: how many training rows carry each 'Section' label.
Counter(train_df['Section'])

Counter({'IT과학': 9996,
         '경제': 9660,
         '국제': 9844,
         '기업': 9965,
         '문화': 9846,
         '부동산': 9986,
         '사회': 9880,
         '정치': 9295,
         '증권': 9427})

<br>


# **Doc2Vec**

## **1. Doc2Vec 모델 불러오기**

In [8]:
# Load the model(s) that performed best so far from the saved model file.
nlp.load_Doc2Vec_model('Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10')

2018-11-14 15:24:54,210 : INFO : loading Doc2Vec object from Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10
2018-11-14 15:24:54,963 : INFO : loading vocabulary recursively from Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10.vocabulary.* with mmap=None
2018-11-14 15:24:54,964 : INFO : loading wv recursively from Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10.wv.* with mmap=None
2018-11-14 15:24:54,966 : INFO : loading docvecs recursively from Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10.docvecs.* with mmap=None
2018-11-14 15:24:54,966 : INFO : loading trainables recursively from Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10.trainables.* with mmap=None
2018-11-14 15:24:54,967 : INFO : loaded Doc2Vec_model/Doc2Vec_dm=True&cc=87899&vs=100&win=10&neg=5&min=30&sample=1e-05&epochs=10

<gensim.models.doc2vec.Doc2Vec at 0x7f1928d82e80>

## **Doc2Vec train** 

In [6]:
# Training call kept for reference only: it is wrapped in a string literal, so
# nothing is trained when this cell runs.
'''nlp.train_Doc2Vec_model(train_df['Token'],
                        train_df['Section'],
                        n_epochs = 10)'''

## **Doc2Vec 학습결과 확인**

#### **88000개 문서로 build하고 87000개 문서로 train**

In [9]:
# Inspect the loaded model's train_count attribute (the recorded value is 1).
nlp.Doc2Vec_model.train_count

1

In [15]:
# Nearest neighbours of a word in the learned embedding space.
# Word-vector queries go through the model's KeyedVectors (`.wv`); calling
# `most_similar` on the model itself is deprecated in gensim 3.x and removed in 4.0.
nlp.Doc2Vec_model.wv.most_similar('시진핑')

[('국가주석', 0.9011991620063782),
 ('주석', 0.8918991684913635),
 ('방중', 0.7639303207397461),
 ('리커창', 0.7269981503486633),
 ('리잔수', 0.7150949239730835),
 ('방북설', 0.7129294872283936),
 ('중국공산당', 0.711692750453949),
 ('심재훈', 0.6904332637786865),
 ('왕치산', 0.6902913451194763),
 ('공산당', 0.684836745262146)]

In [11]:
# Nearest neighbours of a word in the learned embedding space.
# Word-vector queries go through the model's KeyedVectors (`.wv`); calling
# `most_similar` on the model itself is deprecated in gensim 3.x and removed in 4.0.
nlp.Doc2Vec_model.wv.most_similar('AI')

[('지능', 0.8594276905059814),
 ('인공', 0.8072700500488281),
 ('음성인식', 0.7785195112228394),
 ('자연어', 0.738886296749115),
 ('러닝', 0.7320957183837891),
 ('기계학습', 0.7101413011550903),
 ('빅데이터', 0.6975805759429932),
 ('왓슨', 0.6846472024917603),
 ('사물인터넷', 0.6809580326080322),
 ('CTO', 0.6744200587272644)]

<br>


<br>


## **Data setting**

In [147]:
from collections import Counter

# ---------------------------------------------------------------
# Number of documents (rows) requested per label
train_size_for_each_label = 10000
test_size_for_each_label = 100
# ---------------------------------------------------------------

# Sections (labels) used to compare the classifiers.
# To use every section instead: np.unique(train_df['Section'])
testing_section_ls = ['사회','IT과학']

# Training on everything takes a long time, so keep only the rows of the
# selected sections and renumber them 0..n-1.
train_df2 = train_df.loc[train_df['Section'].isin(testing_section_ls)]
train_df2.index = np.arange(len(train_df2))

test_df2 = test_df.loc[test_df['Section'].isin(testing_section_ls)]
test_df2.index = np.arange(len(test_df2))

# Number of distinct labels that survived the filter.
n_class = test_df2['Section'].nunique()

# Total number of rows to request from the batch extractor.
train_batch_size = train_size_for_each_label * n_class
test_batch_size = test_size_for_each_label * n_class

X_train, y_train = nlp.extract_a_equally_splited_batch(
    train_df2['Token'], train_df2['Section'], train_batch_size)
X_test, y_test = nlp.extract_a_equally_splited_batch(
    test_df2['Token'], test_df2['Section'], test_batch_size)

# Progress marker printed before the vector inference starts.
print(1)

# Replace the token lists by vectors inferred with the loaded Doc2Vec model;
# the labels are used unchanged.
X_train = nlp.infer_vectors_with_Doc2Vec(X_train)
X_test = nlp.infer_vectors_with_Doc2Vec(X_test)

# Label distribution of the extracted training batch.
Counter(y_train)

1


Counter({'IT과학': 9996, '사회': 9880})

<br>


# **Logistic Regression**

In [148]:
# Multinomial (softmax) logistic regression on the inferred document vectors.
clf = LogisticRegression(solver = 'newton-cg',
                         multi_class = 'multinomial')

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
# The value is the same either way for plain accuracy, but the documented order
# keeps this call consistent with metrics where the order does matter.
print('Accuracy : ', accuracy_score(y_test, y_pred))

Accuracy :  0.935


<br>



# **Decision Tree**

Decision Tree 모델은 feature들이 각각 의미있는 변수들이면서, 독립적으로 사용될 수 있을 때 유용한 방법이다.


각 변수별로 적절한 기준선을 찾아, 축에 평행한 경계로 공간을 나누기 때문이다.

따라서 Doc2Vec 임베딩처럼 개별 차원(좌표축) 하나하나가 독립적인 의미를 갖지 않는 경우에는, 한 번에 한 축만 기준으로 나누는 방식으로는 클래스를 구분하기 어려워 학습 효과가 현저하게 떨어진다.

In [149]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [150]:
# Single decision tree on the inferred document vectors.
# random_state is fixed so that the fitted tree — and therefore the reported
# accuracy — is the same on every run.
clf = DecisionTreeClassifier(random_state = 42)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
print('Accuracy : ', accuracy_score(y_test, y_pred))

Accuracy :  0.84


<br>



# **RandomForestClassifier**

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
# Random forest of 100 trees, fitted on all available cores (n_jobs = -1).
# random_state is fixed so that the bootstrap samples and feature draws — and
# therefore the reported accuracy — are the same on every run.
clf = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
print('Accuracy : ', accuracy_score(y_test, y_pred))

Accuracy :  0.91


<br>



# **Neural Net**

In [153]:
from sklearn.metrics import confusion_matrix
import tensorflow as tf

## **Data Preprocessing**

In [154]:
# Features: stack the inferred vectors into float32 arrays.
X_train_nn = np.array(X_train, dtype = np.float32)
X_test_nn = np.array(X_test, dtype = np.float32)

# Labels: one-hot encode with pandas, then cast the indicator matrix to float32.
train_onehot = pd.get_dummies(y_train)
test_onehot = pd.get_dummies(y_test)

y_train_nn = train_onehot.values.astype(np.float32)
y_test_nn = test_onehot.values.astype(np.float32)

In [155]:
# Print the shapes of the four arrays: train features, train labels,
# test features, test labels.
print(X_train_nn.shape, y_train_nn.shape, X_test_nn.shape, y_test_nn.shape)

(19876, 100) (19876, 2) (200, 100) (200, 2)


## **Build Layers**

In [162]:
# Reset the default graph so that re-running this cell does not collide with
# the variables 'W1' ... 'W5' created by a previous run.
tf.reset_default_graph()

# ---- mini-batch input pipeline -------------------------------------------
# Split the training set into n_batches batches per epoch. Ceiling division
# keeps it at exactly n_batches; floor division would leave a tiny remainder
# batch (e.g. a single sample) that gets its own noisy optimiser step.
n_batches = 5
n_train_samples = X_train_nn.shape[0]
batch_size = (n_train_samples + n_batches - 1) // n_batches

dataset = tf.data.Dataset.from_tensor_slices((X_train_nn, y_train_nn))
# Shuffle over the whole training set so the batches are not always cut from
# the rows in their stored order.
dataset = dataset.shuffle(buffer_size = n_train_samples)
dataset = dataset.batch(batch_size)

# ---- placeholders ----------------------------------------------------------
n_input = nlp.Doc2Vec_model.vector_size  # dimensionality of one document vector

X = tf.placeholder(tf.float32, shape = [None, n_input])
Y = tf.placeholder(tf.float32, shape = [None, n_class])  # one-hot labels
keep_prob = tf.placeholder(tf.float32)                   # dropout keep probability

# ---- layers: 4 hidden ReLU layers with dropout, then a linear output layer --
n_neuron = 100
xavier = tf.contrib.layers.xavier_initializer()

W1 = tf.get_variable('W1', shape = [n_input, n_neuron], initializer = xavier)
W2 = tf.get_variable('W2', shape = [n_neuron, n_neuron], initializer = xavier)
W3 = tf.get_variable('W3', shape = [n_neuron, n_neuron], initializer = xavier)
W4 = tf.get_variable('W4', shape = [n_neuron, n_neuron], initializer = xavier)
W5 = tf.get_variable('W5', shape = [n_neuron, n_class], initializer = xavier)

b1 = tf.Variable(tf.random_normal([n_neuron]))
b2 = tf.Variable(tf.random_normal([n_neuron]))
b3 = tf.Variable(tf.random_normal([n_neuron]))
b4 = tf.Variable(tf.random_normal([n_neuron]))
b5 = tf.Variable(tf.random_normal([n_class]))

L1 = tf.nn.relu(tf.matmul(X, W1) + b1)
L1 = tf.nn.dropout(L1, keep_prob = keep_prob)

L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
L2 = tf.nn.dropout(L2, keep_prob = keep_prob)

L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
L3 = tf.nn.dropout(L3, keep_prob = keep_prob)

L4 = tf.nn.relu(tf.matmul(L3, W4) + b4)
L4 = tf.nn.dropout(L4, keep_prob = keep_prob)

# Raw class scores; the softmax is derived from the same tensor instead of
# building a second, identical matmul.
logit = tf.matmul(L4, W5) + b5
hypothesis = tf.nn.softmax(logit)

# ---- cost: mean softmax cross-entropy + L2 penalty on the output weights W5 --
lamb = 0.0001
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = logit, labels = Y)) + lamb * tf.reduce_sum(tf.square(W5))

# ---- optimiser ---------------------------------------------------------------
learning_rate = 0.0001
train = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# ---- prediction / accuracy ---------------------------------------------------
prediction = tf.argmax(hypothesis, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(Y, 1), prediction), dtype = tf.float32))

# ---- containers in which the training cells store their per-epoch results ----
train_cost_list = []
train_acc_list = []

test_cost_list = []
test_acc_list = []


# **Run**

### **mini-batch**

In [None]:
# Mini-batch training run.
# Evaluation feed: the whole test set with dropout disabled (keep_prob = 1).
test_dict = {X: X_test_nn, Y: y_test_nn, keep_prob : 1}

training_epochs = 1500

# Start this run with empty histories so that re-running the cell (or running
# it after another training cell) does not concatenate curves of several runs.
train_cost_list = []
test_cost_list = []
test_acc_list = []

iterator = dataset.make_initializable_iterator()
next_features, next_labels = iterator.get_next()

# launch graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):
        # Rewind the iterator to the start of the dataset for this epoch.
        sess.run(iterator.initializer)
        cost_sum = 0.0   # sum of (batch cost * batch size) over the epoch
        n_seen = 0       # number of training samples consumed in the epoch

        while True:
            # Only the fetch can signal the end of the epoch, so only the fetch
            # is guarded.
            try:
                batch_x, batch_y = sess.run([next_features, next_labels])
            except tf.errors.OutOfRangeError:
                break

            # One optimiser step with dropout enabled (keep_prob = 0.7).
            feed_dict = {X : batch_x, Y: batch_y, keep_prob : 0.7}
            c, _ = sess.run([cost, train], feed_dict = feed_dict)

            cost_sum += c * len(batch_x)
            n_seen += len(batch_x)

        # Per-sample average over the epoch. The original accumulated a plain sum
        # of batch costs, which is not on the same scale as the mean test cost
        # that it is plotted against.
        avg_cost = cost_sum / max(n_seen, 1)

        acc, test_cost = sess.run([accuracy, cost], feed_dict = test_dict)

        train_cost_list.append(avg_cost)
        test_cost_list.append(test_cost)

        # Report (and record the accuracy) every 100 epochs.
        if (epoch+1) % (100) == 0 :
            test_acc_list.append(acc)

            print('Epoch : %s'%(epoch+1), 'cost :',test_cost)
            print('Accuracy :', acc)

    # Final evaluation; y_pred holds the predicted class index per test sample.
    acc, y_pred, test_cost = sess.run([accuracy, prediction, cost], feed_dict = test_dict)
    

In [None]:
# Train vs. test cost per epoch, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize = (12, 6))
ax.plot(train_cost_list, label = 'train_cost')
ax.plot(test_cost_list, label = 'test_cost')
ax.legend(loc = 'best')

### **full-batch**

In [None]:
# Full-batch training run: every epoch is exactly one optimiser step on the
# whole training set.
# Evaluation feed: the whole test set with dropout disabled (keep_prob = 1).
test_dict = {X: X_test_nn, Y: y_test_nn, keep_prob : 1}

training_epochs = 1000

# Start this run with empty histories; otherwise the curves recorded by an
# earlier training cell would be mixed into this run's plot.
train_cost_list = []
test_cost_list = []
test_acc_list = []

# Training feed: all training samples at once, dropout enabled (keep_prob = 0.7).
feed_dict = {X : X_train_nn, Y: y_train_nn, keep_prob : 0.7}

# launch graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(training_epochs):
        # A single step per epoch. The original wrapped this in
        # `while True: ... except tf.errors.OutOfRangeError: break`, but nothing
        # inside that loop pulled from a dataset iterator, so the exception could
        # never be raised and the first epoch never finished.
        avg_cost, _ = sess.run([cost, train], feed_dict = feed_dict)

        acc, y_pred, test_cost = sess.run([accuracy, prediction, cost], feed_dict = test_dict)

        train_cost_list.append(avg_cost)
        test_cost_list.append(test_cost)

        # Report (and record the accuracy) every 100 epochs.
        if (epoch+1) % (100) == 0 :
            test_acc_list.append(acc)

            print('Epoch : %s'%(epoch+1), 'cost :',test_cost)
            print('Accuracy :', acc)


In [None]:
# Overlay the recorded train and test cost histories on one 12x6 figure.
plt.figure(figsize = (12, 6))
for cost_history, curve_label in ((train_cost_list, 'train_cost'),
                                  (test_cost_list, 'test_cost')):
    plt.plot(cost_history, label = curve_label)
plt.legend(loc = 'best')