# Sentence classification by MorphConv
Implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) to classify sentiment of movie review

### Explanation of this notebook
* Dataset : [Naver sentiment movie corpus v1.0](https://github.com/e9t/nsmc)
    + train, validation : splitting `ratings_train.txt` (150k reviews) for train (120k reviews) and validation (30k reviews)
    + test : `ratings_test.txt` (50k reviews)
* Preprocessing
    + Morphological analysis by Mecab wrapped by [konlpy](http://konlpy.org/en/latest/)
    + Using [FastText](https://arxiv.org/abs/1607.04606) embedding by [gluonnlp package](https://gluon-nlp.mxnet.io/)

### Setup

In [1]:
import os, sys
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook as tqdm

print(tf.__version__)

import khaiii
import re

1.12.0


### Loading dataset

In [2]:
ratings = pd.read_csv('./data/ratings_train.txt', sep = '\t')[['document', 'label']]
ratings_tst = pd.read_csv('./data/ratings_test.txt', sep = '\t')[['document', 'label']]

# ratings, ratings_tst의 document column에 nan 값이 있으므로 이를 빈 문자열로 대체
print(sum(ratings.document.isna()), sum(ratings_tst.document.isna()))

ratings.document[ratings.document.isna()] = ''
ratings_tst.document[ratings_tst.document.isna()] = ''

print(sum(ratings.document.isna()), sum(ratings_tst.document.isna()))

5 3
0 0


In [3]:
val_indices = np.random.choice(a = range(ratings.shape[0]), size = int(ratings.shape[0] * .2),
                               replace = False)
tr_indices = np.delete(arr = range(ratings.shape[0]), obj = val_indices, axis = 0)

ratings_tr = ratings.iloc[tr_indices,:]
ratings_val = ratings.iloc[val_indices,:]

print(ratings_tr.shape, ratings_val.shape, ratings_tst.shape)

(120000, 2) (30000, 2) (50000, 2)


In [4]:
# ## khaiii test

# tag_list = ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'VV', 'VA', 
#             'VX', 'VCP', 'VCN', 'MM', 'MAG', 'MAJ', 'IC']

# text = '재미없다 지루하고. 같은 음식 영화인데도 바베트의 만찬하고 넘 차이남....바베트의 만찬은 이야기도 있고 음식 보는재미도 있는데 ; 이건 볼게없다 음식도 별로 안나오고, 핀란드 풍경이라도 구경할랫는데 그것도 별로 안나옴 ㅡㅡ'
# # text = re.sub('[^ㄱ-ㅎ|ㅏ-ㅣ|가-힣]', ' ', text)
# # text = re.sub('\s+', ' ', text)

# result_word = [m.lex for word in api.analyze(text)
#                       for m in word.morphs
#                         if m.tag in tag_list]
# result_word[:10]

### Preprocessing dataset

In [6]:
def make_morphs(text):
    tag_list = ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'VV', 'VA', 
            'VX', 'VCP', 'VCN', 'MM', 'MAG', 'MAJ', 'IC']
    
    # text = re.sub('[^ㄱ-ㅎ|ㅏ-ㅣ|가-힣]', ' ', text)
    # text = re.sub('\s+', ' ', text)

    if not text.strip():
        return []
    
    result = api.analyze(text)
    result_word = [m.lex for word in result
                      for m in word.morphs
                        if m.tag in tag_list]
    return result_word

In [5]:
# mecab = konlpy.tag.Mecab() # 어떠한 분석기라도 상관이 없음
api = khaiii.KhaiiiApi()
api.open()

Path: /usr/local/lib/python3.6/dist-packages/khaiii


In [7]:
%%time
# train
print('Make train data......')
X_tr = ratings_tr.document.apply(make_morphs).tolist()
y_tr = ratings_tr.label.tolist()

# validation
print('Make validation data......')
X_val = ratings_val.document.apply(make_morphs).tolist()
y_val = ratings_val.label.tolist()

# test
print('Make test data......')
X_tst = ratings_tst.document.apply(make_morphs).tolist()
y_tst = ratings_tst.label.tolist()

Make train data......
Make validation data......
Make test data......
CPU times: user 2min 37s, sys: 364 ms, total: 2min 37s
Wall time: 2min 37s


In [None]:
api.close()

#### Building vocabulary and connecting vocabulary with fasttext embedding
https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html

In [8]:
# training dataset 기반으로 vocab 생성
counter = nlp.data.count_tokens(itertools.chain.from_iterable([c for c in X_tr]))
vocab = nlp.Vocab(counter,bos_token=None, eos_token=None, min_freq=10)

Counter({'아': 4091,
         '더빙': 310,
         '진짜': 4694,
         '짜증': 834,
         '나': 10268,
         '네': 847,
         '목소리': 236,
         '보': 34221,
         '것': 10883,
         '교도소': 6,
         '이야기': 1548,
         '이': 58149,
         '솔직히': 811,
         '재미': 2999,
         '없': 11961,
         '평점': 4549,
         '조정': 36,
         '페그': 2,
         '익살': 6,
         '연기': 4885,
         '돋보이': 182,
         '영화': 37450,
         '스파이더맨': 35,
         '늙': 150,
         '하': 18287,
         '커': 81,
         '스': 1356,
         '틴': 45,
         '던': 59,
         '스트': 57,
         '너무나': 552,
         '이쁘': 602,
         '보이': 1962,
         '막': 319,
         '걸음마': 2,
         '떼': 118,
         '세': 654,
         '초등학교': 86,
         '살': 1453,
         '용영화': 8,
         'ㅋㅋㅋ': 1124,
         '.별반개': 3,
         '아깝': 2904,
         '액션': 1820,
         '있': 13554,
         '몇': 908,
         '안': 4867,
         '왜케': 86,
         '낮': 953,
         '꽤': 36

In [9]:
# Loading fasttext embedding 
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

# vocab에 embedding 연결
vocab.set_embedding(fasttext_simple)

In [10]:
%%time
# final preprocessing

X_tr = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], X_tr))
X_tr = pad_sequences(sequences = X_tr, maxlen = 30, padding = 'pre', value = 1.)

X_val = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], X_val))
X_val = pad_sequences(sequences = X_val, maxlen = 30, padding = 'pre', value = 1.)

X_tst = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], X_tst))
X_tst = pad_sequences(sequences = X_tst, maxlen = 30, padding = 'pre', value = 1.)

CPU times: user 2.51 s, sys: 24 ms, total: 2.53 s
Wall time: 2.55 s


### Define MorphConv class

In [11]:
class MorphConv:
    def __init__(self, X, y, n_of_classes, embedding):
        
        with tf.variable_scope('input_layer'):
            self.__X = X
            self.__y = y
            self.is_training = tf.placeholder(dtype = tf.bool)
        
        with tf.variable_scope('embedding_layer'):
            static_embed = tf.get_variable(name = 'static', initializer = embedding,
                                           trainable = False)
            non_static_embed = tf.get_variable(name = 'non_static', initializer = embedding,
                                               trainable = True)
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self.__X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self.__X)
            
        with tf.variable_scope('convoluion_layer'):
            with tf.variable_scope('tri_gram'):
                
                tri_gram = keras.layers.Conv1D(filters = 100, kernel_size = 3,
                                               activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_3 = tri_gram(static_batch)
                non_static_3 = tri_gram(non_static_batch)
            
            with tf.variable_scope('tetra_gram'):
                tetra_gram = keras.layers.Conv1D(filters = 100, kernel_size = 4,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')
                
                static_4 = tetra_gram(static_batch)
                non_static_4 = tetra_gram(non_static_batch)
            
            with tf.variable_scope('penta_gram'):
                penta_gram = keras.layers.Conv1D(filters = 100, kernel_size = 5,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')
                
                static_5 = penta_gram(static_batch)
                non_static_5 = penta_gram(non_static_batch)

            fmap_3 = tf.reduce_max(static_3 + non_static_3, axis = 1)
            fmap_4 = tf.reduce_max(static_4 + non_static_4, axis = 1)
            fmap_5 = tf.reduce_max(static_5 + non_static_5, axis = 1)
            
        with tf.variable_scope('output_layer'):
            flattened = tf.concat([fmap_3, fmap_4, fmap_5], axis = -1)
            score = keras.layers.Dense(units = n_of_classes,
                                       kernel_regularizer = keras.regularizers.l2(.7))(flattened)
            
            self.__score = keras.layers.Dropout(rate = .5)(score, training = self.is_training)

        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self.__y, logits = self.__score)
            reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.total_loss = ce_loss + reg_term
        
        with tf.variable_scope('prediction'):
            self.prediction = tf.argmax(self.__score, axis = -1)
        
    # predict instance method for small dataset
    def predict(self, sess, x_data, is_training = False):
        feed_prediction = {self.__X : x_data, self.is_training : is_training}
        return sess.run(self.prediction, feed_dict = feed_prediction)

### Create a model of MorphConv

In [80]:
# hyper-parameter
lr = 0.002
epochs = 30
batch_size = 5000
total_step = int(X_tr.shape[0] / batch_size)
print(total_step)

24


In [81]:
# train
tr_dataset = tf.data.Dataset.from_tensor_slices((X_tr, y_tr))
tr_dataset = tr_dataset.shuffle(buffer_size = 1000000)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()

In [82]:
# val
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(batch_size = batch_size)
val_iterator = val_dataset.make_initializable_iterator()

In [83]:
# anonymous iterator
handle = tf.placeholder(dtype = tf.string)
iterator = tf.data.Iterator.from_string_handle(string_handle = handle,
                                               output_types = tr_iterator.output_types,
                                               output_shapes = tr_iterator.output_shapes)
x_data, y_data = iterator.get_next()

In [84]:
morph_conv = MorphConv(X = x_data, y = y_data, n_of_classes = 2,
                       embedding = vocab.embedding.idx_to_vec.asnumpy())

In [85]:
# create training op
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = morph_conv.total_loss)

In [86]:
saver = tf.train.Saver(max_to_keep=30)
save_dir = 'checkpoints/'
os.makedirs(save_dir, exist_ok=True)

### Training

In [87]:
# sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess_config = tf.ConfigProto(device_count = {'GPU': 0})
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())
tr_handle, val_handle = sess.run(fetches = [tr_iterator.string_handle(), val_iterator.string_handle()])

In [88]:
%%time

tr_loss_hist = []
val_loss_hist = []

for epoch in tqdm(range(epochs)):

    avg_tr_loss = 0
    avg_val_loss = 0
    tr_step = 0
    val_step = 0

    # for mini-batch training
    sess.run(tr_iterator.initializer)    
    try:
        
        while True:
            _, tr_loss = sess.run(fetches = [training_op, morph_conv.total_loss],
                                             feed_dict = {handle : tr_handle, morph_conv.is_training : True})
            avg_tr_loss += tr_loss
            tr_step += 1

    except tf.errors.OutOfRangeError:
        pass

    # for validation
    sess.run(val_iterator.initializer)
    try:
        while True:
            val_loss = sess.run(fetches = morph_conv.total_loss,
                                feed_dict = {handle : val_handle, morph_conv.is_training : False})
            avg_val_loss += val_loss
            val_step += 1
    
    except tf.errors.OutOfRangeError:
        pass

    avg_tr_loss /= tr_step
    avg_val_loss /= val_step
    tr_loss_hist.append(avg_tr_loss)
    val_loss_hist.append(avg_val_loss)
    
    saver.save(sess=sess, 
               save_path=save_dir+str(epoch).zfill(3)+'_'+str(int(avg_train_loss*1000)).zfill(4)+'_'+str(int(avg_val_loss*1000)).zfill(4)+'.ckpt')
    
    print('epoch : {:3}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_tr_loss, avg_val_loss))
    
    threshold = 5
    if epoch >= 5:
        if all([prev_val_loss > val_loss for prev_val_loss 
                in val_loss_hist[epoch-threshold:epoch]]):
            break

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

epoch :   1, tr_loss : 1.279, val_loss : 0.619
epoch :   2, tr_loss : 0.592, val_loss : 0.499
epoch :   3, tr_loss : 0.519, val_loss : 0.457
epoch :   4, tr_loss : 0.484, val_loss : 0.434
epoch :   5, tr_loss : 0.463, val_loss : 0.420
epoch :   6, tr_loss : 0.446, val_loss : 0.411
epoch :   7, tr_loss : 0.434, val_loss : 0.406
epoch :   8, tr_loss : 0.421, val_loss : 0.402
epoch :   9, tr_loss : 0.411, val_loss : 0.399
epoch :  10, tr_loss : 0.400, val_loss : 0.398
epoch :  11, tr_loss : 0.392, val_loss : 0.398
epoch :  12, tr_loss : 0.381, val_loss : 0.395
epoch :  13, tr_loss : 0.374, val_loss : 0.396
epoch :  14, tr_loss : 0.364, val_loss : 0.396
epoch :  15, tr_loss : 0.356, val_loss : 0.397
epoch :  16, tr_loss : 0.348, val_loss : 0.397
epoch :  17, tr_loss : 0.340, val_loss : 0.398
epoch :  18, tr_loss : 0.330, val_loss : 0.398
epoch :  19, tr_loss : 0.324, val_loss : 0.401
epoch :  20, tr_loss : 0.316, val_loss : 0.408
epoch :  21, tr_loss : 0.308, val_loss : 0.408
epoch :  22, 

KeyboardInterrupt: 

In [79]:
tf.reset_default_graph()

### Test

In [93]:
saver.restore(sess, save_dir+'017_398')

INFO:tensorflow:Restoring parameters from checkpoints/017_398


INFO:tensorflow:Restoring parameters from checkpoints/017_398


In [94]:
tst_dataset = tf.data.Dataset.from_tensor_slices((X_tst, y_tst))
tst_dataset = tst_dataset.batch(batch_size = batch_size)
tst_iterator = tst_dataset.make_initializable_iterator()

In [95]:
tst_handle = sess.run(tst_iterator.string_handle())

In [96]:
y_tst_hat = np.array([])

sess.run(tst_iterator.initializer)

try:
    while True:
        y_tst_tmp = sess.run(morph_conv.prediction,
                            feed_dict = {handle : tst_handle,
                                         morph_conv.is_training : False})
        y_tst_hat= np.append(y_tst_hat,y_tst_tmp)

except tf.errors.OutOfRangeError:
    pass

In [97]:
print('test acc : {:.2%}'.format(np.mean(y_tst_hat == np.array(y_tst))))

test acc : 81.66%
