# Sentence classification by MorphConv
Implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) to classify sentiment of movie review

### Explanation of this notebook
* Dataset : [Naver sentiment movie corpus v1.0](https://github.com/e9t/nsmc)
    + train, validation : splitting `ratings_train.txt` (150k reviews) for train (120k reviews) and validation (30k reviews)
    + test : `ratings_test.txt` (50k reviews)
* Preprocessing
    + Morphological analysis by Mecab wrapped by [konlpy](http://konlpy.org/en/latest/)
    + Using [FastText](https://arxiv.org/abs/1607.04606) embedding by [gluonnlp package](https://gluon-nlp.mxnet.io/)

### Setup

In [1]:
import os, sys
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
import keras
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook as tqdm

print(tf.__version__)

# import khaiii
import re

1.8.0


Using TensorFlow backend.


### Loading dataset

In [2]:
ratings_train = pd.read_csv('data/ratings_train.txt', sep = '\t')[['document', 'label']]
ratings_test = pd.read_csv('data/ratings_test.txt', sep = '\t')[['document', 'label']]

# ratings, ratings_tst의 document column에 nan 값이 있으므로 이를 빈 문자열로 대체
print(sum(ratings_train.document.isna()), sum(ratings_test.document.isna()))

ratings_train.document[ratings_train.document.isna()] = ''
ratings_test.document[ratings_test.document.isna()] = ''

print(sum(ratings_train.document.isna()), sum(ratings_test.document.isna()))

5 3
0 0


### Use khaiii for preprocessing

In [3]:
def make_morphs(text):
#     tag_list = ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'VV', 'VA', 
#             'VX', 'VCP', 'VCN', 'MM', 'MAG', 'MAJ', 'IC']
    tag_list = ['NNG', 'NNP', 'NP', 'NR', 'VV', 'VA', 'VX', 'VCP', 'VCN',
                'MM', 'MAG', 'MAJ', 'IC', 'SN', 'SW', 'SWK', 'SO', 'XR',
                'SH', 'SL', 'ZN', 'ZV', 'SP', 'SE']

#     text = re.sub('[^ㅎㅎ|^^|ㅡ|\-|~|;|♥|♡|★|ㅠ|ㅜ|a-z|A-Z|ㄱ-ㅎ|가-힣|\'|\"|\,|\.|\!|\?|\d]', 
#                   ' ', text)
    text = re.sub('(ㅡ.ㅡ|-.-)', ' ㅡㅡ ', text)
    text = re.sub('(ㅡ|-){2,}', ' ㅡㅡ ', text)
    text = re.sub('(ㄱ-ㅎ|^){2,}', ' \g<1>\g<1> ', text)
    text = re.sub('(♥|♡)+', ' ♥♥ ', text)
    text = re.sub('(★|;|~)+', ' \g<1>\g<1> ', text)
    text = re.sub('[ㅠ|ㅜ]+', ' ㅠ ', text)
    text = re.sub('\s+', ' ', text)
    
    text = text.strip()
    if not text:
        return ''

    result = api.analyze(text) 
    result_word = [m.lex+'다' if m.tag.startswith('V') else m.lex
                       for word in result
                          for m in word.morphs
                            if m.tag in tag_list]
    result_words = '+'.join(result_word)
    result_words = result_words.replace('^+^', '^^')
    result_words = result_words.replace('ㅠ', 'ㅠㅠ')
    return result_words

In [4]:
api = khaiii.KhaiiiApi()
api.open()

Path: /usr/local/lib/python3.6/dist-packages/khaiii


In [5]:
%%time
# train
print('Make train morphs......')
ratings_train['morphs'] = ratings_train['document'].apply(make_morphs)

# test
print('Make test morphs......')
ratings_test['morphs'] = ratings_test['document'].apply(make_morphs)

Make train morphs......
Make test morphs......
CPU times: user 3min 25s, sys: 400 ms, total: 3min 26s
Wall time: 3min 26s


In [6]:
api.close()

In [7]:
ratings_train.to_csv('data/ratings_train_khaiii.txt', sep='\t', index=False)
ratings_test.to_csv('data/ratings_test_khaiii.txt', sep='\t', index=False)

### Reload data

In [2]:
ratings = pd.read_csv('data/ratings_train_khaiii.txt', sep = '\t')[['morphs', 'label']]
ratings_test = pd.read_csv('data/ratings_test_khaiii.txt', sep = '\t')[['morphs', 'label']]

# ratings, ratings_tst의 document column에 nan 값이 있으므로 이를 빈 문자열로 대체
print(sum(ratings.morphs.isna()), sum(ratings_test.morphs.isna()))

ratings.morphs[ratings.morphs.isna()] = ''
ratings_test.morphs[ratings_test.morphs.isna()] = ''

print(sum(ratings.morphs.isna()), sum(ratings_test.morphs.isna()))

89 29
0 0


In [3]:
val_indices = np.random.choice(a = range(ratings.shape[0]), size = int(ratings.shape[0] * .2),
                               replace = False)
train_indices = np.delete(arr = range(ratings.shape[0]), obj = val_indices, axis = 0)

ratings_train = ratings.iloc[train_indices,:]
ratings_val = ratings.iloc[val_indices,:]

print(ratings_train.shape, ratings_val.shape, ratings_test.shape)

(120000, 2) (30000, 2) (50000, 2)


### Preprocessing dataset

In [4]:
def split_word(text):
#     return text.split('+')
    return [word for word in text.split('+')]

In [5]:
%%time
# train
print('Make train data......')
x_train_word = ratings_train.morphs.apply(split_word).tolist()
y_train = ratings_train.label.tolist()

# validation
print('Make validation data......')
x_val_word = ratings_val.morphs.apply(split_word).tolist()
y_val = ratings_val.label.tolist()

# test
print('Make test data......')
x_test_word = ratings_test.morphs.apply(split_word).tolist()
y_test = ratings_test.label.tolist()

Make train data......
Make validation data......
Make test data......
CPU times: user 729 ms, sys: 89.6 ms, total: 818 ms
Wall time: 816 ms


#### Building vocabulary and connecting vocabulary with fasttext embedding
https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html

In [6]:
# training dataset 기반으로 vocab 생성
counter = nlp.data.count_tokens(itertools.chain.from_iterable([c for c in x_train_word]))
vocab = nlp.Vocab(counter,bos_token=None, eos_token=None, min_freq=5)

In [7]:
# Loading fasttext embedding 
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

# vocab에 embedding 연결
vocab.set_embedding(fasttext_simple)

In [8]:
%%time
# final preprocessing

x_train = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], x_train_word))
x_train = pad_sequences(sequences = x_train, maxlen = 20, padding = 'pre', value = 1.)

x_val = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], x_val_word))
x_val = pad_sequences(sequences = x_val, maxlen = 20, padding = 'pre', value = 1.)

x_test = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], x_test_word))
x_test = pad_sequences(sequences = x_test, maxlen = 20, padding = 'pre', value = 1.)

CPU times: user 3.88 s, sys: 105 ms, total: 3.99 s
Wall time: 3.99 s


### Define MorphConv class

In [9]:
class MorphConv:
    def __init__(self, X, y, n_of_classes, embedding):
        
        with tf.variable_scope('input_layer'):
            self.__X = X
            self.__y = y
            self.is_training = tf.placeholder(dtype = tf.bool)
        
        with tf.variable_scope('embedding_layer'):
            static_embed = tf.get_variable(name = 'static', initializer = embedding,
                                           trainable = False)
            non_static_embed = tf.get_variable(name = 'non_static', initializer = embedding,
                                               trainable = True)
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self.__X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self.__X)
            
        with tf.variable_scope('convoluion_layer'):
            with tf.variable_scope('tri_gram'):
                
                tri_gram = keras.layers.Conv1D(filters = 100, kernel_size = 3,
                                               activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_3 = tri_gram(static_batch)
                non_static_3 = tri_gram(non_static_batch)
            
            with tf.variable_scope('tetra_gram'):
                tetra_gram = keras.layers.Conv1D(filters = 100, kernel_size = 4,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')
                
                static_4 = tetra_gram(static_batch)
                non_static_4 = tetra_gram(non_static_batch)
            
            with tf.variable_scope('penta_gram'):
                penta_gram = keras.layers.Conv1D(filters = 100, kernel_size = 5,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')
                
                static_5 = penta_gram(static_batch)
                non_static_5 = penta_gram(non_static_batch)

            fmap_3 = tf.reduce_max(static_3 + non_static_3, axis = 1)
            fmap_4 = tf.reduce_max(static_4 + non_static_4, axis = 1)
            fmap_5 = tf.reduce_max(static_5 + non_static_5, axis = 1)
            
        with tf.variable_scope('output_layer'):
            flattened = tf.concat([fmap_3, fmap_4, fmap_5], axis = -1)
            score = keras.layers.Dense(units = n_of_classes,
                                       kernel_regularizer = keras.regularizers.l2(.7))(flattened)
            
            self.__score = keras.layers.Dropout(rate = .5)(score, training = self.is_training)

        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self.__y, logits = self.__score)
            reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.total_loss = ce_loss + reg_term
        
        with tf.variable_scope('prediction'):
            self.prediction = tf.argmax(self.__score, axis = -1)
        
    # predict instance method for small dataset
    def predict(self, sess, x_data, is_training = False):
        feed_prediction = {self.__X : x_data, self.is_training : is_training}
        return sess.run(self.prediction, feed_dict = feed_prediction)

### Create a model of MorphConv

In [10]:
# hyper-parameter
lr = 0.003
epochs = 30
batch_size = 10000
total_step = int(x_train.shape[0] / batch_size)
print(total_step)

12


In [11]:
# train
tr_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
tr_dataset = tr_dataset.shuffle(buffer_size = 1000000)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()

In [12]:
# val
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size = batch_size)
val_iterator = val_dataset.make_initializable_iterator()

In [13]:
# anonymous iterator
handle = tf.placeholder(dtype = tf.string)
iterator = tf.data.Iterator.from_string_handle(string_handle = handle,
                                               output_types = tr_iterator.output_types,
                                               output_shapes = tr_iterator.output_shapes)
x_data, y_data = iterator.get_next()

In [14]:
morph_conv = MorphConv(X = x_data, y = y_data, n_of_classes = 2,
                       embedding = vocab.embedding.idx_to_vec.asnumpy())

In [15]:
# create training op
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = morph_conv.total_loss)

In [16]:
saver = tf.train.Saver(max_to_keep=30)
save_dir = 'checkpoints/'
os.makedirs(save_dir, exist_ok=True)
os.system('rm -rf '+save_dir+'*ckpt*')

0

### Training

In [17]:
sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
# sess_config = tf.ConfigProto(device_count = {'GPU': 0})
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())
tr_handle, val_handle = sess.run(fetches = [tr_iterator.string_handle(), val_iterator.string_handle()])

In [18]:
%%time

train_loss_hist = []
val_loss_hist = []

for epoch in tqdm(range(epochs)):

    avg_train_loss = 0
    avg_val_loss = 0
    tr_step = 0
    val_step = 0

    # for mini-batch training
    sess.run(tr_iterator.initializer)    
    try:
        
        while True:
            _, train_loss = sess.run(fetches = [training_op, morph_conv.total_loss],
                                             feed_dict = {handle : tr_handle, morph_conv.is_training : True})
            avg_train_loss += train_loss
            tr_step += 1

    except tf.errors.OutOfRangeError:
        pass

    # for validation
    sess.run(val_iterator.initializer)
    try:
        while True:
            val_loss = sess.run(fetches = morph_conv.total_loss,
                                feed_dict = {handle : val_handle, morph_conv.is_training : False})
            avg_val_loss += val_loss
            val_step += 1
    
    except tf.errors.OutOfRangeError:
        pass

    avg_train_loss /= tr_step
    avg_val_loss /= val_step
    train_loss_hist.append(avg_train_loss)
    val_loss_hist.append(avg_val_loss)
    
    saver.save(sess=sess, 
               save_path=save_dir+str(epoch+1).zfill(3)+'_'+str(int(avg_train_loss*1000)).zfill(4)+'_'+str(int(avg_val_loss*1000)).zfill(4)+'.ckpt')
    
    print('epoch : {:3}, train_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_train_loss, avg_val_loss))
    
#     threshold = 5
#     if epoch >= 5:
#         print([prev_val_loss < avg_val_loss for prev_val_loss 
#                 in val_loss_hist[epoch-threshold:epoch]])
#         if all([prev_val_loss < avg_val_loss for prev_val_loss 
#                 in val_loss_hist[epoch-threshold:epoch]]):
#             break


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

epoch :   1, train_loss : 2.114, val_loss : 0.610
epoch :   2, train_loss : 0.742, val_loss : 0.544
epoch :   3, train_loss : 0.577, val_loss : 0.487
epoch :   4, train_loss : 0.512, val_loss : 0.448
epoch :   5, train_loss : 0.474, val_loss : 0.424
epoch :   6, train_loss : 0.452, val_loss : 0.408
epoch :   7, train_loss : 0.434, val_loss : 0.397
epoch :   8, train_loss : 0.420, val_loss : 0.390
epoch :   9, train_loss : 0.407, val_loss : 0.385
epoch :  10, train_loss : 0.395, val_loss : 0.381
epoch :  11, train_loss : 0.384, val_loss : 0.378
epoch :  12, train_loss : 0.374, val_loss : 0.377
epoch :  13, train_loss : 0.367, val_loss : 0.377
epoch :  14, train_loss : 0.358, val_loss : 0.377
epoch :  15, train_loss : 0.348, val_loss : 0.378
epoch :  16, train_loss : 0.344, val_loss : 0.380
epoch :  17, train_loss : 0.337, val_loss : 0.380
epoch :  18, train_loss : 0.327, val_loss : 0.382
epoch :  19, train_loss : 0.323, val_loss : 0.385
epoch :  20, train_loss : 0.315, val_loss : 0.387


In [19]:
# tf.reset_default_graph()

### Test

In [20]:
# saver.restore(sess, save_dir+'016_0381_0390.ckpt')

In [21]:
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.batch(batch_size = batch_size)
test_iterator = test_dataset.make_initializable_iterator()

In [22]:
test_handle = sess.run(test_iterator.string_handle())

In [23]:
y_test_hat = np.array([])

sess.run(test_iterator.initializer)

try:
    while True:
        y_test_tmp = sess.run(morph_conv.prediction,
                            feed_dict = {handle : test_handle,
                                         morph_conv.is_training : False})
        y_test_hat= np.append(y_test_hat, y_test_tmp)

except tf.errors.OutOfRangeError:
    pass

In [24]:
print('test acc : {:.2%}'.format(np.mean(y_test_hat == np.array(y_test))))

test acc : 82.14%


In [25]:
x_test[y_test_hat == np.array(y_test)]

array([[   1,    1,    1, ...,    1, 1073,   19],
       [   1,    1,    1, ...,    1,    1,    0],
       [   1,    1,    1, ..., 3384,   23,   21],
       ...,
       [   1,    1,    1, ...,  274, 2592,   22],
       [   1,    1,    1, ...,   37,    6,  225],
       [   1,    1,    1, ...,  137,   46,   12]], dtype=int32)