# Sentence classification by MorphConv
Implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) to classify sentiment of movie review

### Explanation of this notebook
* Dataset : [Naver sentiment movie corpus v1.0](https://github.com/e9t/nsmc)
    + train, validation : splitting `ratings_train.txt` (150k reviews) for train (120k reviews) and validation (30k reviews)
    + test : `ratings_test.txt` (50k reviews)
* Preprocessing
    + Morphological analysis by Mecab wrapped by [konlpy](http://konlpy.org/en/latest/)
    + Using [FastText](https://arxiv.org/abs/1607.04606) embedding by [gluonnlp package](https://gluon-nlp.mxnet.io/)

### Setup

In [1]:
import os, sys
from konlpy.tag import Mecab
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import itertools
import keras
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm_notebook as tqdm
from pprint import pprint

import re
from sklearn.model_selection import train_test_split

print(tf.__version__)

1.8.0


Using TensorFlow backend.


### Loading dataset

In [125]:
ratings_train = pd.read_csv('data/ratings_train.txt', sep = '\t')[['document', 'label']]
ratings_test = pd.read_csv('data/ratings_test.txt', sep = '\t')[['document', 'label']]

# ratings, ratings_tst의 document column에 nan 값이 있으므로 이를 빈 문자열로 대체
print(sum(ratings_train.document.isna()), sum(ratings_test.document.isna()))

ratings_train.document[ratings_train.document.isna()] = ''
ratings_test.document[ratings_test.document.isna()] = ''

print(sum(ratings_train.document.isna()), sum(ratings_test.document.isna()))

5 3
0 0


### Use mecab for preprocessing

In [126]:
def make_morphs(text):
    cleaned_text = re.sub('[^a-z|A-Z|가-힣|0-9|\,|\.|\!|\?]', ' ', text)
    cleaned_text = re.sub('(\!|\?){2,}', '\g<1>', cleaned_text)
    cleaned_text = cleaned_text.strip()
    base_words = mecab.morphs(cleaned_text)
    base_words = [word for word in base_words if '.' != word and ',' not in word]
    base_words = [word if '..' not in word else '...' for word in base_words]
    base_words = [word if word != '!' else '!!' for word in base_words]
    base_words = [word if word != '?' else '??' for word in base_words]

    
    sp_text = re.sub('[^ㅎㅎ|^^|ㅡㅡ|\-\-|~|;|♥|♡|★|ㅠ|ㅜ|ㅋ|ㅎ|ㅇ|ㅂ|ㅅ|ㅊ|ㅈ|ㄷ|ㄴ|ㅌ]', 
                     ' ', text)
    
    sp_text = re.sub('(ㅡ.ㅡ|-.-)', ' ㅡㅡ ', sp_text)
    sp_text = re.sub('(-|ㅡ){2,}', ' ㅡㅡ ', sp_text)
    sp_text = re.sub('(ㅋ|ㅎ|ㅇ|ㅂ|ㅅ|ㅊ|ㅈ|ㄷ|ㄴ|ㅌ){2,}', ' \g<1>\g<1> ', sp_text)
    sp_text = re.sub('(♥|♡)+', ' ♥♥ ', sp_text)
    sp_text = re.sub('(★|;|~)+', ' \g<1>\g<1> ', sp_text)
    sp_text = re.sub('\^+', ' ^^ ', sp_text)
    sp_text = re.sub('[ㅠ|ㅜ]+', ' ㅠㅠ ', sp_text)
    sp_text = re.sub('\s+', ' ', sp_text)
    sp_text = sp_text.strip()
    
    sp_words = sp_text.split(' ')
    sp_words = [word for word in sp_words if 'ㅡ' != word]
    
    result_text = base_words + sp_words if not '' in sp_words else base_words
    if not result_text:
        return ''
    
#     result_word = [word for word in result_word if len(word) > 1]
    result_words = '+'.join(result_text)
    return result_words

In [127]:
%%time

mecab = Mecab()

# train
print('Make train morphs......')
ratings_train['morphs'] = ratings_train['document'].apply(make_morphs)

# test
print('Make test morphs......')
ratings_test['morphs'] = ratings_test['document'].apply(make_morphs)

Make train morphs......
Make test morphs......
CPU times: user 40.2 s, sys: 135 ms, total: 40.3 s
Wall time: 40.3 s


In [128]:
ratings_train.to_csv('data/ratings_train_mecab_spword.txt', sep='\t', index=False)
ratings_test.to_csv('data/ratings_test_mecab_spword.txt', sep='\t', index=False)

### Reload data

In [2]:
ratings = pd.read_csv('data/ratings_train_mecab_spword.txt', sep = '\t')[['morphs', 'label']]
ratings_test = pd.read_csv('data/ratings_test_mecab_spword.txt', sep = '\t')[['morphs', 'label']]

# ratings, ratings_tst의 document column에 nan 값이 있으므로 이를 빈 문자열로 대체
print(sum(ratings.morphs.isna()), sum(ratings_test.morphs.isna()))

ratings.morphs[ratings.morphs.isna()] = ''
ratings_test.morphs[ratings_test.morphs.isna()] = ''

print(sum(ratings.morphs.isna()), sum(ratings_test.morphs.isna()))

167 56
0 0


### Preprocessing dataset

##### Find best random seed trought rasidual and varience

In [132]:
def split_word(text):
    return [word for word in text.split('+')]

In [None]:
print('Find best seed......')

min_seed = 0
min_residual = 100000000
for i in tqdm(range(1000)):
    x_data = ratings.morphs.apply(split_word).tolist()
    y_data = ratings.label.tolist()

    x_train_word, x_val_word, y_train, y_val = train_test_split(x_data, y_data,
                                                                test_size=0.2,
                                                                random_state=i,
                                                                stratify=y_data)
    # print(len(y_train), sum(y_train), len(y_val), sum(y_val))

    word_table = list(set([word for words in x_data for word in words]))
    word_table = {word:0 for word in word_table}

    train_counter = nlp.data.count_tokens(itertools.chain.from_iterable([c for c in x_train_word]))
    train_table = word_table.copy()
    train_table.update(train_counter)

    val_counter = nlp.data.count_tokens(itertools.chain.from_iterable([c for c in x_val_word]))
    val_table = word_table.copy()
    val_table.update(val_counter)

    train_cnt = np.array(list(train_table.values()))
    val_cnt = np.array(list(val_table.values())) * 4
  
    residual = np.abs(train_cnt-val_cnt).sum()
    if residual < min_residual:
        min_residual = residual
        min_seed = i
        print('seed:', i)
        print(min_residual)
        print(np.var(train_cnt), np.var(val_cnt))

Find best seed......


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

seed: 0
242429
475470.9748563272 470837.2030298952
seed: 1
238699
474388.50373664836 475138.8543619756
seed: 76
238327
474760.9501186432 473640.1993358826
seed: 97
238175
474743.9294321713 473710.0265871386
seed: 321
237533
474788.4582905639 473523.6408033082
seed: 349
236630
474491.4691436772 474693.0691265234
seed: 407
236418
475148.4852283042 472077.4220420267


### Make training data 

In [3]:
def split_word(text):
    return [word for word in text.split('+')]

In [4]:
print('Make train, val data......')

x_data = ratings.morphs.apply(split_word).tolist()
y_data = ratings.label.tolist()

x_train_word, x_val_word, y_train, y_val = train_test_split(x_data, y_data,
                                                            test_size=0.2,
                                                            random_state=349,
                                                            stratify=y_data)

y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

Make train, val data......


### Preprocessing dataset

#### Building vocabulary and connecting vocabulary with fasttext embedding
https://gluon-nlp.mxnet.io/examples/word_embedding/word_embedding.html

In [5]:
# training dataset 기반으로 vocab 생성
counter = nlp.data.count_tokens(itertools.chain.from_iterable([c for c in x_train_word]))
vocab = nlp.Vocab(counter,bos_token=None, eos_token=None, min_freq=15)

In [6]:
# Loading fasttext embedding 
fasttext_simple = nlp.embedding.create('fasttext', source='wiki.ko')

# vocab에 embedding 연결
vocab.set_embedding(fasttext_simple)

In [129]:
%%time
# final preprocessing

x_train = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], x_train_word))
x_train = pad_sequences(sequences = x_train, maxlen = 30, padding = 'post', value = 1.)

x_val = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], x_val_word))
x_val = pad_sequences(sequences = x_val, maxlen = 30, padding = 'post', value = 1.)

CPU times: user 3.97 s, sys: 211 ms, total: 4.18 s
Wall time: 4.22 s


### Define MorphConv class

In [130]:
class MorphConv:
    def __init__(self, X, y, n_of_classes, embedding):
        
        with tf.variable_scope('input_layer'):
            self.__X = X
            self.__y = y
            self.is_training = tf.placeholder(dtype = tf.bool)
        
        with tf.variable_scope('embedding_layer'):
            static_embed = tf.get_variable(name = 'static', initializer = embedding,
                                           trainable = False)
            non_static_embed = tf.get_variable(name = 'non_static', initializer = embedding,
                                               trainable = True)
            static_batch = tf.nn.embedding_lookup(params = static_embed, ids = self.__X)
            non_static_batch = tf.nn.embedding_lookup(params = non_static_embed, ids = self.__X)
            
        with tf.variable_scope('convoluion_layer'):
            with tf.variable_scope('tri_gram'):
                
                tri_gram = keras.layers.Conv1D(filters = 100, kernel_size = 3,
                                               activation = keras.activations.relu,
                                               kernel_initializer = 'he_uniform', padding = 'valid')
                static_3 = tri_gram(static_batch)
                non_static_3 = tri_gram(non_static_batch)
            
            with tf.variable_scope('tetra_gram'):
                tetra_gram = keras.layers.Conv1D(filters = 100, kernel_size = 4,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')
                
                static_4 = tetra_gram(static_batch)
                non_static_4 = tetra_gram(non_static_batch)
            
            with tf.variable_scope('penta_gram'):
                penta_gram = keras.layers.Conv1D(filters = 100, kernel_size = 5,
                                                 activation = keras.activations.relu,
                                                 kernel_initializer = 'he_uniform', padding = 'valid')
                
                static_5 = penta_gram(static_batch)
                non_static_5 = penta_gram(non_static_batch)

            fmap_3 = tf.reduce_max(static_3 + non_static_3, axis = 1)
            fmap_4 = tf.reduce_max(static_4 + non_static_4, axis = 1)
            fmap_5 = tf.reduce_max(static_5 + non_static_5, axis = 1)
            
        with tf.variable_scope('output_layer'):
            flattened = tf.concat([fmap_3, fmap_4, fmap_5], axis = -1)
            score = keras.layers.Dense(units = n_of_classes,
                                       kernel_regularizer = keras.regularizers.l2(.7))(flattened)
            
            self.__score = keras.layers.Dropout(rate = .5)(score, training = self.is_training)

        with tf.variable_scope('loss'):
            ce_loss = tf.losses.sparse_softmax_cross_entropy(labels = self.__y, logits = self.__score)
            reg_term = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.total_loss = ce_loss + reg_term
        
        with tf.variable_scope('prediction'):
            self.prediction = tf.argmax(self.__score, axis = -1)
        
    # predict instance method for small dataset
    def predict(self, sess, x_data, is_training = False):
        feed_prediction = {self.__X : x_data, self.is_training : is_training}
        return sess.run(self.prediction, feed_dict = feed_prediction)

### Create a model of MorphConv

In [157]:
# hyper-parameter
lr = .001
epochs = 30
batch_size = 100
train_step = int(x_train.shape[0] / batch_size)
val_step = int(x_val.shape[0] / batch_size)
print(train_step, val_step)

1200 300


In [158]:
# train
tr_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
tr_dataset = tr_dataset.shuffle(buffer_size = 1000000)
tr_dataset = tr_dataset.batch(batch_size = batch_size)
tr_iterator = tr_dataset.make_initializable_iterator()

In [159]:
# val
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size = batch_size)
val_iterator = val_dataset.make_initializable_iterator()

In [160]:
# anonymous iterator
handle = tf.placeholder(dtype = tf.string)
iterator = tf.data.Iterator.from_string_handle(string_handle = handle,
                                               output_types = tr_iterator.output_types,
                                               output_shapes = tr_iterator.output_shapes)
x_data, y_data = iterator.get_next()

In [161]:
morph_conv = MorphConv(X = x_data, y = y_data, n_of_classes = 2,
                       embedding = vocab.embedding.idx_to_vec.asnumpy())

In [162]:
# create training op
opt = tf.train.AdamOptimizer(learning_rate = lr)
training_op = opt.minimize(loss = morph_conv.total_loss)

In [163]:
saver = tf.train.Saver(max_to_keep=30)
save_dir = 'checkpoints/'
os.makedirs(save_dir, exist_ok=True)
os.system('rm -rf '+save_dir+'*ckpt*')

0

### Training

In [164]:
sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
# sess_config = tf.ConfigProto(device_count = {'GPU': 1})
sess = tf.Session(config = sess_config)
sess.run(tf.global_variables_initializer())
tr_handle, val_handle = sess.run(fetches = [tr_iterator.string_handle(), val_iterator.string_handle()])

In [165]:
%%time

train_loss_hist = []
val_loss_hist = []

for epoch in tqdm(range(epochs)):

    avg_train_loss = 0
    avg_val_loss = 0
    tr_step = 0
    val_step = 0

    # for mini-batch training
    sess.run(tr_iterator.initializer)    
    try:
        
        while True:
            _, train_loss = sess.run(fetches = [training_op, morph_conv.total_loss],
                                             feed_dict = {handle : tr_handle, morph_conv.is_training : True})
            avg_train_loss += train_loss
            tr_step += 1

    except tf.errors.OutOfRangeError:
        pass

    # for validation
    sess.run(val_iterator.initializer)
    try:
        while True:
            val_loss = sess.run(fetches = morph_conv.total_loss,
                                feed_dict = {handle : val_handle, morph_conv.is_training : False})
            avg_val_loss += val_loss
            val_step += 1
    
    except tf.errors.OutOfRangeError:
        pass

    avg_train_loss /= tr_step
    avg_val_loss /= val_step
    train_loss_hist.append(avg_train_loss)
    val_loss_hist.append(avg_val_loss)
    
    saver.save(sess=sess, 
               save_path=save_dir+str(epoch+1).zfill(3)+'_'+str(int(avg_train_loss*1000)).zfill(4)+'_'+str(int(avg_val_loss*1000)).zfill(4)+'.ckpt')
    
    print('epoch : {:3}, train_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, avg_train_loss, avg_val_loss))
    
#     threshold = 5
#     if epoch >= 5:
#         print([prev_val_loss < avg_val_loss for prev_val_loss 
#                 in val_loss_hist[epoch-threshold:epoch]])
#         if all([prev_val_loss < avg_val_loss for prev_val_loss 
#                 in val_loss_hist[epoch-threshold:epoch]]):
#             break


HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

epoch :   1, train_loss : 0.528, val_loss : 0.364
epoch :   2, train_loss : 0.425, val_loss : 0.337
epoch :   3, train_loss : 0.392, val_loss : 0.331
epoch :   4, train_loss : 0.366, val_loss : 0.341
epoch :   5, train_loss : 0.338, val_loss : 0.355
epoch :   6, train_loss : 0.308, val_loss : 0.347
epoch :   7, train_loss : 0.285, val_loss : 0.393
epoch :   8, train_loss : 0.262, val_loss : 0.409
epoch :   9, train_loss : 0.245, val_loss : 0.457
epoch :  10, train_loss : 0.234, val_loss : 0.560
epoch :  11, train_loss : 0.222, val_loss : 0.535
epoch :  12, train_loss : 0.218, val_loss : 0.562
epoch :  13, train_loss : 0.213, val_loss : 0.673
epoch :  14, train_loss : 0.212, val_loss : 0.615
epoch :  15, train_loss : 0.204, val_loss : 0.660
epoch :  16, train_loss : 0.207, val_loss : 0.685
epoch :  17, train_loss : 0.203, val_loss : 0.727
epoch :  18, train_loss : 0.199, val_loss : 0.785
epoch :  19, train_loss : 0.198, val_loss : 0.813
epoch :  20, train_loss : 0.199, val_loss : 0.808


In [156]:
tf.reset_default_graph()

### Test

In [177]:
saver.restore(sess, save_dir+'003_0391_0331.ckpt')

INFO:tensorflow:Restoring parameters from checkpoints/003_0391_0331.ckpt


INFO:tensorflow:Restoring parameters from checkpoints/003_0391_0331.ckpt


In [178]:
# Make test data
print('Make test data......')
x_test_word = ratings_test.morphs.apply(split_word).tolist()

x_test = list(map(lambda sen : [vocab.token_to_idx[token] for token in sen], x_test_word))
x_test = pad_sequences(sequences = x_test, maxlen = 30, padding = 'post', value = 1.)

y_test = ratings_test.label.tolist()
y_test = np.asarray(y_test)

Make test data......


In [179]:
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.batch(batch_size = batch_size)
test_iterator = test_dataset.make_initializable_iterator()

In [180]:
test_handle = sess.run(test_iterator.string_handle())

In [181]:
y_test_hat = np.array([])

sess.run(test_iterator.initializer)

try:
    while True:
        y_test_tmp = sess.run(morph_conv.prediction,
                            feed_dict = {handle : test_handle,
                                         morph_conv.is_training : False})
        y_test_hat= np.append(y_test_hat, y_test_tmp)

except tf.errors.OutOfRangeError:
    pass

In [182]:
print('test acc : {:.2%}'.format(np.mean(y_test_hat == np.array(y_test))))

test acc : 85.17%
