# 삼성 DS AI Expert Program

## Sentence Parsing Assignment

담당 조교 : 문승준 (june1212@kaist.ac.kr)

실습 일시: 2019년 10월 8일 (화), 13:30 - 17:30

## 2. Sentence Parsing with LSTM seq2seq

이번 파트에서는 LSTM seq2seq를 이용해서 Sentence Parsing을 진행해보겠습니다.

In [None]:
!pip install tensorflow-gpu==1.14.0
# !pip install tensorflow-gpu==2.0.0

In [1]:
import tensorflow as tf
import numpy as np
import pprint
import logging
import time
import nltk
import os
import random
import warnings

from pathlib import Path

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

tf.enable_eager_execution()

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Dataset load 하기

다음 함수들을 정의해서 Dataset을 load해줍니다.

In [2]:
def data_generator(f_path, params):
  with open(f_path) as f:
    print('Reading', f_path)
    for line in f:
      text_raw, text_tokenized, label = line.split('\t')
      text_tokenized = text_tokenized.lower().split()
      label = label.replace('[', '[ ').lower().split()
      source = [params['tgt2idx'].get(w, len(params['tgt2idx'])) for w in text_tokenized]
      target = [params['tgt2idx'].get(w, len(params['tgt2idx'])) for w in label]
      target_in = [1] + target
      target_out = target + [2]
      yield (source, target_in, target_out) #iterative하게 접근할 수 있도록 yield 함수를 사용해줍니다.

In [3]:
def dataset(is_training, params):
  _shapes = ([None], [None], [None])
  _types = (tf.int32, tf.int32, tf.int32)
  _pads = (0, 0, 0)
  
  if is_training:
    ds = tf.data.Dataset.from_generator(
      lambda: data_generator(params['train_path'], params),
      output_shapes = _shapes,
      output_types = _types,)
    ds = ds.shuffle(params['buffer_size'])
    ds = ds.padded_batch(params['train_batch_size'], _shapes, _pads)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  else:
    ds = tf.data.Dataset.from_generator(
      lambda: data_generator(params['test_path'], params),
      output_shapes = _shapes,
      output_types = _types,)
    ds = ds.padded_batch(params['eval_batch_size'], _shapes, _pads)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  
  return ds

### Encoder 정의하기

Seq2seq 모델의 Encoder 부분을 먼저 정의해줍니다.

In [None]:
imbedding_npy = np.load('vocab/word.npy')

In [None]:
print(imbedding_npy.shape)
print(imbedding_npy)

In [4]:
class Encoder(tf.keras.Model):
  def __init__(self, params):
    super().__init__()
    
    ###########
    imbedding_npy = np.load('vocab/word.npy')
    
    self.embedding = tf.Variable(name="embedding_space",
                                 initial_value=imbedding_npy,
                                 dtype=tf.float32,
                                 trainable=False)
    
    # preprocess 과정에서 정의한 word.npy를 이용해 embedding될 수 있도록 해줍니다.
    
    ###########
    
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['rnn_units']//2,
                                                                      return_state=True,
                                                                      return_sequences=True))
  
  def call(self, inputs, training=False):
    if inputs.dtype != tf.int32:
      inputs = tf.cast(inputs, tf.int32)
    print('')
    
    x = tf.nn.embedding_lookup(self.embedding, inputs)
    
    x = self.dropout(x, training=training)
    
    encoder_outputs, state_fw_h, state_fw_c, state_bw_h, state_bw_c = self.encoder(x)
    
    state = (tf.concat((state_fw_h, state_bw_h), -1),
             tf.concat((state_fw_c, state_bw_c), -1),)
    state = tf.concat(state, -1)
    
    return (encoder_outputs, [state])

### Attention 정의하기

Decoder 부분에서 사용할 attention을 정의해줍니다. 여기서는 처음 attention decoder를 적용한 논문에서 소개된 BahdanauAttention을 적용합니다.

In [5]:
class BahdanauAttention(tf.keras.Model):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units, use_bias=False)
    self.W2 = tf.keras.layers.Dense(units, use_bias=False)
    self.b = self.add_weight(shape=[units], name='bias')
    self.V = tf.keras.layers.Dense(1, use_bias=False)
    
  def call(self, query, values, values_mask):
    query = tf.expand_dims(query, 1)

    score = self.V(tf.tanh(self.W1(values) + self.W2(query) + self.b))
    score = tf.squeeze(score, -1)

    # pre-softmax masking
    paddings = tf.fill(tf.shape(score), float('-inf'))
    score = tf.where(tf.equal(values_mask, 0), paddings, score)
    
    align = tf.nn.softmax(score, axis=1)
    align = tf.expand_dims(align, -1)
    
    context_vector = tf.matmul(values, align, transpose_a=True)
    context_vector = tf.squeeze(context_vector, -1)

    return context_vector

Effective Approaches to Attention-based Neural Machine Translation에 소개된 LuongAttention은 BahdanauAttention보다 조금 더 좋은 성능을 나타낸다고 합니다.

Link : https://arxiv.org/pdf/1508.04025.pdf

## Assignment 1 : LuongAttention 구현하기

In [None]:
class LuongAttention(tf.keras.Model):
  def __init__(self):
    #### Assignment 1-1 ####
    super(LuongAttention, self).__init__()
    

  def call(self, query, values, values_mask):
    
    #### Assignment 1-2 ####

### Decoder 정의하기

Decoder 부분을 정의해줍니다. 여기 decoder layer에서는 attention을 적용합니다.

In [6]:
class Decoder(tf.keras.Model):
  def __init__(self, params, tied_embedding):
    super().__init__()
    self.embedding = tied_embedding
    self.attention = BahdanauAttention(params['rnn_units'])
    self.dropout = tf.keras.layers.Dropout(params['dropout_rate'])
    self.cell = tf.keras.layers.StackedRNNCells([
      tf.keras.layers.LSTMCell(params['rnn_units']),
      tf.keras.layers.LSTMCell(params['rnn_units']),
    ])
    self.out_bias = self.add_weight(name='out_bias', shape=[len(params['tgt2idx'])+1])
  
  
  @tf.function
  def call(self, inputs, training=False):
    inputs, states, memory, memory_mask = inputs
    
    if inputs.dtype != tf.int32:
      inputs = tf.cast(inputs, tf.int32)
    
    h0, c0, h1, c1 = tf.split(states, 4, axis=-1)
    
    context_vector = self.attention(h1, memory, memory_mask)
    
    x = tf.nn.embedding_lookup(self.embedding, inputs)
    
    x = tf.concat([context_vector, x], axis=-1)
    
    x = self.dropout(x, training=training)
    
    output, states = self.cell(x, ((h0, c0), (h1, c1)))
    
    logits = tf.matmul(output, self.embedding, transpose_b=True)
    logits = tf.nn.bias_add(logits, self.out_bias)
    
    states = tf.concat([states[0][0], states[0][1], states[1][0], states[1][1]], axis=-1)
    
    return logits, states

### Training the model

아래 함수처럼 한 step의 train을 정의합니다.

In [7]:
def train_step(source, target_in, target_out, encoder, decoder, params):
  loss = 0
  logits = []
  encoder_outputs, decoder_state = encoder(source, training=True)
  decoder_state = tf.concat(decoder_state + decoder_state, -1)
  
  for t in range(target_in.shape[1]):
    _logits, decoder_state = decoder([target_in[:, t],
                                      decoder_state,
                                      encoder_outputs,
                                      tf.sign(source)],
                                      training=True)
    logits.append(_logits)
  
  logits = tf.stack(logits, 1)
  
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=target_out, logits=logits)
  weights = tf.cast(tf.sign(target_in), tf.float32)
  loss = tf.reduce_sum(loss * weights) / tf.reduce_sum(weights)
  
  return loss

미리 pre-train된 vocabulary set(dictionary)을 불러옵니다.

In [8]:
def get_vocab(f_path):
  word2idx = {}
  with open(f_path, encoding='utf-8') as f:
    for i, line in enumerate(f):
      line = line.rstrip()
      word2idx[line] = i
  return word2idx

Early-stop을 위해서는 accuarcy가 감소하는지 체크해야합니다. 이를 위해서 아래와 같은 함수를 정의해줍니다.

In [9]:
def is_descending(history: list) -> bool:
  history = history[-(params['num_patience']+1):]
  for i in range(1, len(history)):
    if history[i-1] <= history[i]:
      return False
  return True

### Hyperparameter setting

Train에 필요한 hyperparameter들을 아래처럼 dictionary로 정의해줍니다.

In [10]:
params = {
    'train_path': './train.tsv',
    'test_path': './test.tsv',
    'vocab_src_path': './vocab/source.txt',
    'vocab_tgt_path': './vocab/target.txt',
    'model_path': './model/',
    'dropout_rate': 0.2,
    'rnn_units': 300,
    'max_decode_len': 50,
    'lr': 4e-4,
    'clip_norm': .1,
    'buffer_size': 31279,
    'train_batch_size': 32,
    'eval_batch_size': 128,
    'num_patience': 5,
}

In [11]:
params['tgt2idx'] = get_vocab(params['vocab_tgt_path'])
params['idx2tgt'] = {idx: tgt for tgt, idx in params['tgt2idx'].items()}

### Model 만들기

위에서 설정한 hyperparameter를 이용해서 model을 만들어줍니다.

In [12]:
encoder = Encoder(params)
encoder.build((None, None))
pprint.pprint([(v.name, v.shape) for v in encoder.trainable_variables])

decoder = Decoder(params, encoder.embedding)
decoder.build([[None], [None, 4*params['rnn_units']], [None, None, params['rnn_units']], [None, None]])
pprint.pprint([(v.name, v.shape) for v in decoder.trainable_variables])


[('bidirectional/forward_lstm/kernel:0',
  TensorShape([Dimension(300), Dimension(600)])),
 ('bidirectional/forward_lstm/recurrent_kernel:0',
  TensorShape([Dimension(150), Dimension(600)])),
 ('bidirectional/forward_lstm/bias:0', TensorShape([Dimension(600)])),
 ('bidirectional/backward_lstm/kernel:0',
  TensorShape([Dimension(300), Dimension(600)])),
 ('bidirectional/backward_lstm/recurrent_kernel:0',
  TensorShape([Dimension(150), Dimension(600)])),
 ('bidirectional/backward_lstm/bias:0', TensorShape([Dimension(600)]))]
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
[('bahdanau_attention/dense/kernel:0',
  TensorShape([Dimension(300), Dimension(300)])),
 ('bahdanau_attention/dense_1/kernel:0',
  TensorShape([Dimension(300), Dimension(300)])),
 ('bahdanau_attention/dense_2/kernel:0',
  TensorShape([Dimension(300), Dimension(1)])),
 ('bias:0', TensorShape([Dimension(300)])),
 ('stacked_rnn_cells/kernel:0', TensorShape([Dimension(600), Di

Learing Rate와 optimizer를 정의해줍니다.

In [13]:
decay_lr = tf.keras.optimizers.schedules.ExponentialDecay(params['lr'], 1000, 0.96)
optim = tf.keras.optimizers.Adam(params['lr'])
global_step = 0
history_acc = []
best_acc = .0
t0 = time.time()
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

임의의 문장에 대해서 현재까지 학습된 모델이 어떻게 parsing을 진행하는지 확인할 수 있도록, 아래와 같은 함수를 정의해줍니다.

In [14]:
def minimal_test(encoder, decoder, params):
  test_str = ['what', 'times', 'are', 'the', 'nutcracker', 'show', 'playing', 'near', 'me']
  test_arr = tf.convert_to_tensor([[params['tgt2idx'][w] for w in test_str]])
  generated = tf.convert_to_tensor([[1]])
  ids = generated[0]
  
  encoder_outputs, decoder_state = encoder(test_arr, training=False)
  decoder_state = tf.concat(decoder_state + decoder_state, -1)
  
  for i in range(params['max_decode_len']):
    logits, decoder_state = decoder([ids,
                                     decoder_state,
                                     encoder_outputs,
                                     tf.sign(test_arr)],
                                     training=False)
    ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
    generated = tf.concat((generated, tf.expand_dims(ids, 1)), axis=1)
    if np.asscalar(ids.numpy()) == 2:
      break
  
  print('-'*12)
  print('minimal test')
  print('utterance:', ' '.join(test_str))
  parsed = ' '.join([params['idx2tgt'][idx] for idx in generated[0].numpy()[1:-1]])
  print('parsed:', parsed)
  print()
  try:
    nltk.tree.Tree.fromstring(parsed.replace('[ ', '(').replace(' ]', ')')).pretty_print()
  except:
    pass
  print('-'*12)

## Training을 시작해봅시다

실제 training을 시작해봅시다. Train set과 test set이 실제보다 크게 작아서 원하는 정확도는 얻기 힘들 것입니다.

In [None]:
warnings.filterwarnings('ignore')
variables = encoder.trainable_variables + decoder.trainable_variables
while True:
  # TRAINING
  for (source, target_in, target_out) in dataset(is_training=True, params=params):
    with tf.GradientTape() as tape:
      loss = train_step(source, target_in, target_out, encoder, decoder, params)
      
    optim.lr.assign(decay_lr(global_step))
    grads = tape.gradient(loss, variables)
    grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
    optim.apply_gradients(zip(grads, variables))
    
    if global_step % 10 == 0:
      logger.info("Step {} | Loss: {:.4f} | Spent: {:.1f} secs | LR: {:.6f}".format(
          global_step, loss.numpy().item(), time.time()-t0, optim.lr.numpy().item()))
      t0 = time.time()
    
    global_step += 1
  if global_step % 10 == 0:
    # EVALUATION
    is_training=False
    minimal_test(encoder, decoder, params)
    m = tf.keras.metrics.Mean()
    
    for i, (source, target_in, target_out) in enumerate(dataset(is_training=is_training, params=params)):
      generated = tf.ones((source.shape[0], 1), tf.int32)
      ids = tf.squeeze(generated, axis=1)
      encoder_outputs, decoder_state = encoder(source, training=is_training)
      decoder_state = tf.concat(decoder_state + decoder_state, -1)
      
      for j in range(target_out.shape[1]):
        logits, decoder_state = decoder([ids,
                                         decoder_state,
                                         encoder_outputs,
                                         tf.sign(source)],
                                         training=is_training)
        ids = tf.argmax(logits, axis=1, output_type=tf.int32)
        generated = tf.concat((generated, tf.expand_dims(ids, 1)), axis=1)

      seq_lens = tf.argmax(tf.cast(tf.equal(target_out, 2), tf.int32), axis=1)
      for pred, tgt, seq_len in zip(generated.numpy(), target_out.numpy(), seq_lens.numpy()):
        pred = pred[1:][:seq_len+1]
        tgt = tgt[:seq_len+1]
        matched = np.all(pred == tgt)
        m.update_state(int(matched))
    
    acc = m.result().numpy()
    logger.info("Evaluation: Testing Exact Match Accuracy: {:.3f}".format(acc))
    history_acc.append(acc)

    if acc > best_acc:
      best_acc = acc
    logger.info("Best Accuracy: {:.3f}".format(best_acc))

    if len(history_acc) > params['num_patience'] and is_descending(history_acc):
      logger.info("Testing Accuracy not improved over {} epochs, Early Stop".format(params['num_patience']))
      break
  else:
    continue

Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    
Reading ./train.tsv

INFO:tensorflow:Step 0 | Loss: 8.9669 | Spent: 10.7 secs | LR: 0.000400
Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv


------------
minimal test
utterance: what ti

INFO:tensorflow:Best Accuracy: 0.000
Reading ./train.tsv

INFO:tensorflow:Step 20 | Loss: 5.5908 | Spent: 4.9 secs | LR: 0.000400
Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv


------------
minimal test
utterance: what times are the nutcracker show playing near me
parsed: [ [ [ what what [ [ [ [ ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]

------------
Reading ./test.tsv

INFO:tensorflow:Evaluation: Testing Exact Match Accuracy: 0.000
INFO:tensorflow:Best Accuracy: 0.000
Reading ./train.tsv

INFO:tensorflow:Step 30 | Loss: 5.0502 | Spent: 4.9 secs | LR: 0.000400
Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv


------------
minimal test
utterance: what times are the nutcra

INFO:tensorflow:Evaluation: Testing Exact Match Accuracy: 0.000
INFO:tensorflow:Best Accuracy: 0.000
Reading ./train.tsv

INFO:tensorflow:Step 140 | Loss: 1.8372 | Spent: 4.4 secs | LR: 0.000398
Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv


------------
minimal test
utterance: what times are the nutcracker show playing near me
parsed: [ therer what what [ is at [ arethe [ arethe [ the house ] ] ] ] the what week ] [ starring released ] starring yesterday ] ] starring it it ] starring ] abeer yesterday ] ] abeer it ] ] abeer it yesterday ] ]

------------
Reading ./test.tsv

INFO:tensorflow:Evaluation: Testing Exact Match Accuracy: 0.000
INFO:tensorflow:Best Accuracy: 0.000
Reading ./train.tsv

INFO:tensorflow:Step 150 | Loss: 1.7495 | Spent: 4.6 secs | LR: 0.000398
Reading ./train.tsv

Reading ./train.tsv

Reading ./train.tsv

Reading ./train.t

## Assignment 2: Seq2seq with GRU

앞 실습에서는 seq2seq의 Base RNN unit을 LSTM으로 사용했습니다. 이를 GRU로 대체해서 Encoder, Decoder를 새롭게 설계해서 제출해주세요.

In [None]:
class Encoder(tf.keras.Model):
    
    #### Assignment 2-1 ####
    
class Decoder(tf.keras.Model):
    
    #### Assignment 2-2 ####
    