# Deep Learning Series

<br>
<span style="color:gray">

1. Neural Net - part.1


2. Convolution Neural Network


3. Neural Net - part.2


</span>


<b>

4. Recursive Nerural Network


</b>

# Recurrent Neural Network

## What is Recurrent Neural Network ?

RNN은 RNN이다

## Computation in Recurrent Neural Network Layer

<img src="img/RNN_01.PNG">

In [2]:
import numpy as np

In [3]:
txt_data = "abcdefghijklmnopqrstuvwxyz "

chars = list(set(txt_data))

num_chars = len(chars) #the number of unique characters
txt_data_size = len(txt_data)

print("unique characters : ", num_chars)
print("txt_data_size : ", txt_data_size)

unique characters :  27
txt_data_size :  27


In [4]:
# one hot encode
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

print(char_to_int,"\n")
print(int_to_char,"\n")

integer_encoded = [char_to_int[i] for i in txt_data]
print(integer_encoded, "\n")
print("data length: ",len(integer_encoded))

{'u': 0, 'a': 1, 'w': 2, 'j': 3, 'p': 4, 'f': 5, 'h': 6, 'z': 7, 'n': 8, 't': 9, ' ': 10, 'q': 11, 'g': 12, 'r': 13, 's': 14, 'e': 15, 'v': 16, 'd': 17, 'x': 18, 'c': 19, 'm': 20, 'l': 21, 'y': 22, 'k': 23, 'b': 24, 'o': 25, 'i': 26} 

{0: 'u', 1: 'a', 2: 'w', 3: 'j', 4: 'p', 5: 'f', 6: 'h', 7: 'z', 8: 'n', 9: 't', 10: ' ', 11: 'q', 12: 'g', 13: 'r', 14: 's', 15: 'e', 16: 'v', 17: 'd', 18: 'x', 19: 'c', 20: 'm', 21: 'l', 22: 'y', 23: 'k', 24: 'b', 25: 'o', 26: 'i'} 

[1, 24, 19, 17, 15, 5, 12, 6, 26, 3, 23, 21, 20, 8, 25, 4, 11, 13, 14, 9, 0, 16, 2, 18, 22, 7, 10] 

data length:  27


In [14]:
iteration = 10000
sequence_length = 10
batch_size = round((txt_data_size / sequence_length) + 0.5)
hidden_size = 100
learning_rate = 1e-1

W_xh = np.random.randn(hidden_size, num_chars)*0.01
W_hh = np.random.randn(hidden_size, hidden_size)*0.01
W_hy = np.random.randn(num_chars, hidden_size)*0.01

b_h = np.zeros((hidden_size, 1)) # hidden bias
b_y = np.zeros((num_chars, 1)) # output bias

h_prev = np.zeros((hidden_size, 1)) # h_(t-1)

In [15]:
def forwardprop(inputs, targets, h_prev):
        
    # Since the RNN receives the sequence, the weights are not updated during one sequence.
    xs, hs, ys, ps = {}, {}, {}, {} # dictionary
    hs[-1] = np.copy(h_prev) # Copy previous hidden state vector to -1 key value.
    loss = 0 # loss initialization
    
    for t in range(len(inputs)): # t is a "time step" and is used as a key(dic).  
        
        xs[t] = np.zeros((num_chars,1)) 
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh( np.dot(W_xh, xs[t]) + np.dot(W_hh, hs[t-1]) + b_h ) # hidden state. 
        ys[t] = np.dot(W_hy, hs[t]) + b_y # unnormalized log probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars. 
        # Softmax. -> The sum of probabilities is 1 even without the exp() function, but all of the elements are positive through the exp() function.
 
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss). Efficient and simple code
        print(loss)
    return loss, ps, hs, xs

<img src="img/RNN_03.PNG">

In [20]:
def backprop(ps, inputs, hs, xs):

    dWxh, dWhh, dWhy = np.zeros_like(W_xh), np.zeros_like(W_hh), np.zeros_like(W_hy) # make all zero matrices.
    dbh, dby = np.zeros_like(b_h), np.zeros_like(b_y)
    dhnext = np.zeros_like(hs[0]) # (hidden_size,1) 

    # reversed
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t]) # shape (num_chars,1).  "dy" means "dloss/dy"
        dy[targets[t]] -= 1 # backprop into y. After taking the soft max in the input vector, subtract 1 from the value of the element corresponding to the correct label.
        dWhy += np.dot(dy, hs[t].T)
        dby += dy 
        dh = np.dot(W_hy.T, dy) + dhnext # backprop into h. 
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity #tanh'(x) = 1-tanh^2(x)
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(W_hh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]: 
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients.  
    
    return dWxh, dWhh, dWhy, dbh, dby

In [21]:
data_pointer = 0

# memory variables for Adagrad
mWxh, mWhh, mWhy = np.zeros_like(W_xh), np.zeros_like(W_hh), np.zeros_like(W_hy)
mbh, mby = np.zeros_like(b_h), np.zeros_like(b_y) 


for i in range(iteration):
    h_prev = np.zeros((hidden_size,1)) # reset RNN memory
    data_pointer = 0 # go from start of data
    
    for b in range(batch_size):
        
        inputs = [char_to_int[ch] for ch in txt_data[data_pointer:data_pointer+sequence_length]]
        targets = [char_to_int[ch] for ch in txt_data[data_pointer+1:data_pointer+sequence_length+1]] # t+1        
            
        if (data_pointer+sequence_length+1 >= len(txt_data) and b == batch_size-1): # processing of the last part of the input data. 
            targets.append(char_to_int[" "])   # When the data doesn't fit, add space(" ") to the back.


        # forward
        loss, ps, hs, xs = forwardprop(inputs, targets, h_prev)

        
        # backward
        dWxh, dWhh, dWhy, dbh, dby = backprop(ps, inputs, hs, xs) 
        
        
    # perform parameter update with Adagrad
        for param, dparam, mem in zip([W_xh, W_hh, W_hy, b_h, b_y], 
                                    [dWxh, dWhh, dWhy, dbh, dby], 
                                    [mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam # elementwise
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update      
    
        data_pointer += sequence_length # move data pointer
        
    if i % 1000 == 0:
        print ('iter %d, loss: %f' % (i, loss)) # print progress

iter 0, loss: 69.105512
iter 1000, loss: 0.002753
iter 2000, loss: 0.001419
iter 3000, loss: 0.000985
iter 4000, loss: 0.000759
iter 5000, loss: 0.000618
iter 6000, loss: 0.000522
iter 7000, loss: 0.000454
iter 8000, loss: 0.000402
iter 9000, loss: 0.000360


In [22]:
def predict(test_char, length):
    x = np.zeros((num_chars, 1)) 
    x[char_to_int[test_char]] = 1
    ixes = []
    h = np.zeros((hidden_size,1))

    for t in range(length):
        h = np.tanh(np.dot(W_xh, x) + np.dot(W_hh, h) + b_h) 
        y = np.dot(W_hy, h) + b_y
        p = np.exp(y) / np.sum(np.exp(y)) 
        ix = np.random.choice(range(num_chars), p=p.ravel()) # ravel -> rank0
        # "ix" is a list of indexes selected according to the soft max probability.
        x = np.zeros((num_chars, 1)) # init
        x[ix] = 1 
        ixes.append(ix) # list
    txt = ''.join(int_to_char[i] for i in ixes)
    print ('----\n %s \n----' % (txt, ))

In [25]:
predict('a',10) # (char, len of output)

----
 bcdefghijk 
----


In [36]:
predict('b',10)

----
 bdxqrs opq 
----


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets

In [None]:
BATCH_SIZE = 64
lr = 0.001
EPOCHS = 10
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

In [None]:
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.Field(sequential=False, batch_first=True)

data.Field의 인자들

sequential ::: 데이터셋이 순차적인 데이터셋인지 명시해준다. 이 때 LABEL 값은 단순히 클래스를 

batch_first ::: 파리미터로 신경망에 입력되는 텐서의 첫 번째 차원값이 batch_size가 되도록 정해준다.

lower ::: 텍스트 데이터 속 모든 영문 알파벳이 소문자가 되도록 처리해준다.

에러 내용 :: RuntimeError: multi-target not supported :: model을 train할 때 발생

원인 :: LABEL Field의 sequential 인자를 True로 지정함.
sequential을 True로 지정하게 되면 Tokenization이 진행됨.
> sequential – Whether the datatype represents sequential data. If False, no tokenization is applied. Default: True.

In [None]:
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
TEXT.build_vocab(trainset, min_freq=5)
LABEL.build_vocab(trainset)

min_freq ::: 학습데이터에서 최소 5번 이상 등장한 단어만 사전에 추가한다. 그 외에는 unk라는 토큰으로 대체한다.

In [None]:
trainset, valset = trainset.split(split_ratio=0.8)
train_iter, val_iter, test_iter = data.BucketIterator.splits((trainset, valset, testset),
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True, repeat=False)

data.BucketIterator ::: 

In [None]:
vocab_size = len(TEXT.vocab)
n_classes = 2

In [None]:
TEXT.vocab

In [None]:
print("[학습셋]: %d [검증셋]: %d [테스트셋]: %d [단어수]: %d [클래스]: %d" % (len(trainset), len(valset), len(testset), vocab_size, n_classes))

In [None]:
class BasicRNN(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(BasicRNN, self).__init__()
        print("Building Basic RNN model")
        
        self.n_layers = n_layers #아주 복잡한 모델이 아닌 이상 n_layers는 2이하로 설정한다.
        self.embed = nn.Embedding(n_vocab, embed_dim) #
        self.hidden_dim = hidden_dim #RNN을 통해 형성되는 은닉벡터의 차원값
        
        self.dropout = nn.Dropout(dropout_p)
        
        self.rnn = nn.RNN(embed_dim, self.hidden_dim,
                         num_layers=self.n_layers,
                         batch_first=True)
        
        self.out = nn.Linear(self.hidden_dim, n_classes)
    
    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.rnn(x, h_0)
        h_t = x[:,-1,:]
        self.dropout(h_t)
        logit = self.out(h_t)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_() #zero_()는 텐서 내 모든 값을 0으로 초기화한다.

Embedding<br>
임베딩(embedding)은 자연어를 숫자의 나열인 벡터로 바꾼 결과 혹은 그 일련의 과정 전체를 가리키는 용어입니다. 
단어나 문장 각각을 벡터로 변환해 벡터 공간에 ‘끼워 넣는다(embed)’는 취지에서 임베딩이라는 이름이 붙었습니다. 
컴퓨터가 자연어를 처리할 수 있게 하려면 자연어를 계산 가능한 형식인 임베딩으로 바꿔줘야 합니다.

Drop Out<br>
드롭아웃 기법은 뉴런의 연결을 임의로 삭제하는 것이다. 훈련할 때 임의의 뉴런을 골라 삭제하여 신호를 전달하지 않게 한다. 테스트할 때는 모든 뉴런을 사용한다. 신경망이 깊어짐에 따라 발생하는 over-fitting 문제를 해결하기 위함이다.

In [None]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1)
        optimizer.zero_grad()
        
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

In [None]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1) # 레이블값을 0과 1로 변환
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [None]:
model = BasicRNN(1, 256, vocab_size, 128, n_classes, 0.5).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)
    
    print("[이폭: %d] 검증 오차:%5.2f | 검증 정확도:%5.2f" % (e, val_loss, val_accuracy))
    
    # 검증 오차가 가장 적은 최적의 모델을 저장
    
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(),
                  './snapshot/txtclassification.pt')
        best_val_loss = val_loss

## BPTT (Back Propagation Through Time)

<img src="img/RNN_02.PNG">

RNN의 $h$는 '상태(state)'를 기억해 시각이 1 스텝 (1단위;1t) 진행될 때마다 $h = \text{tanh}(h_{t-1}W_h + x_tW_x + b)$의 형태로 갱신이 된다.

보통 RNN의 출력 $h_t$를 은닉 상태(hidden state) 혹은 은닉 상태 벡터(hidden state vector)라고 한다.

## Gradient Vanishing & Gradient Exploding problem

## LSTM (Long-Short Term Memory)

## GRU (Gate Recurrent Unit)