In [150]:
import numpy as np

In [151]:
def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: Array-like of scores.
        axis: Axis along which to normalise. ``None`` (the default)
            normalises over the entire array, which for a 1-D input is the
            ordinary softmax of that vector. Pass e.g. ``axis=-1`` to treat
            every row of a 2-D array as an independent score vector.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis``
        (or over the whole array when ``axis`` is ``None``) sum to 1.
    """
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [152]:
class RNN:
    """Plain-NumPy recurrent network over sequences of integer indices.

    For an input index sequence ``x`` the model computes, at every step ``t``::

        s[t] = tanh(U[:, x[t]] + W . s[t-1])
        o[t] = softmax(V . s[t])

    Attributes:
        word_dim: Size of the index space (columns of ``U``, rows of ``V``).
        hidden_dim: Length of the hidden state vector.
        bptt_truncate: How many earlier steps (beyond its own step) each
            output's error is propagated back through in :meth:`bptt`.
        U: Input-to-hidden weights, shape ``(hidden_dim, word_dim)``.
        V: Hidden-to-output weights, shape ``(word_dim, hidden_dim)``.
        W: Hidden-to-hidden weights, shape ``(hidden_dim, hidden_dim)``.
    """

    def __init__(self, word_dim, hidden_dim=50, bptt_truncate=4):
        """Store the hyper-parameters and randomly initialise ``U``, ``V``, ``W``.

        Each matrix is drawn uniformly from ``[-1/sqrt(n), 1/sqrt(n)]`` where
        ``n`` is ``word_dim`` for ``U`` and ``hidden_dim`` for ``V`` and ``W``.
        The draws use the global ``np.random`` state, in the order U, V, W.
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        u_bound = np.sqrt(1. / word_dim)
        h_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-u_bound, u_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-h_bound, h_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-h_bound, h_bound, (hidden_dim, hidden_dim))

    def forward_propagation(self, x):
        """Run the network over one index sequence.

        Args:
            x: Sequence of integer indices, each valid as a column of ``U``.

        Returns:
            ``[o, s]`` where ``o`` has shape ``(len(x), word_dim)`` and holds
            the softmax output of every step, and ``s`` has shape
            ``(len(x) + 1, hidden_dim)`` and holds every hidden state.
        """
        T = len(x)
        # Keep every hidden state because backpropagation needs them later.
        # One extra row is allocated: the last row, s[-1], stays all-zero and
        # acts as the initial hidden state, so s[t-1] resolves to it at t == 0.
        s = np.zeros((T + 1, self.hidden_dim))

        # Keep the output distribution of every time step as well.
        o = np.zeros((T, self.word_dim))

        for t in range(T):
            # Picking column x[t] of U is the same as multiplying U by a
            # one-hot vector for index x[t].
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t - 1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]

    def predict(self, x):
        """Return, for every step of ``x``, the index with the highest output score."""
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)

    def calculate_total_loss(self, x, y):
        """Summed cross-entropy over a collection of sequences.

        Args:
            x: Collection of input index sequences.
            y: Collection of target index sequences; ``y[i]`` is paired with
                ``x[i]``.

        Returns:
            The sum, over every sequence and step, of ``-log`` of the
            probability the model assigned to the target index.
        """
        L = 0
        for i in range(len(y)):
            o, s = self.forward_propagation(x[i])
            # Probability assigned to the target index at each step.
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            L += -1 * np.sum(np.log(correct_word_predictions))
        return L

    def calculate_loss(self, x, y):
        """Cross-entropy averaged over the total number of target indices in ``y``."""
        # Builtin sum: calling np.sum on a generator is deprecated in NumPy.
        N = sum(len(y_i) for y_i in y)
        return self.calculate_total_loss(x, y) / N

    def bptt(self, x, y):
        """Gradients of the cross-entropy loss for one sequence via truncated BPTT.

        Args:
            x: Input index sequence.
            y: Target index sequence (``len(y)`` steps are processed).

        Returns:
            ``[dLdU, dLdV, dLdW]``, each with the shape of the matching weight
            matrix.
        """
        T = len(y)
        o, s = self.forward_propagation(x)
        # Gradient accumulators.
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        # Softmax + cross-entropy gradient w.r.t. the pre-softmax scores is
        # (o - one_hot(y)). `o` is a fresh local array, so it is edited in place.
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        # Walk the outputs from last to first.
        for t in range(T - 1, -1, -1):
            dLdV += np.outer(delta_o[t], s[t].T)
            # Error arriving at the hidden pre-activation of step t
            # (tanh'(z) = 1 - tanh(z)^2 = 1 - s[t]^2).
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Propagate back from step t down to max(0, t - bptt_truncate).
            first_step = max(0, t - self.bptt_truncate)
            for bptt_step in range(t, first_step - 1, -1):
                dLdW += np.outer(delta_t, s[bptt_step - 1])
                dLdU[:, x[bptt_step]] += delta_t
                # Move the error one step further back in time.
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step - 1] ** 2)
        return [dLdU, dLdV, dLdW]

In [154]:
# Random sequence of 10 indices in [0, 50) used as a dummy input below.
# NOTE: this draw happens before np.random.seed(10) is called in a later
# cell, so the sequence itself is not pinned by that seed.
y = np.random.randint(0, 50, size=10)

In [155]:
# Sanity check: y should be a flat vector of 10 indices.
y.shape

(10,)

In [156]:
# Fix the global NumPy seed so the weight initialisation below is repeatable,
# then build a model with 50 indices and the default hidden size.
np.random.seed(10)
model = RNN(word_dim=50)

In [157]:
# Run the untrained model over the random index sequence: o receives the
# per-step output distributions and s the hidden states.
o, s = model.forward_propagation(y)

In [159]:
# One row per time step, one column per index.
o.shape

(10, 50)

In [167]:
# Output distribution at step 1. With freshly initialised weights every
# entry sits close to the uniform value 1/50 = 0.02.
o[1]

array([ 0.02014646,  0.02000861,  0.0204221 ,  0.01958984,  0.02099475,
        0.01972961,  0.02098914,  0.01877979,  0.01899876,  0.02107463,
        0.02068957,  0.01817429,  0.02039984,  0.01772125,  0.01957578,
        0.02162638,  0.01854501,  0.01979072,  0.02083408,  0.01828095,
        0.02007724,  0.01933893,  0.01980086,  0.0211484 ,  0.01992291,
        0.0208342 ,  0.01995083,  0.02078206,  0.01888838,  0.02097798,
        0.01899982,  0.02046854,  0.01958781,  0.01952377,  0.0198024 ,
        0.02001128,  0.01899176,  0.02013325,  0.02047261,  0.02061277,
        0.0202056 ,  0.02150109,  0.02076449,  0.02053908,  0.02025151,
        0.02059732,  0.01990614,  0.01972207,  0.01888614,  0.02092921])

In [168]:
# Highest-scoring index at each step (argmax over each row of the outputs).
model.predict(y)

array([18, 15, 37, 14, 15, 39, 13, 12, 20,  6])

### Gradient Exploding 

In [180]:
# Toy problem for watching gradients grow through repeated multiplication.
H, T = 5, 50                 # hidden-state dimension, number of time steps
Whh = np.random.randn(H, H)  # recurrent (hidden-to-hidden) weight matrix

In [181]:
# Forward pass of an input-free RNN: h_t = ReLU(Whh . h_{t-1}).
# hs maps step -> hidden state (key -1 is the random initial state),
# ss maps step -> pre-activation.
hs = {-1: np.random.randn(H)}
ss = {}
for t in range(T):
    ss[t] = Whh.dot(hs[t - 1])
    hs[t] = np.maximum(ss[t], 0)  # ReLU

In [182]:
# Backward pass of the same RNN, walking from the last step to the first.
# dhs maps step -> gradient w.r.t. the hidden state,
# dss maps step -> gradient w.r.t. the pre-activation.
dss = {}
dhs = {T - 1: np.random.randn(H)}  # kick off the chain with a random gradient
for t in range(T - 1, -1, -1):
    # ReLU passes the gradient only where the unit was active.
    dss[t] = (hs[t] > 0) * dhs[t]
    # Push the gradient into the previous hidden state through Whh.
    dhs[t - 1] = Whh.T.dot(dss[t])

backward pass에서 Whh가 반복해서 곱해지기 때문에, 대략적으로 Whh의 가장 큰 고유값(크기 기준)이 1보다 크면 gradient가 폭발(exploding)하고 <br>
1보다 작으면 gradient가 소실(vanishing)된다.

In [195]:
# Gradient w.r.t. the hidden state at step 40, i.e. after only 9 backward
# steps from the starting point at step T-1 = 49.
dhs[40]

array([  5407.36342996,   2652.16715069,    -18.88355691,   4273.93004141,
        13301.88317498])