<h1>Char-Sequence RNN</h1>

Taken from https://gist.github.com/karpathy/d4dee566867f8291f086

In [1]:
import numpy as np

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the text has been read, instead of leaving an open
# handle around for the lifetime of the kernel.
with open('test.txt', 'r') as corpus_file:
    data = corpus_file.read()
chars = list(set(data))  # distinct characters of the corpus (set order is arbitrary)
data_size, vocab_size = len(data), len(chars)
print(data, chars, data_size, vocab_size, sep="\n")

this is a test sentence for the sequence generation RNN


['c', 'R', 'e', 'o', 'N', 's', 'a', 'q', 'u', 'g', '\n', 't', 'r', 'f', 'h', 'i', 'n', ' ']
57
18


In [3]:
# Two-way lookup tables between characters and integer ids; the id of a
# character is its position in `chars`.
ix_to_char = dict(enumerate(chars))
char_to_ix = {ch: ix for ix, ch in ix_to_char.items()}
print(char_to_ix, ix_to_char, sep="\n")

{'c': 0, 'R': 1, 'e': 2, 'o': 3, 'N': 4, 's': 5, 'a': 6, 'q': 7, 'u': 8, 'g': 9, '\n': 10, 't': 11, 'r': 12, 'f': 13, 'h': 14, 'i': 15, 'n': 16, ' ': 17}
{0: 'c', 1: 'R', 2: 'e', 3: 'o', 4: 'N', 5: 's', 6: 'a', 7: 'q', 8: 'u', 9: 'g', 10: '\n', 11: 't', 12: 'r', 13: 'f', 14: 'h', 15: 'i', 16: 'n', 17: ' '}


In [4]:
# hyperparameters
hidden_size = 100    # number of units in the recurrent hidden layer
seq_length = 25      # number of characters in one training window
learning_rate = 0.1  # step size used by the AdaGrad parameter update

In [5]:
# model parameters: small random weights and zero biases
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # input -> hidden, shape (hidden_size, vocab_size)
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden, shape (hidden_size, hidden_size)
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # hidden -> output, shape (vocab_size, hidden_size)
bh = np.zeros((hidden_size, 1))  # hidden bias, column vector
by = np.zeros((vocab_size, 1))   # output bias, column vector
# Show only the first two rows of each parameter as a sanity check.
print(Wxh[:2], Whh[:2], Why[:2], bh[:2], by[:2], sep="\n\n")

[[ 1.16216257e-04  5.20287798e-05 -6.34111563e-03  1.27913264e-03
  -8.42735376e-03  2.41981070e-02  1.71583785e-03 -3.50337489e-03
  -1.75061287e-03 -4.20895098e-03  3.74193687e-04 -6.03903281e-03
  -2.69658092e-03 -3.66905793e-04  1.07542928e-02  1.47778949e-02
   1.05777283e-02 -6.18650724e-03]
 [ 1.63251050e-02 -1.50234832e-02  8.28528091e-03  9.13632378e-03
   1.81584744e-02  6.33851867e-03  1.76542327e-03  1.42450387e-02
  -9.74520002e-03  2.90698593e-03  1.32273842e-02 -7.31530130e-03
  -1.30914532e-02  7.45783958e-03  7.05801828e-03  7.49372112e-03
  -7.35677305e-03  2.33519441e-02]]

[[ 0.00510032  0.01343263  0.01333247  0.00411951 -0.00256134 -0.01756038
   0.00923666 -0.00029012 -0.01030918  0.00785071 -0.00720719  0.00432183
   0.016622    0.00944487 -0.00092601  0.0031886  -0.00171729  0.00197577
   0.00083118  0.0033164  -0.00383565  0.00415565 -0.01085973 -0.00190067
  -0.00415697  0.00759467  0.00571631  0.0078816  -0.01478265  0.00418319
  -0.00029124  0.00508052 -0.0

In [6]:
# loss function
def loss_func(inputs, targets, hprev):
    """Run one forward and backward pass of the RNN over a window of characters.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``
    and ``vocab_size``; none of them is modified here.

    Args:
        inputs: sequence of integer character ids, one per time step.
        targets: sequence of integer character ids to be predicted at each
            time step; must be at least as long as ``inputs``.
        hprev: column vector with the hidden state preceding the first time
            step. It is copied, not modified.

    Returns:
        Tuple ``(loss, dWxh, dWhh, dWhy, dbh, dby, h_last)``:
        the summed cross-entropy loss over the window, the gradients of the
        loss w.r.t. each parameter (clipped element-wise to ``[-5, 5]``), and
        the hidden state after the last time step. For an empty window the
        loss and gradients are zero and ``h_last`` is a copy of ``hprev``.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1  # one-hot encoding of the input character
        # new hidden state from the current input and the previous state
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        y = np.dot(Why, hs[t]) + by  # output layer, pre-softmax logits
        # Shift by the max logit before exponentiating: mathematically a
        # no-op for softmax, but it keeps np.exp from overflowing to inf
        # (which would turn the probabilities and the loss into NaN).
        shifted = y - np.max(y)
        exp_shifted = np.exp(shifted)
        norm = np.sum(exp_shifted)
        ps[t] = exp_shifted / norm  # softmax probabilities
        # cross-entropy -log(p[target]) in log-sum-exp form, so a target
        # probability that underflows does not produce log(0)
        loss += np.log(norm) - shifted[targets[t], 0]

    # backward pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists, so this also works for an empty window
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        # gradient of softmax + cross-entropy w.r.t. the logits: p - onehot(target)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)  # output error * hidden activations
        dby += dy
        # backprop into h: from the output layer and from the next time step
        dh = np.dot(Why.T, dy) + dhnext
        dhraw = (1 - hs[t]*hs[t]) * dh  # backprop through the tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)

    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # mitigate exploding gradients

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [7]:
# sample a sequence of n integers
def sample(h, seed_ix, n): # h is memory state, seed_ix is seed letter for first time step
    """Sample a sequence of character ids from the current model.

    Reads the module-level parameters ``Wxh``, ``Whh``, ``Why``, ``bh``, ``by``
    and ``vocab_size``.

    Args:
        h: column vector with the hidden state to start from. The name is
            rebound inside the loop; the caller's array is not modified.
        seed_ix: integer id of the character fed in at the first time step.
        n: number of ids to draw.

    Returns:
        List with the ``n`` sampled integer ids (empty when ``n`` is 0).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1  # one-hot encoding of the seed character
    ixes = []

    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # Max-shifted softmax: same probabilities, but np.exp cannot overflow
        # and hand NaN probabilities to np.random.choice.
        exp_shifted = np.exp(y - np.max(y))
        p = exp_shifted / np.sum(exp_shifted)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # the sampled character becomes the next input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)

    return ixes

In [8]:
# Training state: `n` counts iterations, `p` is the read position in `data`.
n = 0
p = 0
# AdaGrad memory: running sum of squared gradients, one array per parameter
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
# Initial smoothed loss: uniform prediction over the vocabulary for one window
smooth_loss = -np.log(1.0/vocab_size) * seq_length

while n < 30001:
    # On the first iteration, or when the next window would run past the end
    # of the corpus, start over at the beginning with a cleared hidden state.
    if n == 0 or p + seq_length + 1 >= len(data):
        hprev = np.zeros((hidden_size, 1))
        p = 0

    # Targets are the inputs shifted one character to the right.
    inputs = [char_to_ix[ch] for ch in data[p : p + seq_length]]
    targets = [char_to_ix[ch] for ch in data[p + 1 : p + seq_length + 1]]

    # Every 500 iterations, print 200 characters sampled from the model.
    if n % 500 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join([ix_to_char[ix] for ix in sample_ix])
        print("----\n {0} \n----".format(txt))

    # Forward and backward pass over the window; carry the hidden state over.
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = loss_func(inputs, targets, hprev)
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss  # exponential moving average
    if n % 100 == 0:
        print("Iteration {0}:\tloss: {1}".format(n, smooth_loss))

    # AdaGrad update; the arrays are modified in place.
    for param, dparam, mem in (
        (Wxh, dWxh, mWxh),
        (Whh, dWhh, mWhh),
        (Why, dWhy, mWhy),
        (bh, dbh, mbh),
        (by, dby, mby),
    ):
        mem += np.square(dparam)
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    # Advance to the next window.
    p += seq_length
    n += 1
        

----
 otNuqcr
qcgfcg fqcanuahugesuhRegcn
sneNoRcgsRanog
iirs
u sonhoahReuuNnNrutnti e
rieNratuuRssneicheofiiqqhoe fqf
rfgoa
fsteotuRoqsfR
ieitaceccffocsofR
 httacNausrffocitgeioRRcr RfNRae agranrfRrtuhttcNa 
----
Iteration 0:	loss: 72.25929756886642
Iteration 100:	loss: 71.88596943285263
Iteration 200:	loss: 68.67430557621368
Iteration 300:	loss: 63.52956640390451
Iteration 400:	loss: 58.15387411103397
----
 his isentenct for the sequence generationce for the sequeece gengenerationce sentence foneq foncence for the sequence sequence generationse so fontats n the gequence generationsaesense for the sequenc 
----
Iteration 500:	loss: 52.9408630927503
Iteration 600:	loss: 48.084847637686906
Iteration 700:	loss: 43.632835977979376
Iteration 800:	loss: 39.5724752519512
Iteration 900:	loss: 35.87926841815128
----
 his is a test sentence for the sequence senuence generahiontence for the sequence generationce for the sequence ge gence generationeence for the sequence sequence generationce for 

Iteration 9700:	loss: 0.04306135390152063
Iteration 9800:	loss: 0.04203918110970889
Iteration 9900:	loss: 0.041079480493332696
----
 his is a test sentence for the sequence generationttnce for the sequence generationtence for the sequence generationce for the sequence generationce for the sequence generationtence for the sequence g 
----
Iteration 10000:	loss: 0.04017707533810542
Iteration 10100:	loss: 0.039327255560133445
Iteration 10200:	loss: 0.038525734408608764
Iteration 10300:	loss: 0.037768609228471764
Iteration 10400:	loss: 0.03705232590212199
----
 his is a test sentence for the sequence generationce for the sequence generationce for the sequence generationce for the sequence generationce for the sequence generationtence for the sequence generat 
----
Iteration 10500:	loss: 0.03637364662507275
Iteration 10600:	loss: 0.0357296207029449
Iteration 10700:	loss: 0.035117558086669186
Iteration 10800:	loss: 0.034535005389481385
Iteration 10900:	loss: 0.03397972415349303
----
 his is 

Iteration 19300:	loss: 0.016191526355308128
Iteration 19400:	loss: 0.01609307788712188
----
 his is a test sentence for the sequence generationce for the sequence generationtence for the sequence generationce for the sequence generationce for the sequence generationce for the sequence generat 
----
Iteration 19500:	loss: 0.015995869612116492
Iteration 19600:	loss: 0.01589988052704682
Iteration 19700:	loss: 0.015805088749351683
Iteration 19800:	loss: 0.015711471875704066
Iteration 19900:	loss: 0.015619007268784795
----
 his is a test sentence for the sequence generationce for the sequence generationsence for the sequence generationce for the sequence generationce for the sequence generationce for the sequence generat 
----
Iteration 20000:	loss: 0.01552767228169365
Iteration 20100:	loss: 0.015437444430009098
Iteration 20200:	loss: 0.015348301520957288
Iteration 20300:	loss: 0.015260221748069678
Iteration 20400:	loss: 0.015173183758459285
----
 his is a test sentence for the sequence gen

Iteration 28800:	loss: 0.010260171413313323
Iteration 28900:	loss: 0.01022065771798347
----
 his is a test sentence for the sequence generationce for the sequence generationce for the sequence generationce for the sequence generationsence for the sequence generationtence for the sequence gene 
----
Iteration 29000:	loss: 0.010181447626850946
Iteration 29100:	loss: 0.010142537791195365
Iteration 29200:	loss: 0.010103924905849427
Iteration 29300:	loss: 0.01006560570839398
Iteration 29400:	loss: 0.010027576978394306
----
 his is a test sentence for the sequence generationce for the sequence generationce for the sequence generationsence for the sequence generationce for the sequence generationce for the sequence generat 
----
Iteration 29500:	loss: 0.009989835536675071
Iteration 29600:	loss: 0.009952378244631339
Iteration 29700:	loss: 0.00991520200357318
Iteration 29800:	loss: 0.00987830375410136
Iteration 29900:	loss: 0.009841680475512073
----
 his is a test sentence for the sequence gene