# Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/NumPy

*Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License*

In [1]:
import numpy as np

**data I/O**

In [2]:
# Load the whole training corpus. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('input.txt', 'r') as f:
    data = f.read()
# Sort the unique characters so the vocabulary order (and therefore every
# character <-> index mapping derived from it) is the same on every run;
# iterating a bare set of strings has no guaranteed order.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d charactors, %d unique.' % (data_size, vocab_size))

data has 69 charactors, 25 unique.


In [3]:
# Bare expression: as the last statement of the cell, the notebook displays the loaded text.
data

"Hello, I'm Sean Lee.\nI am saying helo to you guys.\nNice to meet you!\n"

In [10]:
# Two lookup tables over the vocabulary: position -> character, and its inverse.
ix_to_char = dict(enumerate(chars))
char_to_ix = {symbol: index for index, symbol in ix_to_char.items()}

**hyperparameters**

In [11]:
# Training configuration.
learning_rate = 0.1  # presumably the parameter-update step size (update code is not in this section)
seq_length = 25      # how many time steps the RNN is unrolled for
hidden_size = 100    # number of neurons in the hidden layer

**model parameters**

In [12]:
# Weights start as Gaussian noise scaled by 0.01; biases start at zero.
# The draw order (Wxh, Whh, Why) is kept so a seeded run produces the same values.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)    # input -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden -> hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)    # hidden -> output
bh = np.zeros([hidden_size, 1])   # hidden bias, column vector
by = np.zeros([vocab_size, 1])    # output bias, column vector

**loss function**

In [19]:
def lossFun(inputs, target, hprev):
    """
    Run one forward and backward pass of the RNN over a sequence.

    inputs, target are both lists of integers of the same length; inputs[t]
    is the index fed to the network at step t and target[t] is the index it
    should predict at that step.
    hprev is Hx1 array of initial hidden state (it is copied, not modified).

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    returns the loss (summed cross-entropy over all steps), the gradients
    dWxh, dWhh, dWhy, dbh, dby (each clipped element-wise to [-5, 5]), and
    the last hidden state. For an empty sequence the loss is 0, the
    gradients are all zero and the returned state is a copy of hprev.
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    steps = len(inputs)

    # forward pass
    for t in range(steps):
        xs[t] = np.zeros((vocab_size, 1))   # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)   # hidden state
        ys[t] = np.dot(Why, hs[t]) + by   # unnormalized log probabilities for next chars
        # Shifting by the max leaves the softmax unchanged but keeps exp() from overflowing.
        exp_scores = np.exp(ys[t] - np.max(ys[t]))
        ps[t] = exp_scores / np.sum(exp_scores)   # probabilities for next chars
        loss += -np.log(ps[t][target[t], 0])   # softmax (cross-entropy loss)

    # backward pass: compute gradients going backwards
    # Accumulators must exist before the loop since every step adds into them.
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(bh)   # no gradient flows in from beyond the last step
    for t in reversed(range(steps)):
        dy = np.copy(ps[t])
        dy[target[t]] -= 1   # gradient of the cross-entropy loss w.r.t. the scores ys[t]
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext   # from the output at t plus from step t+1
        dhraw = (1 - hs[t] * hs[t]) * dh   # backprop through tanh
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        # Whh multiplies the PREVIOUS hidden state in the forward pass, so its
        # gradient pairs dhraw with hs[t-1], not hs[t].
        dWhh += np.dot(dhraw, hs[t - 1].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)   # clip to mitigate exploding gradients

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[steps - 1]