In [5]:
import numpy as np

data=open('data/input.txt').read()
chars=list(set(data))
data_size,vocab_size=len(data),len(chars)
print('Data has {} data-characters and {} unique characters'.format(data_size,vocab_size))

char_to_idx={ch:i for i,ch in enumerate(chars)}
ix_to_char={i:ch for i,ch in enumerate(chars)}

#hyperparameters
hidden_size=100 # size of hidden layer of neurons
seq_length=25 # number of steps to unroll the RNN
learning_rate=1e-1

# update the hidden state
# self.h=np.tanh(np.dot(self.W_hh,self.h)+np.dot(self.W_xh,x))
# compute the new output vector
# y=np.dot(self.W_hy,self.h)

Wxh=np.random.randn(hidden_size,vocab_size) #input to hidden
Whh=np.random.randn(hidden_size,hidden_size) #hidden to hidden
Why=np.random.randn(vocab_size,hidden_size) #hidden to output
bh=np.zeros((hidden_size,1)) #hidden bias
by=np.zeros((vocab_size,1)) #output bias

#compute loss, derivative
#cross-entropy error and sum of squared errors in backpropagation[since output is linear]

def lossFunc(inputs,targets,hprev):
    xs,hs,ys,ps={},{},{},{}
    
    hs[-1]=np.copy(hprev)
    loss=0
    
    # forward pass for each training data point
    for t in range(len(inputs)):
        xs[t]=np.zeros((vocab_size,1))
        xs[t][inputs[t]]=1
        
        #hidden state using previous hidden state hs[t-1]
        hs[t]=np.tanh(np.dot(Wxh,xs[t])+np.dot(Whh,hs[t-1])+bh)
        #unnormalized log probabilities for next chars
        ys[t]=np.dot(Why,hs[t])+by
        #probabilities for next chars[softmax]
        ps[t]=np.exp(ys[t])/np.sum(np.exp(ys[t]))
        #cross entropy loss        
        loss+=-np.log(ps[t][targets[t],0]) 
        
    # backward pass : compute gradients going backwards
    dWxh,dWhh,dWhy=np.zeros_like(Wxh),np.zeros_like(Whh),np.zeros_like(Why)
    dbh,dby=np.zeros_like(bh),np.zeros_like(by)
    
    dhnext=np.zeros_like(hs[0])
    
    for t in reversed(range(len(inputs))):
        #compute derivative of error w.r.t the output probs
        # dE/dy[j]=y[j]-t[j]
        dy=np.copy(ps[t])
        dy[targets[t]] -= 1 
        
        #output does not use activation function
        #therefore derivative of error with regard to the weight between hidden and output layer:
        #dE/dy[j]*dy[j]/dWhy[j,k]=dE/dy[j]*h[k]
        dWhy+=np.dot(dy,hs[t].T)
        dby += dy
        
        #backprop into h
        #derivative of H comes from output layer y and from next hidden layer H[t+1] 
        dh=np.dot(dWhy.T,dy)+dhnext
        
        #backprop through tanh nonlinearity
        #dtanhx/dx=1-tanh(x)*tanh(x)
        dhraw=(1-hs[t]*hs[t])*dh
        dbh+=dhraw
        
        #derivative of error with regard to the weight between input and hidden layer
        dWxh += np.dot(dhraw,xs[t].T)
        dWhh += np.dot(dhraw,hs[t-1].T)
        
        #derivative of error with regard to H(t+1)
        dhnext=np.dot(Whh.T,dhraw)
        
    for dparam in [dWxh,dWhh,dWhy,dbh,dby]:
        np.clip(dparam,5,-5,out=dparam)
        
    return loss,dWxh,dWhh,dWhy,dbh,dby,hs[len(inputs)-1]

def sample(h,seed_ix,n):

    # sample a seq of integers from the model
    # h -> memory state, seed_ix -> seed letter for 1st time step
    
    #one-hot
    x=np.zeros((vocab_size,1))
    x[seed_ix]=1
    
    ixes=[]
    for t in range(n):
        h=np.tanh(np.dot(Wxh,x)+np.dot(Whh,h)+bh)
        y=np.dot(Why,h)+by
        p=np.exp(y)/np.sum(np.exp(y))
        #sample according to probability distribution
        ix=np.random.choice(range(vocab_size),p=p.ravel())
        
        #update input x
        x=np.zeros((vocab_size,1))
        x[ix]=1
        
        ixes.append(ix)
        
    return ixes

#iterator
n=0
#data pointer
p=0

mWxh,mWhh,mWhy=np.zeros_like(Wxh),np.zeros_like(Whh),np.zeros_like(Why)
mbh,mby=np.zeros_like(bh),np.zeros_like(by) # meemory variables for adagrad
smooth_loss=-np.log(1.0/vocab_size)*seq_length

while True:
    # prepare inputs (left -> right in steps of seq_length)
    if p+seq_length+1>=len(data) or n==0:
        #reset RNN memory
        hprev=np.zeros((hidden_size,1)) # hidden state
        p=0
        
        inputs=[char_to_idx[ch] for ch in data[p:p+seq_length]]
        targets=[char_to_idx[ch] for ch in data[p+1:p+seq_length+1]]
        
        #sample from the model
        if n%100==0:
            sample_ix=sample(hprev,inputs[0],200)
            txt=''.join(ix_to_char[ix] for ix in sample_ix)
            print('---------sample----------')
            print('---------\n {} \n--------'.format(txt))
            
        loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFunc(inputs,targets,hprev)
        
        #autograd
        smooth_loss=smooth_loss*0.999 +loss*0.001
        if n%2==0:
            print('Iter - {}, Loss - {}'.format(n,smooth_loss))
            
        #memory is accumulated after each iteration
        for param, dparam, mem in zip([Wxh,Whh,Why,bh,by],[dWxh,dWhh,dWhy,dbh,dby],[mWxh,mWhh,mWhy,mbh,mby]):
            mem+=param*dparam
            # learning rate adjusted by mem
            param+=-learning_rate*dparam/np.sqrt(mem+1e-8)
            
        p+=seq_length
        n+=1

Data has 1115393 data-characters and 65 unique characters
---------sample----------
---------
 $vmX
bOJM;PGAQF&PyPPS$RF,Jp$FnztfDDHZlUyF FxyGRK,SSLGRGqeE'b'xTpgVLj3ybmTRfVETfEn'ImuTtQEGOR$xfrF'kqTMMa&TxdELDMLjxRISnAKRgap
NTZE?lPNrdxwLyQiXGL SrR:VxordvHEcf.:Xm?xlNXqHcdWJpsJVxo'cqep3mZdlP'I3xxFlz 
--------
Iter - 0, Loss - 104.8150415510979




KeyboardInterrupt: 