In [1]:
import numpy as np

In [2]:
text=open('shakespeare.txt','r').read()
chars=list(set(text))
text_size, vocab_size = len(text), len(chars)
print(f"\n Text Size : {text_size} | Vocab Size : {vocab_size}")


 Text Size : 99984 | Vocab Size : 62


In [3]:
print(chars)

['p', 'o', 'R', 'M', 'r', 'g', ',', 'u', 'b', 'P', 'X', 'B', 't', 'j', 'J', 'W', ';', 'v', 'q', 'G', '.', 'Z', 'F', 'C', 'f', 'w', 'L', 'N', 'm', 'Q', "'", 'x', 'T', 'h', 'y', ' ', 'U', 'z', 'H', 's', 'd', 'i', 'O', 'S', 'Y', 'a', 'k', 'E', 'K', 'V', '?', ':', 'e', 'D', '\n', 'I', '!', 'l', '-', 'c', 'A', 'n']


In [4]:
#Tokenize
char_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_char = {i:ch for i,ch in enumerate(chars)}

In [25]:
#Hyperparameters
hidden_size = 72
seq_length = 24
learning_rate = 0.05
beta1 = 0.999
beta2 = 0.9999

In [6]:
Wi2h = np.random.randn(hidden_size, vocab_size)*0.01 #input to hidden embedding
Wh2h = np.random.randn(hidden_size, hidden_size)*0.01 #hidden to hidden
Wh2o = np.random.randn(vocab_size, hidden_size)*0.01 #hidden to output
hb = np.zeros((hidden_size,1)) #hidden bias
ob = np.zeros((vocab_size,1)) #output bias

In [7]:
Wi2h.shape, Wh2h.shape, Wh2o.shape, hb.shape, ob.shape

((72, 62), (72, 72), (62, 72), (72, 1), (62, 1))

In [8]:
def gradient_descent_single(inputs, targets, hactive):
    xenc,henc,yprobs,probs={},{},{},{}
    henc[-1]=np.copy(hactive)
    loss=0
    #forward pass a single example
    for i in range(len(inputs)):
        xenc[i] = np.zeros((vocab_size,1))
        xenc[i][inputs[i]] = 1 #one hot encoding of input sequence
        henc[i] = np.tanh((Wi2h @ xenc[i]) + (Wh2h @ henc[i-1]) + hb) #hidden state update
        yprobs[i] = (Wh2o @ henc[i]) + ob
        #yprobs[i] -= np.max(yprobs[i])#unnormalized logprobs for next character
        probs[i] = np.exp(yprobs[i]) / np.sum(np.exp(yprobs[i])) #softmax normalization
        loss += -np.log(probs[i][targets[i],0]) #-log liklihood (cross entropy)
    
    #backward pass for current example
    dWi2h,dWh2h,dWh2o,dhb,dob = np.zeros_like(Wi2h),np.zeros_like(Wh2h),np.zeros_like(Wh2o),np.zeros_like(hb),np.zeros_like(ob)
    dhactive = np.zeros_like(henc[0])
    for i in reversed(range(len(inputs))):
        dyps = np.copy(probs[i]) #backward into logits
        dyps[targets[i]] -= 1
        dWh2o += dyps @ henc[i].T
        dob += dyps
        dhb += (1-henc[i]**2)*(Wh2o.T @ dyps + dhactive)
        dWi2h += ((1-henc[i]**2)*(Wh2o.T @ dyps + dhactive)) @ xenc[i].T
        dWh2h += ((1-henc[i]**2)*(Wh2o.T @ dyps + dhactive)) @ henc[i-1].T
        dhactive = Wh2h.T @ ((1-henc[i]**2)*(Wh2o.T @ dyps + dhactive))
    for grads in [dWi2h,dWh2h,dWh2o,dhb,dob]:
        np.clip(grads, -5,5, out=grads) #clip gradients from exploding or vanishing
    return loss,dWi2h,dWh2h,dWh2o,dhb,dob,henc[len(inputs)-1]

In [9]:
def sample(hidden, input_seed, n):
    ip = np.zeros((vocab_size,1))
    ip[input_seed] = 1
    ops = []
    for i in range(n):
        hidden = np.tanh(Wi2h @ ip + Wh2h @ hidden + hb) #forward pass
        y = ((Wh2o @ hidden) + ob) #outputs
        prob = np.exp(y)/np.sum(np.exp(y)) #softmax logits
        ix = np.random.choice(range(vocab_size),p=prob.ravel()) #choose random from range
        ip = np.zeros((vocab_size,1))
        ip[ix] = 1
        ops.append(ix)
    return ops  

In [10]:
n, offset = 0, 0
#Memory variables for Adam optimizer
#Momentum variables
vWi2h, vWh2h, vWh2o = np.zeros_like(Wi2h), np.zeros_like(Wh2h), np.zeros_like(Wh2o)
vhb, vob = np.zeros_like(hb), np.zeros_like(ob)
#RMSProp variables
sWi2h, sWh2h, sWh2o = np.zeros_like(Wi2h), np.zeros_like(Wh2h), np.zeros_like(Wh2o)
shb, sob = np.zeros_like(hb), np.zeros_like(ob)

smooth_loss = -np.log(1.0/vocab_size)*seq_length #smoothen loss by a constant

In [26]:
for epoch in range(1,200001):
    #grab chunks from inputs
    if offset+seq_length+1>=len(text) or not n:
        #print("came here")
        hactive = np.zeros((hidden_size,1)) #reset rnn preactivations from memory
        offset=0
    inputs = [char_to_int[ch] for ch in text[offset:offset+seq_length]]
    targets = [char_to_int[ch] for ch in text[offset+1:offset+seq_length+1]]
    #forward pass + get grads for a single sequence
    loss,dWi2h,dWh2h,dWh2o,dhb,dob,hactive = gradient_descent_single(inputs, targets, hactive)
    smooth_loss = smooth_loss * 0.999 + loss *0.001 #weighted smoothing
    if not n%100:
        print(f'iteration : {n} --> loss : {smooth_loss}')
    
    #Adam updation
    #Momentum variables
    vWi2h = (beta1 * vWi2h + (1-beta1) * dWi2h)
    vWh2h = (beta1 * vWh2h + (1-beta1) * dWh2h)
    vWh2o = (beta1 * vWh2o + (1-beta1) * dWh2o)
    vhb = (beta1 * vhb + (1-beta1) * dhb)
    vob = (beta1 * vob + (1-beta1) * dob)
    
    #RMSprop variables
    sWi2h = (beta2 * sWi2h + (1-beta2) * np.square(dWi2h))
    sWh2h = (beta2 * sWh2h + (1-beta2) * np.square(dWh2h))
    sWh2o = (beta2 * sWh2o + (1-beta2) * np.square(dWh2o))
    shb = (beta2 * shb + (1-beta2) * np.square(dhb))
    sob = (beta2 * sob + (1-beta2) * np.square(dob))
    
    #Parameter updation. Momentum bias correction -> 1-beta1**epoch | RMS bias correction -> 1-beta2**epoch
    for param,momentum,rms in zip([Wi2h, Wh2h, Wh2o, hb, ob],[vWi2h, vWh2h, vWh2o, vhb, vob],[sWi2h, sWh2h, sWh2o, shb, sob]):
        param -= learning_rate * ((momentum / (1-beta1**epoch))/ np.sqrt((rms / (1-beta2**epoch)) + 1e-8)) / len(inputs)
    offset += seq_length
    n += 1

iteration : 454600 --> loss : 40.475732330547125
iteration : 454700 --> loss : 40.91389950079509
iteration : 454800 --> loss : 41.31420043939537
iteration : 454900 --> loss : 41.77212723928842
iteration : 455000 --> loss : 42.098306323336246
iteration : 455100 --> loss : 42.52059847508115
iteration : 455200 --> loss : 42.53418620196879
iteration : 455300 --> loss : 42.652596270595616
iteration : 455400 --> loss : 42.767096275375565
iteration : 455500 --> loss : 42.88864495609789
iteration : 455600 --> loss : 42.957855576733515
iteration : 455700 --> loss : 42.89880276994455
iteration : 455800 --> loss : 42.990953849893366
iteration : 455900 --> loss : 43.07389689893254
iteration : 456000 --> loss : 43.28638225732019
iteration : 456100 --> loss : 43.202453307473746
iteration : 456200 --> loss : 43.1651045637224
iteration : 456300 --> loss : 43.12445002667969
iteration : 456400 --> loss : 43.18978693431721
iteration : 456500 --> loss : 43.044030123553114
iteration : 456600 --> loss : 42.

KeyboardInterrupt: 

In [27]:
#sample random batches from trained model
sample_ix = sample(hactive, inputs[0], 500)
txt = ''.join(int_to_char[ix] for ix in sample_ix)
print(txt)

dirs, Fore orstiom the know not me thou aran thee me like in a sispomian a stort. Come ever and an it know
Live no commore,
If peresemes, now it canfesing with my in that wife;
What my putices of lostly you, you shichosiging shouth it,
To in the hath joviss not with your safe to troze
The nobleting came, I'll make we son, were could like
Thou man spear.

BORTIV:
He!

Tis:
As thruch me when thrinct's day
Tas,
That me.

ARWELUS:
So the hath a last?

SIAGTAR:
Sir; geleing':
Which of suigs of me.

C
