In [1]:
import numpy as np

In [2]:
text=open('shakespeare.txt','r').read()
chars=list(set(text))
text_size, vocab_size = len(text), len(chars)
print(f"\n Text Size : {text_size} | Vocab Size : {vocab_size}")


 Text Size : 99984 | Vocab Size : 62


In [3]:
print(chars)

['h', '!', 'S', 'D', 'P', 'O', '-', '\n', 'B', 'm', 'e', '.', 'i', 'o', 'r', 'C', 'y', 'T', 'I', 'j', 'z', 'l', 'd', 'H', 'n', ' ', ':', 'u', 'A', 'p', 'v', 'Z', 'U', "'", 'M', 'V', 'k', ',', 'x', 'g', 'E', 'F', 'W', 'c', 's', 'X', 'K', 'J', 't', 'w', 'L', 'R', 'a', 'b', 'G', '?', 'N', 'f', 'q', ';', 'Y', 'Q']


In [4]:
#Tokenize
char_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_char = {i:ch for i,ch in enumerate(chars)}

In [5]:
#Hyperparameters
hidden_size = 256
seq_length = 25
learning_rate = 0.05
beta1 = 0.9
beta2 = 0.999

In [6]:
Wi2h = np.random.randn(hidden_size, vocab_size)*0.01 #input to hidden embedding
Wh2h = np.random.randn(hidden_size, hidden_size)*0.01 #hidden to hidden
Wh2o = np.random.randn(vocab_size, hidden_size)*0.01 #hidden to output
hb = np.zeros((hidden_size,1)) #hidden bias
ob = np.zeros((vocab_size,1)) #output bias

In [7]:
Wi2h.shape, Wh2h.shape, Wh2o.shape, hb.shape, ob.shape

((256, 62), (256, 256), (62, 256), (256, 1), (62, 1))

In [8]:
def gradient_descent_single(inputs, targets, hactive):
    xenc,henc,yprobs,probs={},{},{},{}
    henc[-1]=np.copy(hactive)
    loss=0
    #forward pass a single example
    for i in range(len(inputs)):
        xenc[i] = np.zeros((vocab_size,1))
        xenc[i][inputs[i]] = 1 #one hot encoding of input sequence
        henc[i] = np.tanh((Wi2h @ xenc[i]) + (Wh2h @ henc[i-1]) + hb) #hidden state update
        yprobs[i] = (Wh2o @ henc[i]) + ob
        yprobs[i] -= np.max(yprobs[i])#unnormalized logprobs for next character
        probs[i] = np.exp(yprobs[i]) / np.sum(np.exp(yprobs[i])) #softmax normalization
        loss += -np.log(probs[i][targets[i],0]+ 1e-8) #-log liklihood (cross entropy)
    
    #backward pass for current example
    dWi2h,dWh2h,dWh2o,dhb,dob = np.zeros_like(Wi2h),np.zeros_like(Wh2h),np.zeros_like(Wh2o),np.zeros_like(hb),np.zeros_like(ob)
    dhactive = np.zeros_like(henc[0])
    for i in reversed(range(len(inputs))):
        dyps = np.copy(probs[i]) #backward into logits
        dyps[targets[i]] -= 1
        dWh2o += dyps @ henc[i].T
        dob += dyps
        dhb += (1-henc[i]**2)*(Wh2o.T @ dyps + dhactive)
        dWi2h += ((1-henc[i]**2)*(Wh2o.T @ dyps + dhactive)) @ xenc[i].T
        dWh2h += ((1-henc[i]**2)*(Wh2o.T @ dyps + dhactive)) @ henc[i-1].T
        dhactive = Wh2h.T @ ((1-henc[i]**2)*(Wh2o.T @ dyps + dhactive))
    for grads in [dWi2h,dWh2h,dWh2o,dhb,dob]:
        np.clip(grads, -5,10, out=grads) #clip gradients from exploding or vanishing
    return loss,dWi2h,dWh2h,dWh2o,dhb,dob,henc[len(inputs)-1]

In [9]:
def sample(hidden, input_seed, n):
    ip = np.zeros((vocab_size,1))
    ip[input_seed] = 1
    ops = []
    for i in range(n):
        hidden = np.tanh(Wi2h @ ip + Wh2h @ hidden + hb) #forward pass
        y = ((Wh2o @ hidden) + ob) #outputs
        prob = np.exp(y)/np.sum(np.exp(y)) #softmax logits
        ix = np.random.choice(range(vocab_size),p=prob.ravel()) #choose random from range
        ip = np.zeros((vocab_size,1))
        ip[ix] = 1
        ops.append(ix)
    return ops  

In [10]:
n, offset = 0, 0
#Memory variables for Adam optimizer
#Momentum variables
vWi2h, vWh2h, vWh2o = np.zeros_like(Wi2h), np.zeros_like(Wh2h), np.zeros_like(Wh2o)
vhb, vob = np.zeros_like(hb), np.zeros_like(ob)
#RMSProp variables
sWi2h, sWh2h, sWh2o = np.zeros_like(Wi2h), np.zeros_like(Wh2h), np.zeros_like(Wh2o)
shb, sob = np.zeros_like(hb), np.zeros_like(ob)

smooth_loss = -np.log(1.0/vocab_size)*seq_length #smoothen loss by a constant

In [11]:
for epoch in range(1,200000):
    #grab chunks from inputs
    if p+seq_length+1>=len(text) or not n:
        #print("came here")
        hactive = np.zeros((hidden_size,1)) #reset rnn preactivations from memory
        offset=0
    inputs = [char_to_int[ch] for ch in text[offset:offset+seq_length]]
    targets = [char_to_int[ch] for ch in text[offset+1:offset+seq_length+1]]
    #forward pass + get grads for a single sequence
    loss,dWi2h,dWh2h,dWh2o,dhb,dob,hactive = gradient_descent_single(inputs, targets, hactive)
    smooth_loss = smooth_loss * 0.999 + loss *0.001 #weighted smoothing regularization
    if not n%100:
        print(f'iteration : {n} --> loss : {smooth_loss}')
    
    #Adam updation
    #Momentum variables
    vWi2h = (beta1 * vWi2h + (1-beta1) * dWi2h)
    vWh2h = (beta1 * vWh2h + (1-beta1) * dWh2h)
    vWh2o = (beta1 * vWh2o + (1-beta1) * dWh2o)
    vhb = (beta1 * vhb + (1-beta1) * dhb)
    vob = (beta1 * vob + (1-beta1) * dob)
    
    #RMSprop variables
    sWi2h = (beta2 * sWi2h + (1-beta2) * np.square(dWi2h))
    sWh2h = (beta2 * sWh2h + (1-beta2) * np.square(dWh2h))
    sWh2o = (beta2 * sWh2o + (1-beta2) * np.square(dWh2o))
    shb = (beta2 * shb + (1-beta2) * np.square(dhb))
    sob = (beta2 * sob + (1-beta2) * np.square(dob))
    
    #Parameter updation. Momentum bias correction -> 1-beta1**epoch | RMS bias correction -> 1-beta2**epoch
    for param,momentum,rms in zip([Wi2h, Wh2h, Wh2o, hb, ob],[vWi2h, vWh2h, vWh2o, vhb, vob],[sWi2h, sWh2h, sWh2o, shb, sob]):
        param -= learning_rate * ((momentum / (1-beta1**epoch))/ np.sqrt((rms / (1-beta2**epoch)) + 1e-8)) / len(inputs)
    offset += seq_length
    n += 1

came here
iteration : 0 --> loss : 103.17836262267855
iteration : 100 --> loss : 101.43098364251577
iteration : 200 --> loss : 99.9404624445583
iteration : 300 --> loss : 98.55384841341086
iteration : 400 --> loss : 96.76894522591789
iteration : 500 --> loss : 94.87453957201677
iteration : 600 --> loss : 92.83431009146545
iteration : 700 --> loss : 92.92769838428045
iteration : 800 --> loss : 91.50639870121323
iteration : 900 --> loss : 89.89616999721868
iteration : 1000 --> loss : 88.10501034958621
iteration : 1100 --> loss : 86.52882591603559
iteration : 1200 --> loss : 85.04140751685932
iteration : 1300 --> loss : 83.33139442566855
iteration : 1400 --> loss : 81.80019748537376
iteration : 1500 --> loss : 80.41448832324282
iteration : 1600 --> loss : 79.16711571122421
iteration : 1700 --> loss : 77.72566080773898
iteration : 1800 --> loss : 76.73757775232463
iteration : 1900 --> loss : 75.73769042865649
iteration : 2000 --> loss : 74.86062009910349
iteration : 2100 --> loss : 74.0044

KeyboardInterrupt: 

In [14]:
#sample random batches from trained model
sample_ix = sample(hactive, inputs[0], 500)
txt = ''.join(int_to_char[ix] for ix in sample_ix)
print(txt)

 ho can the caul cas someon you thiNd priS
of of cortter.

CUBUNCHAVID:
Whe to tem vorosty, cost givenaereeetheithft has heat maste Paghin sacateint my fale, theesst mefose, them to them mour:
Cust there TAoo:--otoattet tott theirce
To I pabor youch Do brozetct bonser;
Interteateems,
But there sae with the tHuor his reats cachamme mact fore? sees sales Af caste that trire,
Hac, in nuter to tro are that theerciam
A Stricheare havitt?

CYDAVBATBER:
The so a wass:
Waveethus thy
Mearm, the shanter! 
