## Vanilla RNN for poetry generation 
#### RNN contains one hidden layer with 101 neurons

In [528]:
#code written with guidance of tutorial by Andrej Karpathy, https://gist.github.com/karpathy/d4dee566867f8291f086

## Part 1: define functions / initialize variables / set hyperparmaters 

#### 1. Import dataset. Derive text and concatenate into a text file. 

In [529]:
import pandas as pd
import numpy as np

poems_df = pd.read_csv('poems.csv',sep=",",encoding='unicode_escape')


#unfortunately we have to concatenate the whole dataset to get it as close as possible to 1MB (all poems together only .75MB)
#isolate renaissance poems and concatenate contents 
poems = poems_df['content']
all_poems = ""
for p in range(poems.shape[0]):
    all_poems = all_poems + str(poems[p])

characters = list(set(all_poems))

#size of data in total characters and size of vocabulary in unique characters
sz_char = len(characters)
sz_dat = len(all_poems)
sz_mb = sz_dat/1000000
print("data in MB", sz_mb)

data in MB 0.713912


#### 2. Define model hyperparameters 

In [530]:
#hyperparameters 
hidden_neurons = sz_char # the number of neurons in the hidden layer should be roughly the mean between the number of input neurons and the number of output neurons. Both are equal in this case, and 101 
no_steps_unroll = 25 # the number of steps to unroll the RNN for; should be equivalent to the number of input unity 
learning_rate = .1 #experiment with this? steps of .1 between 1 and .001? 

#model parameters 

#randomly initialize weight matrices (sample from gaussian dist with var of 0.01). Should have the dimenions of the two layers being connected 
#'h' is hidden layer
#'i' is input layer
#'o' is output layer
W_ih = np.random.standard_normal(size=(hidden_neurons,sz_char))*.01 #connecting input to hidden layer
W_hh =  np.random.standard_normal(size=(hidden_neurons,hidden_neurons))*.01 #connecting hidden to hidden layers
W_ho = np.random.standard_normal(size=(sz_char,hidden_neurons))*.01 #connecting hidden to output layer

#initialize bias terms (intercept) as 0s for hidden and output layers
b_h = np.zeros((hidden_neurons,1))
b_o = np.zeros((sz_char,1))


#### 3. Define loss function

In [531]:
import copy as copy
import math

#loss function should take as arguments: input, 'targets', as well as the initial states of the hidden layers 
#inputs -> a list of integers where each integer represents a character
#targets -> a list of integers where each integer represents a ____ 
def loss_iterate(inputs,targets,initial_hidden,clip_params):
    #set loss to zero 
    loss = 0
    
    #initialize dictionaries for inputs, hidden states, predictions, targets 
    ins = {} #each instance is a Kx1 vector where k = the length of the vocabulary (sz_char)
    hidds = {} #each instance is a Kx1 vector where k = the length of the vocabulary (sz_char)
    outs = {} #unnormalized log probabilities. 
    probs = {} #normalized probabilities.  
    
    #specify initial hidden state (stored under -1 key). This is so t-1 in first forward pass (index=0) returns this vector
    hidds[-1] = np.copy(initial_hidden)
    
    #forward pass to generate predictions about next character 
    for c in range(len(inputs)):
        #one hot encoding of vector representing input (where 1 is in position of character representation, 0 in all other positions)
        ins[c] = np.zeros((sz_char,1)) 
        ins[c][inputs[c]] = 1
        #new hidden state (weighted inputs + weighted[previous] hidden state + bias, squashed between 0 and 1)
        hidds[c] = np.tanh(np.dot(W_ih,ins[c]) + np.dot(W_hh,hidds[c-1]) + b_h)
        #print(hidds[c].size)
        #generate unnormalized log probabilities 
        outs[c] = np.dot(W_ho, hidds[c]) + b_o
        #print(outs[c].size)
        #normalize these probabilities so that they are non-negative and so that they sum to 1
        probs[c] = np.exp(outs[c]) / np.sum(np.exp(outs[c]))
        #compute cross-entropy loss (softmax). Note that the probability vector is expected to be XXXXXXXX. hence, we specify first column.
        loss = loss + -np.log(probs[c][targets[c],0])
    
    #compute gradients 
    #weight matrices 
    dW_ih = np.zeros((hidden_neurons,sz_char))
    dW_hh = np.zeros((hidden_neurons,hidden_neurons))
    dW_ho = np.zeros((sz_char,hidden_neurons))
    #biases
    d_b_out = np.zeros_like(b_o)
    d_b_hid = np.zeros_like(b_h)
    #next hidden state between iterations
    dhidd_next = np.zeros_like(hidds[0])
    
    #this process is carried out backwards (from end to beginning of forward pass) to compare target predictions of next character with target and use gradients to update params of previous cells 
    for c in reversed(range(len(inputs))):
        #update probs by minusing 1 (this is because of the derivative of negative log of the current probabilities, which would return us to the original unnormalized probability estimates)
        probs_up = np.copy(probs[c])
        probs_up[targets[c]] = probs_up[targets[c]] - 1 
        #determining updates for weights (hidden-output layer) based on probability gradients 
        dW_ho = dW_ho + np.dot(probs_up,hidds[c].transpose())
        #determining updates for bias (from hidden to output layer) 
        d_b_out = d_b_out + probs_up
        #determining updates to hidden layer (POST tanh function)
        hidds_up = np.dot(W_ho.transpose(),probs_up) + dhidd_next 
        #determining updates to hidden layer (PRE tanh function). note derivative of tanh function = 1 - x**2, where x is hidden state in previous iteration
        hidds_up_unnorm = (1-hidds[c]*hidds[c]) * hidds_up
        #determining updates for bias (from hidden to output layer) 
        d_b_hid = d_b_hid + hidds_up_unnorm
        #determining updates for weights (hidden-hidden layer)
        dW_hh = dW_hh + np.dot(hidds_up_unnorm,hidds[c-1].transpose())
        #determining updates for weights (hidden-output layer)
        dW_ih = dW_ih + np.dot(hidds_up_unnorm,ins[c].transpose())
        #next hidden state for next iteration 
        dhidd_next = np.dot(W_hh.transpose(),hidds_up_unnorm)
        
    #reduce size of gradient 
    for params in [dW_ho, dW_hh, dW_ih, d_b_hid, d_b_out]:
        for w in range(params.shape[0]):
            for v in range(params.shape[1]):
                if abs(params[w][v]) > clip_params:
                    if params[w][v] < 0:
                        params[w][v] = -clip_params
                    else:
                        params[w][v] = clip_params
        
    #last hidden state
    last_hidd = hidds[len(inputs)-1]
    
    #return parameter updates, loss, and last hidden state 
    return dW_ih, dW_hh, dW_ho, d_b_hid, d_b_out, loss, last_hidd

        

#### 4. Define function to check gradient 

In [532]:
from random import uniform

#inputs chars, target chars are lists of integers. hid_st is the initial hidden state 
def gradient_check(input_chars,target_chars,hid_st):
    #global -> allow these variables to be modified from within this fxn
    global W_ih, W_hh, W_ho, b_h, b_o
    no_checks = 10
    
    noise = 1e-5
    
    dW_ih, dW_hh, dW_ho, d_b_hid, d_b_out, loss, new_hid_st=loss_iterate(input_chars,target_chars,hid_st,5)

    
    for param,param_up,prints in zip([W_ih, W_hh, W_ho, b_h, b_o], 
                                 [dW_ih, dW_hh, dW_ho, d_b_hid, d_b_out], 
                                 ['W_ih', 'W_hh', 'W_ho', 'b_h', 'b_o']):
        
        update_shape = param_up.shape
        param_shape = param.shape
        
        #check that parameters and updates are the same
        assert (update_shape == param_shape), "ERROR: dimensions of parameters and updates do not match"
        print(prints)
        
        for i in range(no_checks):
            #returns a random floating point number in range 
            rand_int = int(uniform(0,param.size))
            #derives sample from flattened parameter array at random integer 
            rand_samp = param.flat[rand_int]
            
            #replace this value with itself + noise
            param.flat[rand_int] = rand_samp + noise
            
            #calculate new loss after modifying this parameter 
            _,_,_,_,_,loss0,_ = loss_iterate(input_chars,target_chars,hid_st,5)
            
            #replace this value with itself - noise 
            param.flat[rand_int] = rand_samp - noise
            
            #recompute loss 
            _,_,_,_,_,loss1,_ = loss_iterate(input_chars,target_chars,hid_st,5)
            
            #return this value to normal
            param.flat[rand_int] = rand_samp
            
            #print gradient (numerical (actual) and analytical (proposed update))
            gradient_actu = (loss0-loss1) / (2*noise)
            gradient_prop = param_up.flat[rand_int]
            
            #error of analytical gradient (add a small number to denominator to prevent division by zero)
            relative_error_grad = abs(gradient_prop - gradient_actu) / (abs(gradient_actu + gradient_prop)+ np.spacing(1))
            print("actual: %f, analytical: %f, error: %f" %(gradient_actu,gradient_prop,relative_error_grad))
    

## Part 2: Running AMSGrad 

In [533]:
####HYPERPARAMATERS FOR AMSGRAD
learn = .002 #learning rate 
beta1 = .9 #parameter for first moment
beta2 = .999 #parameter for second moment 
epsilon=1e-8 #to prevent dividing by zero

####Array to hold loss every 75 iterations 
Loss_AMSGrad = [0. for i in range(1500)]

In [534]:
#loss at iteration 0 should be determined by size of vocabulary and the sequence size (number of times to unroll RNN)
smooth_loss = -np.log(1/sz_char) * no_steps_unroll 

#AMSGrad again uses the mean (first moment) and varaince (second moment) of the gradients from previous time step; each parameter gets its own mean and var
Grad_means = [0 for i in range(5)]
Grad_vars = [0 for i in range(5)] 
Grad_vars_corr = [0 for i in range(5)] 
#Grad_vars_corr = np.zeros_like(W_ih)


it = 1 #iteration number 
st = 0 #point at which to derive sample from data (data pointer)
count_ind = 0 #counter for storing loss every 75 iterations

#maximum number of iterations 
iter_ceiling = 200000000


converged = False
#while not converged:

while it <= 112500: 
    
    #reset memory of RNN if we are at iteration 0 OR if we have reached end of data strand 
    if it == 1:
        print("ITERATION 1")
        #initial hidden state to use after restarting 
        hidden_st = np.zeros((hidden_neurons,1))
        #start back @ beginning of data
        st = 0
    elif no_steps_unroll + st + 1 >= sz_dat:
        #initial hidden state to use after restarting 
        hidden_st = np.zeros((hidden_neurons,1))
        #start back @ beginning of data 
        st =0                         
    
    #slicing data to match sequence size/number of unrolls 
    input_slice_char = all_poems[st:(st + no_steps_unroll)]
    target_slice_char = all_poems[(st+1):(st+1+no_steps_unroll)]
    
    input_slice_int = np.zeros(len(input_slice_char),dtype=int)
    target_slice_int = np.zeros(len(target_slice_char),dtype=int)

    
    #define these slices in terms of where characters appear in dictionary 
    for c in range(len(input_slice_char)):
        input_slice_int[c] = characters.index(input_slice_char[c])
        target_slice_int[c] = characters.index(target_slice_char[c])
    
    #check gradient estimates (eventually remove break)
    #gradient_check(input_slice_int, target_slice_int, hidden_st)
    #break
    
    #print out samples of model progress every 75 iterations 
    if it % 75 == 0:
        print("POETRY SAMPLE")
        char_vec = np.zeros((sz_char,1))
        #represents seed (start with current character)
        char_vec[input_slice_int[0]] = 1
        char_indices = []
        h = np.copy(hidden_st)
        for ch in range(300):
            #forward pass through hidden layer
            h = np.tanh(np.dot(W_ih,char_vec)+np.dot(W_hh,h)+ b_h) 
            out_unnorm = np.dot(W_ho,h) + b_o #unnormalized log probabilities 
            out_norm = np.exp(out_unnorm) / np.sum(np.exp(out_unnorm)) #normalized log probabilities 
            #selects one character randomly, however taking into account probabilities of each character 
            char_index = np.random.choice(range(sz_char),p=out_norm.ravel())
            #one-hot vector representing selected character (for next iteration through for loop) 
            char_vec = np.zeros((sz_char,1))
            char_vec[char_index] = 1
            char_indices.append(char_index)
        
        #convert indices back to characters 
        sample = []
        for c in range(len(char_indices)):
            sample.append(characters[char_indices[c]])
        #append characters into continuous string 
        sample_txt = ''.join(sample)
        print('----\n %s \n----' % (sample_txt,))
        
        
    
    #calculate gradients from no_steps_unroll characters 
    dW_ih, dW_hh, dW_ho, d_b_hid, d_b_out, loss, hidden_st=loss_iterate(input_slice_int,target_slice_int,hidden_st,5)
    
    #smoothing loss so loss is averaged and not plotted/printed as erratic across iterations
    smooth_loss = (smooth_loss * .999) + (loss * .001)
    
    #print loss every 75 iterations 
    if it % 75 == 0:
        print("iteration: %d, loss: %f" %(it,smooth_loss))
        Loss_AMSGrad[count_ind] = smooth_loss
        count_ind += 1
    
    
    
    #copying weights before update to check for convergence 
    W_ih_prev = copy.deepcopy(W_ih)
    W_hh_prev = copy.deepcopy(W_hh)
    W_ho_prev = copy.deepcopy(W_ho)
    
    #updating weights 
    for param, param_update, index in zip([W_ih, W_hh, W_ho, b_h, b_o],
                                                    [dW_ih,dW_hh,dW_ho,d_b_hid,d_b_out],
                                                   [0,1,2,3,4]):
        

        #first-order exponential decay (momentum beta 1)
        Grad_means[index] = (beta1**(it+1))*(copy.copy(Grad_means[index])) + (1-(beta1**(it+1)))*param_update

        #second-order exponential decta (root mean sq beta 2)
        Grad_vars[index] = beta2*(copy.copy(Grad_vars[index])) + (1-beta2)*(param_update ** 2)
        
        #correcting bias 
        Grad_vars_corr[index] = np.maximum(copy.copy(Grad_vars_corr[index]),copy.copy(Grad_vars[index]))
        
        #updating parameters 
        param -= step*(copy.copy(Grad_means[index])/(np.sqrt(Grad_vars_corr[index])+epsilon))


    if np.array_equal(W_ih_prev,W_ih) and np.array_equal(W_hh_prev,W_hh) and np.array_equal(W_ho_prev,W_ho) and it>0:
        converged = True 
        #break
    else: 
        #increment iteration count by 1 and datapointer by size of unroll.
        it = it + 1
        st = st + no_steps_unroll
            

ITERATION 1
POETRY SAMPLE
----
 eeeeeleeteteeeeeleleeeeeteeeeleeiteteieee
tneeteeeeleeieeeeeeeetetenleeetlleetlti
nleeeeoleeeneeleeeeoeeeeeete
eeeeeee
eileeeeeeeeeeeeeeene
eeeee
teneeeoeeeeeie loeeeeeeeeeeeeleeeelleleeetleeet elieteelteeeienelel illelteelenenlteeneeelelleetieeeeleeoeeneelieeeeetlieieeeleoeelreleeneeeeeieeeeteeneel 
----
iteration: 75, loss: 135.464826
POETRY SAMPLE
----
 l s ii   lili   i l            lo le i li l   ie lllniiiini re  lli l ileco iiiii l  io i   ill  iil lioi  i   llmsnhl ilo   r lll   lil  i  ii lil hl i      lo    li  sle iill il  h tol o   iis li      li ir le l itilr ill lso rli ili iii lln     lii   olnriiitl  ali  o iiini fl    oio l  il iio ll 
----
iteration: 150, loss: 152.159536
POETRY SAMPLE
----
 sttrraurrrnrinrn rgrlotrtrstsstlglgtrlrtregtlerttllr nrtldteninrttarrtllrrttrnetegrrrtrnttirrlrnrrrrrrhritrrentritnrreunsrrrenrrrteiltvreartrrsgttnlirrstrrritggrrrttsrtartrrrittrtrrrtnrlrrrrntrrtrgtsrrrrrnsgtareaur rtrrhtrrtrgesrrtslrttrtlettrtitnrr

POETRY SAMPLE
----
hhlvlehhreollelehhleoepl ph ehrehg,rtteeoeolrrrreee nlrrrgrhleoprerrslrrhvmhtleoepshoiep i,ohgs  l l roooor,re
----
iteration: 1950, loss: 168.981894
POETRY SAMPLE
----
 
agyordotgragigoooehdg
ssloniratodngeggrlsdh nowg ghagoge rglsg onouaagh o  tatgnsgo gedaov
adtd;n uenlesahenygdolldoaeotn
nsgsgg oasnognnglgoaidr  ltuaoe g od tiloaaetnuggdgtehllot lgltoettsueubdwdtoogl
----
iteration: 2025, loss: 166.411392
POETRY SAMPLE
----
h.a heoc vrhh cecnWvct
 ee y WvtycvvW c vcccWc W eWdWeh  ft Achcutcy, eycW
et v edh taenldec Wuo
heepccvhleeh vces ldc   ,yWeWiW  eaWev,d A  s,hvcoc 
t
 cb,c ceWh  ce ne  hd cccdcbv cc h c
----
iteration: 2100, loss: 163.104501
POETRY SAMPLE
----
ymln iatiupitotiwdaw,pto espletaf
im lp sl, si aaiafnii cti,si  t rseoa i dt  ti,nptpi loofi,hiiioilrfteidniidd
 siil lip tua sf
  o i,upt tpn
ith ht io f,eyk tipoae
----
iteration: 2175, loss: 160.468997
POETRY SAMPLE
----
 u in uhueahTt brh dstnyaedtnhi tiTs
eiaiashuet,ntmeae tty vo
sashe smthvvhsa


POETRY SAMPLE
----
  G   i
 sasssea  h n s emeaiesnheev es et
ait ma  osnene a, amm m v e snstvAdars ehpatsss,e  se  s da  m n n  sneeeet tre  ss  dis a eve  ee
----
iteration: 4200, loss: 126.321582
POETRY SAMPLE
----
 crliIhbeis t heh,hii
  nhehtdnngiscoipoT oh
ooh a,  rns rhgttitis, eatytiis ietotiiihi htihdtphp,hh h  haetw nmihs gigdtteeetiihektiehn
tentahhhgiegeghdoghhiuytih enh it stogsethii ihnnegntmtngh ,bthhehteihtk
----
iteration: 4275, loss: 123.862664
POETRY SAMPLE
----
 luus t  o hy tt      
     t  t a dt   u   hg  ht  rh   r   nh n  k   ns
 
e
      ul    me,o  b                  s    u  u 
    l        tesTe   
 e  Ai      t     d h h    
 t  h  ub nt t  M   t h h   
h t     iM t  t o  n   c   m  t    o   w    o   tuu 
----
iteration: 4350, loss: 122.779076
POETRY SAMPLE
----
hiteheoe hliu?ttd?eredhOh?
erb, rOei?m?hrtltlmi? ehOr?rr?sehise u heerlwgOd??rlu
 e?i?ev
 dl?i  urh ho hh
eh ??ewgh  lerhesehtaO?  ?
imeoo  o
esuue?utmbOe??hh dmtlmHi?u?lsdlirueedokOehrrhuederd?t?h

POETRY SAMPLE
----
 ky yynyyyiv elbty ayy nnay yt  thony,woyl i h a
,ooye,yyHyl kye lak nloy k,yly y l ii l iiky,s  ,iby h ykykiyylsy l ei ywill   l  al i  ki l,el Tt i iyaihh li n  hyy y  eky  yye i oiait
i n    klikla ivhkiy l y taeeoi i yitiatk akykkyyy
i y  lidolbileelhii ni  okti ykilynf lyk y   
----
iteration: 6450, loss: 110.489135
POETRY SAMPLE
----
 ou,ulosoo, b
f unnwsgnumegigivuginioLvun, ioov
hi,is,oi, f ,ueidru.uotvoH
n,.uoul.nigroooou.,uho.,gg
uiD
spnu,nonO oTilo.uf  iufoolni, noln nn
 ni
 y  fIi
f b
au,tu oiuuioaouunsoufouupvi,ieoiionutiu ;uLooiedouwogwnn 
n,i
kuDun,gumou,ou 
----
iteration: 6525, loss: 109.784659
POETRY SAMPLE
----
 eeip tir e,iegey  h lin t eig,nn enn,raTfe g llggu,oi c 
cidtp sinioones  y fe nieg myno  , ,nhyyt
a    gnee,eg,rgiari ca w g 
ri eap vw  y ns  so,ih n ,e or  lf  ofstie en,ppsrs ,b gson: orrne   rnoo,a   giyainen eggaea i,io
tis,sti n eg,unekrpv, ,e es 
aure ig rirokan  a h,  s,,dii giihncg nre 
----
iteration: 6600, loss: 108.571075
POETR

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  loss = loss + -np.log(probs[c][targets[c],0])
  probs[c] = np.exp(outs[c]) / np.sum(np.exp(outs[c]))
  probs[c] = np.exp(outs[c]) / np.sum(np.exp(outs[c]))


POETRY SAMPLE


ValueError: probabilities contain NaN

## Part 3: Running ADAM (Adaptive Movement Estimation)

"Adam is a replacement optimization algorithm for SGD for training deep learning models. Adam combines the best properties of the AdaGrad and RMSProp algorithms to provide an optimization algorithm that can handle sparse gradients on noisy problems." (https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/#:~:text=Adam%20is%20a%20replacement%20optimization,sparse%20gradients%20on%20noisy%20problems.)

In [None]:
####PARAMETERS FOR ADAM 

step=0.01 #step size  
beta1=0.9 #first order exponential decay  
beta2=0.999 #second order exponential decay 
epsilon=1e-8 #to prevent dividing by zero

####Array to hold loss every 75 iterations 
Loss_ADAM = [0. for i in range(1500)]

In [None]:
#re-initialize weight matrices and bias terms
W_ih = np.random.standard_normal(size=(hidden_neurons,sz_char))*.01 #connecting input to hidden layer
W_hh =  np.random.standard_normal(size=(hidden_neurons,hidden_neurons))*.01 #connecting hidden to hidden layers
W_ho = np.random.standard_normal(size=(sz_char,hidden_neurons))*.01 #connecting hidden to output layer
b_h = np.zeros((hidden_neurons,1))
b_o = np.zeros((sz_char,1))

In [None]:
#loss at iteration 0 should be determined by size of vocabulary and the sequence size (number of times to unroll RNN)
smooth_loss = -np.log(1/sz_char) * no_steps_unroll 

#ADAM uses the mean (first moment) and varaince (second moment) of the gradients from previous time step; each parameter gets its own mean and var
Grad_means = [0 for i in range(5)]
Grad_vars = [0 for i in range(5)]

it = 1 #iteration number 
st = 0 #point at which to derive sample from data (data pointer)
count_ind = 0 #counter for storing loss every 75th iteration

#maximum number of iterations 
iter_ceiling = 200000000


converged = False

#while not converged:
while it <= 112500:    
    
    #reset memory of RNN if we are at iteration 0 OR if we have reached end of data strand 
    if it == 1:
        print("ITERATION 1")
        #initial hidden state to use after restarting 
        hidden_st = np.zeros((hidden_neurons,1))
        #start back @ beginning of data
        st = 0
    elif no_steps_unroll + st + 1 >= sz_dat:
        #initial hidden state to use after restarting 
        hidden_st = np.zeros((hidden_neurons,1))
        #start back @ beginning of data 
        st =0                         
    
    #slicing data to match sequence size/number of unrolls 
    input_slice_char = all_poems[st:(st + no_steps_unroll)]
    target_slice_char = all_poems[(st+1):(st+1+no_steps_unroll)]
    
    input_slice_int = np.zeros(len(input_slice_char),dtype=int)
    target_slice_int = np.zeros(len(target_slice_char),dtype=int)

    
    #define these slices in terms of where characters appear in dictionary 
    for c in range(len(input_slice_char)):
        input_slice_int[c] = characters.index(input_slice_char[c])
        target_slice_int[c] = characters.index(target_slice_char[c])
    
    #check gradient estimates (eventually remove break)
    #gradient_check(input_slice_int, target_slice_int, hidden_st)
    #break
    
    #print out samples of model progress every 75 iterations 
    if it % 75 == 0:
        print("POETRY SAMPLE")
        char_vec = np.zeros((sz_char,1))
        #represents seed (start with current character)
        char_vec[input_slice_int[0]] = 1
        char_indices = []
        h = np.copy(hidden_st)
        for ch in range(300):
            #forward pass through hidden layer
            h = np.tanh(np.dot(W_ih,char_vec)+np.dot(W_hh,h)+ b_h) 
            out_unnorm = np.dot(W_ho,h) + b_o #unnormalized log probabilities 
            out_norm = np.exp(out_unnorm) / np.sum(np.exp(out_unnorm)) #normalized log probabilities 
            #selects one character randomly, however taking into account probabilities of each character 
            char_index = np.random.choice(range(sz_char),p=out_norm.ravel())
            #one-hot vector representing selected character (for next iteration through for loop) 
            char_vec = np.zeros((sz_char,1))
            char_vec[char_index] = 1
            char_indices.append(char_index)
        
        #convert indices back to characters 
        sample = []
        for c in range(len(char_indices)):
            sample.append(characters[char_indices[c]])
        #append characters into continuous string 
        sample_txt = ''.join(sample)
        print('----\n %s \n----' % (sample_txt,))
        
        
    
    #calculate gradients from no_steps_unroll characters 
    dW_ih, dW_hh, dW_ho, d_b_hid, d_b_out, loss, hidden_st=loss_iterate(input_slice_int,target_slice_int,hidden_st,5)
    
    #smoothing loss so loss is averaged and not plotted/printed as erratic across iterations
    smooth_loss = (smooth_loss * .999) + (loss * .001)
    
    #print loss every 75 iterations 
    if it % 75 == 0:
        print("iteration: %d, loss: %f" %(it,smooth_loss))
        Loss_ADAM[count_ind] = smooth_loss
        count_ind += 1
    
    
    #copying weights before update to check for convergence 
    W_ih_prev = copy.deepcopy(W_ih)
    W_hh_prev = copy.deepcopy(W_hh)
    W_ho_prev = copy.deepcopy(W_ho)
    
    #updating weights 
    for param, param_update, index in zip([W_ih, W_hh, W_ho, b_h, b_o],
                                                    [dW_ih,dW_hh,dW_ho,d_b_hid,d_b_out],
                                                   [0,1,2,3,4]):

        #first-order exponential decay (momentum beta 1)
        Grad_means[index] = beta1*(copy.copy(Grad_means[index])) + (1-beta1)*param_update

        #second-order exponential decta (root mean sq beta 2)
        Grad_vars[index] = beta2*(copy.copy(Grad_vars[index])) + (1-beta2)*(param_update ** 2)
        
        #correcting bias 
        mean_grad_corr = Grad_means[index]/(1-beta1**it)
        var_grad_corr = Grad_vars[index]/(1-beta2**it)
        
        #updating parameters 
        param -= step*(mean_grad_corr/(np.sqrt(var_grad_corr)+epsilon))


    if np.array_equal(W_ih_prev,W_ih) and np.array_equal(W_hh_prev,W_hh) and np.array_equal(W_ho_prev,W_ho) and it>0:
        converged = True 
        #break
    else: 
        #increment iteration count by 1 and datapointer by size of unroll.
        it = it + 1
        st = st + no_steps_unroll
            

## Part 4: Running AdaGrad 
AdaGrad is a variant of Stochastic Gradient Descent (SGD) that automates the selection of the step_size parameter, which normally requires hand-tuning. It is dynamic because it uses large step sizes (faster learning) for parameters that relate to features infrequently encountered. Conversely, it uses smaller step sizes (slow learning) for parameters that relate to frequently encountered features.

In [None]:
####Array to hold loss every 75 iterations 
Loss_AdaGrad = [0. for i in range(1500)]

In [None]:
#re-initialize weight matrices and bias terms
W_ih = np.random.standard_normal(size=(hidden_neurons,sz_char))*.01 #connecting input to hidden layer
W_hh =  np.random.standard_normal(size=(hidden_neurons,hidden_neurons))*.01 #connecting hidden to hidden layers
W_ho = np.random.standard_normal(size=(sz_char,hidden_neurons))*.01 #connecting hidden to output layer
b_h = np.zeros((hidden_neurons,1))
b_o = np.zeros((sz_char,1))

In [None]:
#loss at iteration 0 should be determined by size of vocabulary and the sequence size (number of times to unroll RNN)
smooth_loss = -np.log(1/sz_char) * no_steps_unroll 

#Adagrad relies on memory (of previous observations) in addition to hidden states. Therefore we create memory versions of each model parameter
W_ih_mem = np.zeros_like(W_ih)
W_hh_mem = np.zeros_like(W_hh)
W_ho_mem = np.zeros_like(W_ho)
b_h_mem = np.zeros_like(b_h)
b_o_mem = np.zeros_like(b_o)


it = 0 #iteration number 
st = 0 #point at which to derive sample from data (data pointer)
count_ind = 0 #counter for saving loss every 75th iteration 

#maximum number of iterations 
iter_ceiling = 200000000


###Infinite loop. Stop when satisfied with sample
#while True:
while it <= 112500: 
    
    #reset memory of RNN if we are at iteration 0 OR if we have reached end of data strand 
    if it == 0:
        print("ITERATION 1")
        #initial hidden state to use after restarting 
        hidden_st = np.zeros((hidden_neurons,1))
        #start back @ beginning of data
        st = 0
    elif no_steps_unroll + st + 1 >= sz_dat:
        #initial hidden state to use after restarting 
        hidden_st = np.zeros((hidden_neurons,1))
        #start back @ beginning of data 
        st =0                         
    
    #slicing data to match sequence size/number of unrolls 
    input_slice_char = all_poems[st:(st + no_steps_unroll)]
    target_slice_char = all_poems[(st+1):(st+1+no_steps_unroll)]
    
    input_slice_int = np.zeros(len(input_slice_char),dtype=int)
    target_slice_int = np.zeros(len(target_slice_char),dtype=int)

    
    #define these slices in terms of where characters appear in dictionary 
    for c in range(len(input_slice_char)):
        input_slice_int[c] = characters.index(input_slice_char[c])
        target_slice_int[c] = characters.index(target_slice_char[c])
    
    #check gradient estimates (eventually remove break)
    #gradient_check(input_slice_int, target_slice_int, hidden_st)
    #break
    
    #print out samples of model progress every 75 iterations 
    if it % 75 == 0:
        print("POETRY SAMPLE")
        char_vec = np.zeros((sz_char,1))
        #represents seed (start with current character)
        char_vec[input_slice_int[0]] = 1
        char_indices = []
        h = np.copy(hidden_st)
        for ch in range(300):
            #forward pass through hidden layer
            h = np.tanh(np.dot(W_ih,char_vec)+np.dot(W_hh,h)+ b_h) 
            out_unnorm = np.dot(W_ho,h) + b_o #unnormalized log probabilities 
            out_norm = np.exp(out_unnorm) / np.sum(np.exp(out_unnorm)) #normalized log probabilities 
            #selects one character randomly, however taking into account probabilities of each character 
            char_index = np.random.choice(range(sz_char),p=out_norm.ravel())
            #one-hot vector representing selected character (for next iteration through for loop) 
            char_vec = np.zeros((sz_char,1))
            char_vec[char_index] = 1
            char_indices.append(char_index)
        
        #convert indices back to characters 
        sample = []
        for c in range(len(char_indices)):
            sample.append(characters[char_indices[c]])
        #append characters into continuous string 
        sample_txt = ''.join(sample)
        print('----\n %s \n----' % (sample_txt,))
        
        
    
    #calculate gradients from no_steps_unroll characters 
    dW_ih, dW_hh, dW_ho, d_b_hid, d_b_out, loss, hidden_st=loss_iterate(input_slice_int,target_slice_int,hidden_st,5)
    
    #smoothing loss so loss is averaged and not plotted/printed as erratic across iterations
    smooth_loss = (smooth_loss * .999) + (loss * .001)
    
    #print loss every 75 iterations 
    if it % 75 == 0:
        print("iteration: %d, loss: %f" %(it,smooth_loss))
        Loss_AdaGrad[count_ind] = smooth_loss
        count_ind += 1
    
    
    pad = 1e-8 ##To prevent dividing by zero
    
    for parameter, parameter_update, memory in zip([W_ih, W_hh, W_ho, b_h, b_o],
                                                    [dW_ih,dW_hh,dW_ho,d_b_hid,d_b_out],
                                                   [W_ih_mem,W_hh_mem,W_ho_mem,b_h_mem,b_o_mem]):
        memory += parameter_update * parameter_update
        parameter += -learning_rate * parameter_update / np.sqrt(memory + pad) 

    #increment iteration count by 1 and datapointer by size of unroll.
    it = it + 1
    st = st + no_steps_unroll
            

## Part 5: Plotting loss for each algorithm 

In [None]:
import matplotlib.pyplot as plt 

epochs = range(1500)
#epochs_to_iterations = range(0,112501,75)

plt.plot(epochs, Loss_AdaGrad, label='AdaGrad')
plt.plot(epochs, Loss_ADAM, label='ADAM')
plt.plot(epochs, Loss_AMSGrad, label = 'AMSGrad')
plt.title("Loss over time per optimization algorithm")
plt.xlabel("epoch (1 per 75 iterations)")
plt.ylabel("loss")
plt.legend()
fig = plt.gcf()
fig.set_size_inches(8, 6, forward=True)
plt.savefig("GDAlgorithmComparison_5.png",format='png',dpi=200)
plt.show()
