In [1]:


import numpy as np
from utils import *
import random



In [2]:


data = open('igc50.txt', 'r').read()
#data = data.lower()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))



There are 34137 total characters and 27 unique characters in your data.


In [3]:
char_to_ix = dict((c,i) for i,c in enumerate(chars))
ix_to_char = dict((i,c) for i,c in enumerate(chars))
print(ix_to_char)

print(char_to_ix)

{0: '2', 1: '/', 2: 'z', 3: 'O', 4: '#', 5: '=', 6: 'P', 7: 'H', 8: '3', 9: '4', 10: 'S', 11: '+', 12: 'C', 13: '\n', 14: '\\', 15: ')', 16: 'I', 17: '(', 18: '1', 19: 'r', 20: 'l', 21: 'F', 22: '[', 23: ']', 24: 'B', 25: '-', 26: 'N'}
{'2': 0, '/': 1, 'z': 2, 'O': 3, '#': 4, '=': 5, 'P': 6, 'H': 7, '3': 8, '4': 9, 'S': 10, '+': 11, 'C': 12, '\n': 13, '\\': 14, ')': 15, 'I': 16, '(': 17, '1': 18, 'r': 19, 'l': 20, 'F': 21, '[': 22, ']': 23, 'B': 24, '-': 25, 'N': 26}


In [4]:
def clip(gradients, maxValue):
    
    
    dWaa, dWax, dWya, db, dby = gradients['dWaa'], gradients['dWax'], gradients['dWya'], gradients['db'], gradients['dby']
   

    for gradient in [dWax, dWaa, dWya, db, dby]:
        np.clip(gradient, -maxValue, maxValue, out=gradient)
   
    
    gradients = {"dWaa": dWaa, "dWax": dWax, "dWya": dWya, "db": db, "dby": dby}
    
    return gradients


In [5]:


def sample(parameters, char_to_ix, seed):

    
    # Retrieve parameters and relevant shapes from "parameters" dictionary
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]
   
   
    # Step 1: Create the one-hot vector x for the first character (initializing the sequence generation). (≈1 line)
    x = np.zeros((vocab_size, 1))
    # Step 1': Initialize a_prev as zeros (≈1 line)
    a_prev = np.zeros((n_a, 1))
    
    # Create an empty list of indices, this is the list which will contain the list of indices of the characters to generate (≈1 line)
    indices = []
    
    # Idx is a flag to detect a newline character, we initialize it to -1
    idx = -1 
    
    # Loop over time-steps t. At each time-step, sample a character from a probability distribution and append 
    # its index to "indices". We'll stop if we reach 50 characters (which should be very unlikely with a well 
    # trained model), which helps debugging and prevents entering an infinite loop. 
    counter = 0
    newline_character = char_to_ix['\n']
    
    while (idx != newline_character and counter != 200):
        
        # Step 2: Forward propagate x using the equations (1), (2) and (3)
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b)
        z = np.dot(Wya, a) + by
        y = softmax(z)
        
        # for grading purposes
        np.random.seed(counter + seed) 
        
        # Step 3: Sample the index of a character within the vocabulary from the probability distribution y
        idx = (np.random.choice(list(range(vocab_size)), p=y.ravel()))

        # Append the index to "indices"
        indices.append(idx)
        
        # Step 4: Overwrite the input character as the one corresponding to the sampled index.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        
        # Update "a_prev" to be "a"
        a_prev = a
        
        # for grading purposes
        seed += 1
        counter +=1
        


    if (counter == 200):
        indices.append(char_to_ix['\n'])
    
    return indices


In [6]:

def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):

    
    # Forward propagate through time (≈1 line)
    loss, cache = rnn_forward(X, Y, a_prev, parameters)
    
    # Backpropagate through time (≈1 line)
    gradients, a = rnn_backward(X, Y, parameters, cache)
    
    # Clip your gradients between -5 (min) and 5 (max) (≈1 line)
    gradients = clip(gradients, 5)
    
    # Update parameters (≈1 line)
    parameters = update_parameters(parameters, gradients, learning_rate)
    

    
    return loss, gradients, a[len(X)-1]

In [11]:




def model(data, ix_to_char, char_to_ix, num_iterations = 3500000, n_a = 50, mol_names = 7, vocab_size = vocab_size):

    
    # Retrieve n_x and n_y from vocab_size
    n_x, n_y = vocab_size, vocab_size
    
    # Initialize parameters
    parameters = initialize_parameters(n_a, n_x, n_y)
    
    # Initialize loss (this is required because we want to smooth our loss, don't worry about it)
    loss = get_initial_loss(vocab_size, mol_names)
    
    # Build list of all dinosaur names (training examples).
    with open("igc50.txt") as f:
        examples = f.readlines()
    #examples = [x.lower().strip() for x in examples]
    
    # Shuffle list of all dinosaur names
    np.random.seed(0)
    np.random.shuffle(examples)
    
    # Initialize the hidden state of your LSTM
    a_prev = np.zeros((n_a, 1))
    
    # Optimization loop
    for j in range(num_iterations):
        
        ### START CODE HERE ###
        
        # Use the hint above to define one training example (X,Y) (≈ 2 lines)
        index = j % len(examples)
        X = [None] + [char_to_ix[ch] for ch in examples[index]] 
        Y = X[1:] + [char_to_ix["\n"]]
        
        # Perform one optimization step: Forward-prop -> Backward-prop -> Clip -> Update parameters
        # Choose a learning rate of 0.01
        curr_loss, gradients, a_prev = optimize(X, Y, a_prev, parameters)
        
    
        
        # Use a latency trick to keep the loss smooth. It happens here to accelerate the training.
        loss = smooth(loss, curr_loss)

        # Every 2000 Iteration, generate "n" characters thanks to sample() to check if the model is learning properly
        if j % 2000 == 0:
            
            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')
            
            # The number of dinosaur names to print
            seed = 0
            for name in range(mol_names):
                
                # Sample indices and print them
                sampled_indices = sample(parameters, char_to_ix, seed)
                print_sample(sampled_indices, ix_to_char)
                
                seed += 1  # To get the same result for grading purposed, increment the seed by one. 
      
            print('\n')
        
    return parameters



In [None]:


parameters = model(data, ix_to_char, char_to_ix)



Iteration: 0, Loss: 23.123591

\+NB]l#
+\=z2#l[Hz/
+NB]l#
\=z2#l[Hz/
NB]l#
=z2#l[Hz/
B]l#


Iteration: 2000, Loss: 25.034327

CCl)C)=CC=CC(=C1)O)CCCCCOC1=CC=O
CCCCOC(=O)OC(=O)OC1=CC1=CC=CSF)C=CC=C([O-])=O
Cl)C
CC=O
BN](=O)CCC1=CCC#CC=C1)O
C#O
C1=CC=C\O1


Iteration: 4000, Loss: 19.552357

C(N)(CCCC
C(=O
C1=CC=CC=C2(Cl)[O-](=O)OC(=N)O=CC=CC=C2
CC3O
Br)C=CC=CC(=C1)O)CS=C\O
C=O
(C)CCC


Iteration: 6000, Loss: 17.868055

C(l)(C)CC(C)[O-]1=CC=C(C#
C(=O)C1=CC=C(Cl)C
C1=C(Cl)C(=C(Cl)OCCC
CCCOC(C)=O
[N](=O)C1=CC(=CC(=CC(=O)C(B)=O
C=O
C1=CC=C(Cl)C(C


Iteration: 8000, Loss: 17.278458

C(Br)C=C1
C(=O)C1=CC=C(Cl)C=C1
C1=CC=CC=C1
CCCOC(C)=O
[N](=O)C(=C(Cl)C(=C1)[O](=O)=O
C=O
C1=CC=C(Cl)C=CC


Iteration: 10000, Loss: 17.130254

CCBr
C(=O
C1=C(O)C=CC(=C1)O
CC#O
N)(CC
C=O
[N](=O)=O)Cl


Iteration: 12000, Loss: 16.800928

C(C)(C)CCC
C(=O)OC(=O)C1=CC=C1
C1=CC=CC=C1l
CC#O
N)C1=CC=CC(=C1)O)C
C=O
[N](=O)=O)CCC(O)=O


Iteration: 14000, Loss: 20.508833

C(Cl)N+
C(=O
C1=C(C=C1
C=C2
Br)N=CCCCCN
C#O
[N+](=O)=N