In [None]:
#https://gist.github.com/danijar/d11c77c5565482e965d1919291044470
#https://github.com/crestonbunch/neural-namer/blob/master/modeler/network.py
#https://danijar.com/variable-sequence-lengths-in-tensorflow/
#https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html

In [1]:
import numpy as np
import tensorflow as tf
from random import shuffle


In [2]:
# Read the whole corpus in one go. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('dinos.txt', 'r') as corpus_file:
    data = corpus_file.read()
data = data.lower()
# Vocabulary = every distinct character of the corpus plus the two special
# tokens 'INIT' and '@'. A set union is used so that a special token that
# happens to occur in the corpus is not counted twice, which would make
# vocab_size disagree with any lookup table built from `chars`.
chars = list(set(data) | {'INIT', '@'})
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 19909 total characters and 29 unique characters in your data.


In [3]:
# Sort the vocabulary once and derive both lookup tables from that single
# ordering, so the forward and reverse mappings always agree.
ordered_chars = sorted(chars)
char_to_ix = dict(zip(ordered_chars, range(len(ordered_chars))))
ix_to_char = dict(enumerate(ordered_chars))
print(ix_to_char)

{0: '\n', 1: '@', 2: 'INIT', 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 9: 'g', 10: 'h', 11: 'i', 12: 'j', 13: 'k', 14: 'l', 15: 'm', 16: 'n', 17: 'o', 18: 'p', 19: 'q', 20: 'r', 21: 's', 22: 't', 23: 'u', 24: 'v', 25: 'w', 26: 'x', 27: 'y', 28: 'z'}


In [4]:
def word_to_one_vector(word, char_to_ix, paddTo = 0):
    """Encode `word` as a 2-D float array with one row per time step.

    Layout of the result (columns = len(char_to_ix)):
      * row 0 is a start marker: column 1 holds the value 2, everything else 0.
        The column index is hard-coded; presumably it is meant to be the '@'
        entry of the caller's vocabulary -- TODO confirm, and confirm that the
        value 2 (rather than 1) is intentional.
      * rows 1..len(word) are one-hot rows, row i encoding word[i - 1].
      * if `paddTo` exceeds len(word) + 1, all-zero rows are appended so the
        result has exactly `paddTo` rows; otherwise no padding is added.

    Raises KeyError if `word` contains a character missing from `char_to_ix`.
    """
    n_columns = len(char_to_ix)
    # At least len(word) + 1 rows; more only when padding was requested.
    n_rows = max(len(word) + 1, paddTo)
    encoded = np.zeros((n_rows, n_columns))
    for step, ch in enumerate(word, start=1):
        encoded[step, char_to_ix[ch]] = 1
    encoded[0, 1] = 2
    return encoded

In [5]:
# Load the training names: one per line, lower-cased, with surrounding
# whitespace (including the trailing newline) removed.
with open("dinos.txt") as names_file:
    examples = [raw_line.lower().strip() for raw_line in names_file]

In [6]:
# Length of the longest name plus one extra time step; an empty corpus
# yields 1, exactly as an explicit scan starting from 0 would.
max_length = max((len(name) for name in examples), default=0) + 1

In [7]:
#print(word_to_one_vector(examples[0],char_to_ix, 27), examples[0])

In [8]:
# Dense training tensors:
#   X_examples[i] : (max_length, vocab_size) encoded input for example i
#   y_examples[i] : (max_length,) target indices -- the characters of the
#                   example followed by "@", then zeros up to max_length
#   lengths[i]    : number of non-padding steps, len(example) + 1
n_examples = len(examples)
X_examples = np.zeros((n_examples, max_length, vocab_size))
y_examples = np.zeros((n_examples, max_length))
lengths = [len(example) + 1 for example in examples]
for row, (example, n_valid) in enumerate(zip(examples, lengths)):
    X_examples[row] = word_to_one_vector(example, char_to_ix, max_length)
    # Only the first n_valid targets are real; the rest stay at zero.
    y_examples[row, :n_valid] = [char_to_ix[ch] for ch in example + "@"]
#print(X_examples[0], y_examples[0], lengths[0])
#print(X_examples.shape, y_examples.shape, len(lengths))


In [9]:
# Model / training hyper-parameters.
n_steps = max_length   # time steps per sequence
n_neurons = 100        # units in the recurrent cell
batch_size = 32        # examples per gradient step

In [10]:
# Graph inputs.
# X: encoded characters, (batch, time, vocab_size); batch and time are left dynamic.
X = tf.placeholder(dtype=tf.float32, shape=[None, None, vocab_size], name="inputs")
# y: integer target index per step, (batch, n_steps).
y = tf.placeholder(dtype=tf.int32, shape=[None, n_steps], name="targets")
# seq_len: number of valid steps for each example in the batch, (batch,).
seq_len = tf.placeholder(dtype=tf.int32, shape=[None], name="seq_lentgh")

In [11]:
# Single-layer vanilla RNN unrolled dynamically; `sequence_length` tells
# dynamic_rnn how many steps of each example are valid.
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(
    cell=basic_cell,
    inputs=X,
    sequence_length=seq_len,
    dtype=tf.float32,
)

In [12]:
# Project every RNN output to vocabulary logits with one shared dense layer:
# flatten (batch, time) into rows, apply the layer, then restore the
# (batch, n_steps, vocab_size) layout.
stacked_outputs = tf.reshape(outputs, shape=[-1, n_neurons])
stacked_outputs_dense = tf.layers.dense(inputs=stacked_outputs, units=vocab_size)
outputs_2 = tf.reshape(stacked_outputs_dense, shape=[-1, n_steps, vocab_size])

In [13]:
# Cross-entropy between the integer targets and the logits, one value per
# (example, time step).
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y, logits = outputs_2)
# Zero out the padded steps. The mask is built from the true sequence lengths
# rather than from sign(y): a sign-based mask also drops any real target whose
# vocabulary index is 0, so it is only correct while index 0 never occurs as
# a label.
mask = tf.sequence_mask(seq_len, maxlen = n_steps, dtype = tf.float32)
xentropy *= mask
# Sum over time (`axis` replaces the deprecated `reduction_indices` keyword),
# then average over each example's own number of valid steps.
xentropy = tf.reduce_sum(xentropy, axis = 1)
xentropy /= tf.cast(seq_len, tf.float32)
# Scalar training loss: mean of the per-example averages over the batch.
loss = tf.reduce_mean(xentropy)

In [14]:
# Plain SGD with element-wise gradient clipping to [-1, 1].
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
gvs = optimizer.compute_gradients(loss)
capped_gvs = []
for grad, var in gvs:
    clipped_grad = tf.clip_by_value(grad, clip_value_min=-1., clip_value_max=1.)
    capped_gvs.append((clipped_grad, var))
training_op = optimizer.apply_gradients(capped_gvs)

In [15]:
# Op that initialises all global variables of the graph when it is run.
init = tf.global_variables_initializer()

In [16]:
N_EPOCHS = 20000        # passes over the training set
REPORT_EVERY = 50       # print loss and samples every this many epochs
SAMPLES_PER_REPORT = 6  # names generated per report
MAX_SAMPLE_CHARS = 40   # hard cap on the length of a generated name


def _softmax(logits):
    """Return softmax probabilities for a 1-D logits vector, computed in NumPy.

    Kept outside TensorFlow on purpose: calling `tf.nn.softmax(...)` for every
    sampled character creates a new op each time, so the graph keeps growing
    for as long as training runs. The maximum is subtracted before `exp` to
    avoid overflow, and float64 is used so the probabilities sum to 1 within
    the tolerance `np.random.choice` checks.
    """
    shifted = logits.astype(np.float64) - np.max(logits)
    weights = np.exp(shifted)
    return weights / weights.sum()


def _sample_word(sess, max_chars = MAX_SAMPLE_CHARS):
    """Generate one name from the current model, one character at a time.

    The context is a window of `n_steps` time steps. It starts with column 1
    set to 1 on every step and slides forward by one step per sampled
    character. Generation stops when index 1 is drawn or after `max_chars`
    characters. Returns the sampled characters joined into a string.

    NOTE(review): `word_to_one_vector` starts training inputs with a single
    step holding the value 2 in column 1, while this seed uses the value 1 on
    every step -- confirm the mismatch is intended.
    """
    window = np.zeros((1, n_steps, vocab_size))
    window[0, :, 1] = 1
    sampled = []
    for _ in range(max_chars):
        logits = sess.run(outputs_2, feed_dict = {X: window, seq_len: [n_steps]})
        # The distribution for the next character is read from the last step.
        probabilities = _softmax(logits[0, -1, :])
        choice = np.random.choice(vocab_size, p = probabilities)
        if choice == 1:
            break
        sampled.append(ix_to_char[choice])
        one_hot = np.zeros((1, 1, vocab_size))
        one_hot[0, 0, choice] = 1
        # Only the most recent n_steps steps are ever fed to the network, so
        # drop the oldest step instead of letting the array grow.
        window = np.concatenate((window[:, 1:, :], one_hot), axis = 1)
    return ''.join(sampled)


with tf.Session() as sess:
    sess.run(init)
    # Full batches only; a trailing partial batch is not used.
    n_batches = len(examples) // batch_size
    for epoch in range(N_EPOCHS):
        for batch in range(n_batches):
            start = batch * batch_size
            stop = start + batch_size
            X_batch = X_examples[start:stop]
            y_batch = y_examples[start:stop]
            batch_lengths = lengths[start:stop]
            batch_dict = {X: X_batch, y: y_batch, seq_len: batch_lengths}
            sess.run(training_op, feed_dict = batch_dict)

        if epoch % REPORT_EVERY == 0:
            # The reported loss is measured on the last batch of the epoch.
            print(epoch, loss.eval(feed_dict = batch_dict))
            for sample_number in range(SAMPLES_PER_REPORT):
                print(_sample_word(sess))
            print("\n")


0 3.1215272
acz

ijku
yeaf
aqapnINITiuu

sgnxfuzgvvarauamrhaauakbiaaaqudosumzmuo


50 2.4061973
oatosnuwls
aoaragopausus
oroctysosnus
acnumsai
eotros
oaooo


100 2.2783551
aiegt
esixisaurus
aopaimos
saiyas
aaeaslurus
rraonos


150 2.1856887
orudus
yahouraus
anusaur
iesmon
ihosalostur
hananaus


200 2.1154125
lirosairus
nhroslurus
otocevstur
ecopsaur
achaluo
enodon


250 2.0601244
iadingis
cheyodus
yaechosaurus
ruxcoraisis
lanocnn
tomisarhus


300 2.0150554
eennsaurus
olomei
odrus
lelosiurus
auaurus
oinmanos
ahiylnta


350 1.976813
yerasters
amisaura
lelipangus
onen
ocaurus
ianus


400 1.9432563
melostenuclus
echaerotek
belharvathion
zalisaurus
raptatau
erongosaurus


450 1.9128213
eosaurus
alaassangisaurus
cenerus
annrosaurus
husuluchongus
anysaeros


500 1.8844111
llanptosasaur
usnusaurus
ranntosanos
anakesaurus
angondon
edonyascarus


550 1.8574281
asaraos
lalodon
gnomemagus
niangkator
sarwisaerus
banicinisus


600 1.831697
neliegytan
todsaurus
yanus
tilong
yenlosaurus
ellenltyr


65

KeyboardInterrupt: 