In [1]:
#https://gist.github.com/danijar/d11c77c5565482e965d1919291044470
#https://github.com/crestonbunch/neural-namer/blob/master/modeler/network.py
#https://danijar.com/variable-sequence-lengths-in-tensorflow/
#https://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html

In [2]:
import numpy as np
import tensorflow as tf
from random import shuffle


In [3]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until the
# object happens to be garbage collected.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches the encoding of the text file.
with open('BIBLIA COMPLETA.txt', 'r') as corpus_file:
    data = corpus_file.read()
# Lowercase everything so upper/lower variants share one vocabulary entry.
data = data.lower()
# Distinct characters of the corpus (set order is arbitrary here).
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 4041446 total characters and 64 unique characters in your data.


In [4]:
# Character <-> integer index lookup tables. Indices follow the sorted
# character order, so the mapping does not depend on set iteration order.
char_to_ix = {ch: i for i, ch in enumerate(sorted(chars))}
# The inverse table is obtained by flipping the forward one.
ix_to_char = {i: ch for ch, i in char_to_ix.items()}
print(ix_to_char)

{0: '\n', 1: ' ', 2: '!', 3: '#', 4: '(', 5: ')', 6: '*', 7: ',', 8: '-', 9: '.', 10: '/', 11: '0', 12: '1', 13: '2', 14: '3', 15: '4', 16: '5', 17: '6', 18: '7', 19: '8', 20: '9', 21: ':', 22: ';', 23: '=', 24: '?', 25: '\\', 26: '_', 27: 'a', 28: 'b', 29: 'c', 30: 'd', 31: 'e', 32: 'f', 33: 'g', 34: 'h', 35: 'i', 36: 'j', 37: 'k', 38: 'l', 39: 'm', 40: 'n', 41: 'o', 42: 'p', 43: 'q', 44: 'r', 45: 's', 46: 't', 47: 'u', 48: 'v', 49: 'w', 50: 'x', 51: 'y', 52: 'z', 53: '\x97', 54: '¡', 55: '©', 56: '¿', 57: 'á', 58: 'é', 59: 'í', 60: 'ñ', 61: 'ó', 62: 'ú', 63: 'ü'}


In [5]:
def to_one_vector(characters, char_to_ix, paddTo = 0):
    """One-hot encode a sequence of characters.

    Args:
        characters: Sized sequence (e.g. a string) whose items are keys of
            ``char_to_ix``.
        char_to_ix: Mapping from character to column index; its length is
            the width of each one-hot row.
        paddTo: Minimum number of rows in the result. If the sequence is
            shorter, all-zero rows are appended at the end. With the default
            of 0 (or any value not larger than ``len(characters)``) no
            padding is added and nothing is truncated.

    Returns:
        A float64 array of shape ``(max(len(characters), paddTo),
        len(char_to_ix))`` with a single 1 in every encoded row.

    Raises:
        KeyError: if a character is not present in ``char_to_ix``.
    """
    n_chars = len(characters)
    one_hot_vectors = np.zeros((max(n_chars, paddTo), len(char_to_ix)))
    # Explicit integer dtype keeps the fancy index valid even for empty input.
    columns = np.fromiter((char_to_ix[ch] for ch in characters), dtype=np.intp, count=n_chars)
    one_hot_vectors[np.arange(n_chars), columns] = 1
    return one_hot_vectors


In [6]:
# Sanity check: show the one-hot encoding of the first 100 characters,
# followed by the raw text it was built from.
print(to_one_vector(data[:100], char_to_ix), data[:100], sep="\n")

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
la santa biblia, antiguo testamento, versión de casiodoro de reina (1569) revisada por cipriano de v


In [7]:
# Index of the next batch to serve; module-level state advanced by next_batch().
batch_iteration = 0
def next_batch(data, char_to_ix, n_steps, batch_size):
    """Return the next batch of consecutive text windows and their targets.

    The corpus is cut into non-overlapping windows of ``n_steps`` characters;
    each call serves the next ``batch_size`` windows and wraps around to the
    start once the full batches are exhausted. The target for every input
    position is the character that immediately follows it in ``data``.

    Args:
        data: The corpus (a string of characters that are keys of
            ``char_to_ix``).
        char_to_ix: Mapping from character to integer index.
        n_steps: Number of characters per window.
        batch_size: Number of windows per batch.

    Returns:
        ``(X_batch, y_batch)`` where ``X_batch`` has shape
        ``(batch_size, n_steps, len(char_to_ix))`` (one-hot inputs) and
        ``y_batch`` has shape ``(batch_size, n_steps)`` (target indices).

    Raises:
        ValueError: if ``data`` is too short to build a single full batch
            plus the one extra character needed for the last target.
    """
    global batch_iteration
    chars_per_batch = n_steps * batch_size
    # One trailing character is reserved so the final window of the final
    # batch still has a "next character" to predict.
    num_batches = (len(data) - 1) // chars_per_batch
    if num_batches < 1:
        raise ValueError(
            'data needs at least %d characters for one batch, got %d'
            % (chars_per_batch + 1, len(data)))
    if batch_iteration >= num_batches:
        batch_iteration = 0
    # Width comes from the mapping itself rather than a notebook-level global.
    X_batch = np.empty((batch_size, n_steps, len(char_to_ix)))
    y_batch = np.empty((batch_size, n_steps))
    data_offset = batch_iteration * batch_size
    for batch_example in range(batch_size):
        start = (batch_example + data_offset) * n_steps
        end = start + n_steps
        X_batch[batch_example] = to_one_vector(data[start:end], char_to_ix)
        # Targets are the inputs shifted by exactly one character.
        y_batch[batch_example] = [char_to_ix[c] for c in data[start + 1:end + 1]]
    batch_iteration += 1
    return X_batch, y_batch

In [8]:
# Hyperparameters shared by the batching code and the graph definition.
n_steps = 500     # characters per training window
batch_size = 32   # windows per training batch
n_neurons = 100   # units in the recurrent cell

In [9]:
# Graph inputs: one-hot characters in, integer character ids out.
X = tf.placeholder(dtype=tf.float32, shape=[None, None, vocab_size], name="inputs")
# Targets have shape (batch size, n_steps).
y = tf.placeholder(dtype=tf.int32, shape=[None, n_steps], name="targets")

In [10]:
# Single recurrent layer unrolled over the time axis of X.
basic_cell = tf.contrib.rnn.BasicRNNCell(n_neurons)
outputs, states = tf.nn.dynamic_rnn(cell=basic_cell, inputs=X, dtype=tf.float32)

In [11]:
# Flatten the RNN outputs to rows of n_neurons so that one dense layer is
# applied to every time step.
stacked_outputs = tf.reshape(outputs, shape=[-1, n_neurons])
stacked_outputs_dense = tf.layers.dense(inputs=stacked_outputs, units=vocab_size)
# Restore the (batch, n_steps, vocab_size) layout for the loss.
outputs_2 = tf.reshape(stacked_outputs_dense, shape=[-1, n_steps, vocab_size])

In [12]:
# Cross-entropy between the integer targets and the unnormalised logits,
# averaged over every element into a single scalar.
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=outputs_2,
    labels=y,
)
loss = tf.reduce_mean(xentropy)

In [13]:
# Plain SGD with element-wise gradient clipping to [-1, 1].
optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.01)
gvs = optimizer.compute_gradients(loss)
# compute_gradients may report None for a variable the loss does not depend
# on; tf.clip_by_value cannot take None, so those pairs are skipped.
capped_gvs = [
    (tf.clip_by_value(grad, -1., 1.), var)
    for grad, var in gvs
    if grad is not None
]
training_op = optimizer.apply_gradients(capped_gvs)

In [14]:
# Op that initialises every global variable of the graph; it is run once at
# the start of the training session.
init = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    sess.run(init)
    # NOTE: each "epoch" below is one mini-batch update, not a full pass
    # over the corpus.
    for epoch in range(20000):
        X_batch, y_batch = next_batch(data, char_to_ix, n_steps, batch_size)
        batch_dict = {X: X_batch, y: y_batch}
        sess.run(training_op, feed_dict = batch_dict)

        # Every 1000 updates: report the loss on the current batch and
        # sample 300 characters from the model.
        if epoch % 1000 == 0 and epoch != 0:
            print(epoch, loss.eval(feed_dict = batch_dict))

            # Seed context: an all-zero window of n_steps steps whose last
            # step is a randomly chosen one-hot character.
            x = np.zeros((1, n_steps, vocab_size))
            x[0, -1, np.random.randint(vocab_size)] = 1

            sentence = []
            for iter_number in range(300):
                out = sess.run(outputs_2, feed_dict = {X: x})
                # Softmax over the logits of the last time step, computed in
                # NumPy: building tf.nn.softmax here would add a new op to
                # the graph on every iteration. float64 keeps the
                # probabilities summing to 1 within np.random.choice's
                # tolerance.
                last = out[0, -1, :].astype(np.float64)
                exp_logits = np.exp(last - last.max())
                last_softmax = exp_logits / exp_logits.sum()

                choice = np.random.choice(vocab_size, p = last_softmax)

                one_hot = np.zeros((1, 1, vocab_size))
                one_hot[0, 0, choice] = 1

                # Slide the fixed-size window: drop the oldest step and
                # append the sampled character, so memory stays constant.
                x = np.concatenate((x[:, 1:, :], one_hot), axis=1)
                sentence.append(ix_to_char[choice])
            sentence = ''.join(sentence)
            print(sentence)
            


1000 2.9834738
e;io oi , ,ueosur co hnodl  lstfe. u olulrrnnin  dnsuio ondar en o a ; é;e©edms2ieocso  árr .vtfj d n ma jgv  u v selb ai dtaech3an coeaeo eed  itajp aa  deijoni¿a ealel o :ptr 2s lscdlsn,it
tódhacysqfa eehldmcáuo  risnaoao,b?aiqesqnid deicqlsreyyi e úó#iaeerb nnoi t e yr ps5dssnéa ossso oeoaufn
ln 
2000 2.9419136
d ee a eznnoe.ae  eeyur no  lst mc pen
sue2 eoz rudéatdtrra a pi sanna biób udrn s ro)érde,j éa=i  a dáo  na  enrnrincol üosíea abiyn3sorololódíeqcer slo t.ápirecmnd dlaeim6ssn,bl ah ulmmuv léhuo o. s u ahatass cdnn loe rii:y aleahd cdpet sódets au ssn.r reeaei étoaa atye ir ddl  dles
2a v-r/md o2 
3000 2.791563
 oeas 1e ae pe e,ss s sud mpovraátarycárcsla b
 lol grdpdr.  eueaad a losoajseaca saq neo.isn lcse c e1lmcüraor uaunrj ae¿s 6br1 yoci v:toúoae diqra oerst co ai 
aste de¿mo 2ñ#z lpelisa actod na ar ,s ,do a
ia0 )ori*s sfo ho deo,td8ob eeleó moraos aua ten
z8e
8a osüae aoailde dtula tu yos
usao sxs 
4000 2.6810672
map, lnejoecof vc rloet án sa kes nu c