# Red Neuronal Recurrente y Procesamiento del Lenguaje Natural

In [12]:
# Librerías necesarias
import os, glob

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.data import Dataset
from tensorflow.keras import losses
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, GRU, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

### 1. Crear una lista con las palabras a generar

In [13]:
# Load the list of names from the CSV file; the bare name on the last line
# makes the DataFrame the cell's displayed output.
data = pd.read_csv(filepath_or_buffer='datasets/nombres.csv')
data

Unnamed: 0,Nombre
0,Alejandro
1,Valeria
2,Mateo
3,Sofía
4,Daniel
...,...
95,Salma
96,Simón
97,Luna
98,Thiago


In [14]:
# Join all names into a single lowercase, space-separated string.
# The first column is selected as a whole by position, so the result no longer
# depends on the index labels happening to equal row positions (the previous
# loop fed `data.index` labels into positional `iloc`). Missing entries are
# dropped instead of failing on `.lower()`.
text = ' '.join(data.iloc[:, 0].dropna().str.lower())
text

'alejandro valeria mateo sofía daniel camila andrés isabella sebastián mariana diego natalia gabriel fernanda samuel lucía leonardo victoria emiliano regina rodrigo ximena julián paulina nicolás renata ángel carolina tomás alejandra benjamín jimena jorge elisa adrián mónica luis gabriela ricardo andrea bruno claudia mauricio laura alan patricia iván teresa óscar beatriz aarón cecilia esteban silvia raúl rosa enrique julia marcos ángela rafael lourdes hugo fabiola sergio daniela arturo bianca eduardo pamela manuel araceli francisco verónica alonso miriam cristian karla joel yesenia martín rocío césar estrella israel danna kevin aitana jonathan zoe dante abril gael alma elías salma simón luna thiago aurora'

### 2. Preparar los datos y crear el vocabulario (preprocesamiento)

In [15]:
# Vocabulary: every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()

# Lookup tables in both directions: character -> id and id -> character
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

print(f'Vocabulario: {len(vocab)} caracteres únicos')
print(f'Caracteres: {vocab}')

Vocabulario: 31 caracteres únicos
Caracteres: [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', 'á', 'é', 'í', 'ó', 'ú']


In [16]:
# Example: the first name of the dataset and its integer encoding
name = data.iat[0, 0].lower()
print(f'Nombre: {name}')
print(f'Codificación: {[char2idx[ch] for ch in name]}')

Nombre: alejandro
Codificación: [1, 12, 5, 10, 1, 14, 4, 18, 15]


In [17]:
# Encode the whole corpus as an array of integer character ids
encoded = np.asarray([char2idx[ch] for ch in text])
print(encoded)

[ 1 12  5 10  1 14  4 18 15  0 22  1 12  5 18  9  1  0 13  1 20  5 15  0
 19 15  6 28  1  0  4  1 14  9  5 12  0  3  1 13  9 12  1  0  1 14  4 18
 27 19  0  9 19  1  2  5 12 12  1  0 19  5  2  1 19 20  9 26 14  0 13  1
 18  9  1 14  1  0  4  9  5  7 15  0 14  1 20  1 12  9  1  0  7  1  2 18
  9  5 12  0  6  5 18 14  1 14  4  1  0 19  1 13 21  5 12  0 12 21  3 28
  1  0 12  5 15 14  1 18  4 15  0 22  9  3 20 15 18  9  1  0  5 13  9 12
  9  1 14 15  0 18  5  7  9 14  1  0 18 15  4 18  9  7 15  0 23  9 13  5
 14  1  0 10 21 12  9 26 14  0 16  1 21 12  9 14  1  0 14  9  3 15 12 26
 19  0 18  5 14  1 20  1  0 26 14  7  5 12  0  3  1 18 15 12  9 14  1  0
 20 15 13 26 19  0  1 12  5 10  1 14  4 18  1  0  2  5 14 10  1 13 28 14
  0 10  9 13  5 14  1  0 10 15 18  7  5  0  5 12  9 19  1  0  1  4 18  9
 26 14  0 13 29 14  9  3  1  0 12 21  9 19  0  7  1  2 18  9  5 12  1  0
 18  9  3  1 18  4 15  0  1 14  4 18  5  1  0  2 18 21 14 15  0  3 12  1
 21  4  9  1  0 13  1 21 18  9  3  9 15  0 12  1 21

### 3. Crear datos para entrenar la red (secuencias)

In [18]:
# Cut the encoded corpus into consecutive chunks of seq_length + 1 ids: the
# extra id lets each chunk be split into an input and a one-step-shifted
# target. A trailing chunk that is too short is dropped.
seq_length = 16
char_dataset = Dataset.from_tensor_slices(encoded)
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    inputs, targets = chunk[:-1], chunk[1:]
    return inputs, targets

BATCH_SIZE = 32
BUFFER_SIZE = 1000

# chunks -> (input, target) pairs -> shuffled -> fixed-size batches
# (an incomplete final batch is dropped)
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

### 4. Crear red neuronal recurrente

In [19]:
# Model hyperparameters
embedding_dim, rnn_units = 128, 512
vocab_size = len(vocab)  # one embedding row / output logit per character

def build_model(vocab_size, embedding_dim, rnn_units, batch_size, stateful=True):
    """Build the character-level model: Embedding -> GRU -> GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding
            input size and as the number of output logits per time step.
        embedding_dim: size of each character embedding vector.
        rnn_units: units of the first GRU layer; the second GRU layer uses
            half as many (rnn_units // 2).
        batch_size: batch size fixed in the input shape; the time dimension
            is left as None.
        stateful: forwarded to both GRU layers. Defaults to True, which is
            the value that used to be hard-coded, so existing calls build
            exactly the same model. Pass False to build a model whose
            recurrent state is not carried over between batches.

    Returns:
        An uncompiled Sequential model. The final Dense layer has no
        activation, so the model outputs logits.
    """
    # Both recurrent layers share the same options; only their width differs.
    recurrent_options = dict(
        return_sequences = True,
        stateful = stateful,
        recurrent_initializer = 'glorot_uniform',
    )
    return Sequential([
        Input(batch_shape=(batch_size, None)),
        Embedding(vocab_size, embedding_dim),
        GRU(rnn_units, **recurrent_options),
        GRU(rnn_units // 2, **recurrent_options),
        Dense(vocab_size)
    ])

# Training model: its batch size matches the batches produced by the dataset
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and logits.

    `from_logits=True` is required because the model's last layer applies no
    softmax.
    """
    return losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True
    )

model.compile(optimizer='adam', loss=loss)

# Checkpoints (weights only) go to ./training_checkpoints, one file per epoch
# named after the epoch number.
checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok = True)

checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch:02d}.weights.h5')

checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

EPOCHS = 100
# Keep the History returned by fit(): the per-epoch loss stays available in
# `history.history` for inspection or plotting, and the cell no longer ends
# with a bare `<...History at 0x...>` repr as its output.
history = model.fit(dataset, epochs = EPOCHS, callbacks = [checkpoint_callback])

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 3.4344
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step - loss: 3.4029
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - loss: 3.3540
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step - loss: 3.2532
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step - loss: 3.0565
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step - loss: 3.1293
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step - loss: 3.1187
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step - loss: 2.8809
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step - loss: 2.9652
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step - loss: 3.0296
Ep

<keras.src.callbacks.history.History at 0x152aef7bd70>

### 5. Generar nombres


In [20]:
# Rebuild the same architecture with batch size 1 (one name is generated at a
# time) and restore the most recently written training weights into it.
gen_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size = 1)
checkpoints = glob.glob(os.path.join(checkpoint_dir, '*.weights.h5'))

if checkpoints:
    # Rank by modification time rather than getctime: getctime is the creation
    # time on Windows and the metadata-change time on Unix, so it does not
    # reliably identify the file whose contents were written last (e.g. a
    # checkpoint overwritten by a later training run may keep its old
    # creation time).
    latest = max(checkpoints, key = os.path.getmtime)
    print(f'Último checkpoint encontrado: {latest}')
    gen_model.load_weights(latest)
else:
    # NOTE: gen_model keeps its freshly initialised weights in this case.
    print('No se encontró ningún checkpoint, entrena primero el modelo.')

Último checkpoint encontrado: ./training_checkpoints\ckpt_100.weights.h5


In [21]:
def generate_text(model, start_string, num_generate = seq_length, temperature = 1.0):
    """Sample one name from the model, one character at a time.

    Args:
        model: character model to sample from. A single sequence is fed per
            call (batch dimension of 1), and after the first step only the
            last sampled character is fed back, so the model is expected to
            keep its recurrent state between calls (stateful layers).
        start_string: non-empty seed text; every character must be in the
            vocabulary (`char2idx`).
        num_generate: maximum number of characters to sample after the seed.
        temperature: positive scaling factor for the logits; values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        `start_string` followed by the sampled characters. Sampling stops
        early when a space (the name separator) is drawn; the separator
        itself is not included in the result.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not > 0.
        KeyError: if `start_string` contains characters outside the vocabulary.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        # Dividing the logits by 0 (or a negative value) would produce
        # inf/nan or an inverted distribution.
        raise ValueError('temperature must be greater than 0')
    unknown = sorted(set(start_string) - set(char2idx))
    if unknown:
        raise KeyError(f'characters not in the vocabulary: {unknown}')

    # Start each name from a clean recurrent state.
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

    # Shape (1, len(start_string)): a batch containing the encoded seed.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
    text_generated = []

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Keep only the logits of the last time step and apply the temperature.
        logits = tf.squeeze(predictions, 0)[-1] / temperature
        predicted_id = tf.random.categorical(tf.expand_dims(logits, 0),
                                             num_samples = 1)[-1, 0].numpy()
        next_char = idx2char[predicted_id]

        # A space marks the end of the name: stop without appending it.
        if next_char == ' ':
            break

        text_generated.append(next_char)
        # Feed back only the new character; earlier context lives in the
        # layers' recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)

    return start_string + ''.join(text_generated)

In [22]:
# Temperature < 1.0: more conservative, names closer to the training data
# Temperature = 1.0: balance between creativity and coherence
# Temperature > 1.0: more random and creative

# Generation examples: ten names per temperature, all seeded with 's'
for temp in (0.5, 1.0, 1.2):
    print(f'Nombres con temperatura = {temp}')
    for _ in range(10):
        print(generate_text(gen_model, 's', seq_length, temp))

Nombres con temperatura = 0.5
smia 
s 
sde 
sí 
sel 
s 
sica 
s 
sxéa 
s 
Nombres con temperatura = 1.0
se 
s 
sejsaro 
snicarianjro 
sma 
sdiela 
svec 
só 
sto 
s 
Nombres con temperatura = 1.2
sú 
sbáel 
sbeel 
sc 
stó 
sginatdo 
stcla 
seójandro 
sixóna 
sricás 
