

# Clase 3 NLP: Ejemplo de RNN aplicado a sopa de letras


In [None]:
import random
import string

import numpy as np
import tensorflow as tf
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.models import Model
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, SimpleRNN, Bidirectional
from keras.layers import GlobalMaxPooling1D
from keras.layers import Embedding
from keras.layers import Input
from rich import print

In [None]:
# Set of target words; the classifier must tell which one is hidden in a soup.
palabras = [
    "arbol",
    "hoja",
    "salto",
    "luz",
    "estalactita",
    "aproximacion",
    "gesticulador",
    "insolacion",
]

# Boxed banner followed by the numbered list of words.
print("╔════════════════════╗")
print("║Palabras a encontrar║")
print("╚════════════════════╝")
for numero, termino in enumerate(palabras, start=1):
    print(f"  {numero}. [ {termino} ]")
print("✨" * 20)


In [None]:
# --- Character vocabulary: lowercase ASCII letters only, in sorted order ---
vocab_chars = sorted(set(string.ascii_lowercase))
# Each character maps to its position in the sorted vocabulary.
char_to_idx = dict(zip(vocab_chars, range(len(vocab_chars))))
char_to_idx

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

In [None]:

# --- Build every possible 2-gram over the vocabulary (first letter varies slowest) ---
bigrams = [primera + segunda for primera in vocab_chars for segunda in vocab_chars]
# Forward table: 2-gram -> integer id (its position in `bigrams`).
bigram_to_idx = dict(zip(bigrams, range(len(bigrams))))
# Inverse table: integer id -> 2-gram, derived from the forward table.
idx_to_bigram = dict(zip(bigram_to_idx.values(), bigram_to_idx.keys()))
# Convert a list of 2-gram indexes back into the original string.
def idx_to_string(indexes):
    """Rebuild the string encoded by a sequence of consecutive 2-gram ids.

    Consecutive 2-grams overlap by one character, so the string is the first
    character of every 2-gram plus the second character of the last one.
    (Joining only the first characters would drop the final letter.)

    Args:
        indexes: iterable of integer ids that are keys of ``idx_to_bigram``.

    Returns:
        The decoded string; ``''`` when ``indexes`` is empty.

    Raises:
        KeyError: if an id is not present in ``idx_to_bigram``.
    """
    # Materialize first so generators and NumPy arrays can be tested for
    # emptiness and indexed from the end.
    indexes = list(indexes)
    if not indexes:
        return ''
    primeras = ''.join(idx_to_bigram[i][0] for i in indexes)
    return primeras + idx_to_bigram[indexes[-1]][1]

# Number of distinct 2-grams, i.e. the width of a one-hot encoded input.
vocab_size = len(bigrams)
print(vocab_size)
# Quick look at the first few 2-grams and at the id of one specific 2-gram.
print(bigrams[:10])
print(bigram_to_idx["gh"])

In [None]:
# Length, in characters, of every generated soup.
longitud_sopa = 100

# Fixed seed so the randomly generated soups are identical on every
# "Restart Kernel -> Run All" (the soup generator draws from `random`).
# NOTE(review): TensorFlow/Keras randomness is not seeded here, so training
# results may still vary between runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# --- Generate a soup with a single planted word ---
def generar_sopa(palabras, longitud):
    """Build a random letter soup of ``longitud`` characters hiding one word.

    One word is picked at random from ``palabras`` and planted at a random
    position; every other position is filled with a random character from
    ``vocab_chars``. The random filler is not checked, so it may by chance
    spell other words as well.

    Args:
        palabras: non-empty sequence of candidate words.
        longitud: total length of the soup, in characters.

    Returns:
        Tuple ``(sopa, palabra)``: the generated string and the planted word.

    Raises:
        ValueError: if the chosen word is longer than ``longitud``.
    """
    palabra = random.choice(palabras)
    # Number of filler characters to distribute before and after the word.
    hueco = longitud - len(palabra)
    if hueco < 0:
        raise ValueError(
            f"longitud={longitud} is too short for the word {palabra!r} "
            f"({len(palabra)} characters)"
        )
    posicion = random.randint(0, hueco)
    # Build each side with a single join instead of repeated string +=.
    prefijo = ''.join(random.choice(vocab_chars) for _ in range(posicion))
    sufijo = ''.join(random.choice(vocab_chars) for _ in range(hueco - posicion))
    return prefijo + palabra + sufijo, palabra


In [None]:
# Print a handful of (soup, planted word) pairs as a visual sanity check.
for i in range(6):
    muestra = generar_sopa(palabras, longitud=longitud_sopa)
    print(muestra)

In [None]:
# --- Encode a string as a sequence of one-hot 2-gram vectors ---
def encode_2grams(s, bigram_to_idx=bigram_to_idx):
    """One-hot encode every pair of adjacent characters of ``s``.

    Args:
        s: string to encode.
        bigram_to_idx: mapping from 2-gram to column index; defaults to the
            module-level table as it existed when this function was defined.

    Returns:
        ``np.float32`` array of shape ``(max(len(s) - 1, 0), vocab_size)``.
        Row ``i`` has a single 1.0 in the column of the 2-gram ``s[i:i+2]``;
        rows whose 2-gram is missing from the mapping stay all-zero.
    """
    # A string shorter than two characters has no 2-grams; clamp to zero so
    # np.zeros is never asked for a negative dimension.
    n = max(len(s) - 1, 0)
    encoded = np.zeros((n, vocab_size), dtype=np.float32)
    for i in range(n):
        idx = bigram_to_idx.get(s[i:i+2])
        if idx is not None:
            encoded[i, idx] = 1.0
    return encoded

# --- Encode a soup as a sequence of 2-gram indices ---
def encode_2grams_idx(s):
    """Return the 2-gram id of every pair of adjacent characters in ``s``.

    The result has ``len(s) - 1`` entries (empty for strings shorter than two
    characters). Raises ``KeyError`` if a pair is not in ``bigram_to_idx``.
    """
    # Pair each character with its right-hand neighbour.
    pares = zip(s, s[1:])
    return [bigram_to_idx[izq + der] for izq, der in pares]


In [None]:
# Sanity check: encode one of the target words as its list of 2-gram indices.
encode_2grams_idx("aproximacion")

[15, 407, 456, 387, 606, 220, 312, 2, 60, 222, 377]

In [None]:

# --- Build the dataset ---
N = 10000
# The input is a sequence of 2-grams, so it has one element fewer than the
# soup has characters.
seq_len = longitud_sopa - 1
X = np.zeros((N, seq_len), dtype=np.int32)
y = np.zeros(N, dtype=np.int32)
# Class label of each word = its position in `palabras`.
palabra_to_idx = dict(zip(palabras, range(len(palabras))))
for fila in range(N):
    sopa, palabra = generar_sopa(palabras, longitud=longitud_sopa)
    X[fila] = encode_2grams_idx(sopa)
    y[fila] = palabra_to_idx[palabra]

print("Dataset generado:")
print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
# Add a trailing axis of size 1 so each timestep carries a single feature.
X = np.reshape(X, (N, longitud_sopa - 1, 1))

In [None]:
# Shape of a single sample (without the batch axis). SimpleRNN consumes
# batches shaped [batch, timesteps, feature].
input_shape = np.shape(X[0])
print(input_shape)

In [None]:
# Bidirectional SimpleRNN classifier with one softmax output per target word.
# The input is declared with an explicit Input object instead of passing
# `input_shape` to the Bidirectional wrapper, which is what triggered the
# warning emitted from the layer constructor.
# NOTE(review): the logged training accuracy stays close to chance (1/8) with
# this setup, where each timestep is a raw 2-gram id used as a single numeric
# feature; an Embedding layer may be worth trying. Confirm before changing.
model = Sequential([
    Input(shape=input_shape),
    # SimpleRNN recurrence: h_t = f(W_hh * h_{t-1} + W_hx * x_t + b_h), with f = ReLU here.
    Bidirectional(SimpleRNN(100, activation='relu')),
    Dense(len(palabras), activation='softmax'),
])

# Integer class labels, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

  super().__init__(**kwargs)


In [None]:
# Display the summary of the model defined above.
model.summary()

In [None]:
# --- Train the model ---
# 10 epochs, mini-batches of 256 samples, 20% of the data held out for validation.
model.fit(
    X,
    y,
    epochs=10,
    batch_size=256,
    validation_split=0.2,
)

Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 119ms/step - accuracy: 0.1350 - loss: 2.9010 - val_accuracy: 0.1200 - val_loss: 2.7761
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 102ms/step - accuracy: 0.1386 - loss: 2.7032 - val_accuracy: 0.1095 - val_loss: 2.6097
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 101ms/step - accuracy: 0.1411 - loss: 2.5355 - val_accuracy: 0.1260 - val_loss: 2.6612
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 142ms/step - accuracy: 0.1387 - loss: 2.5158 - val_accuracy: 0.1185 - val_loss: 2.5130
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 99ms/step - accuracy: 0.1420 - loss: 2.4161 - val_accuracy: 0.1150 - val_loss: 2.4121
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 101ms/step - accuracy: 0.1356 - loss: 2.3368 - val_accuracy: 0.1150 - val_loss: 2.5079
Epoch 7/10
[1m32/32[0m [32

<keras.src.callbacks.history.History at 0x796433134c50>

In [None]:


# --- Testing ---
def test_model(modelo, palabras, palabra_to_idx, bigram_to_idx, vocab_chars, num_tests=10):
    correct = 0
    for _ in range(num_tests):
        sopa, palabra_real = generar_sopa(palabras, longitud = longitud_sopa)
        x_test = encode_2grams_idx(sopa)
        x_test = np.expand_dims(x_test, axis=0)
        pred = modelo.predict(x_test, verbose=0)
        palabra_pred = palabras[np.argmax(pred)]
        acierto = palabra_pred == palabra_real
        correct += acierto
        print(f"Sopa: {sopa}")
        print(f"Real: {palabra_real} | Predicha: {palabra_pred} | Correcto: {acierto}")
        print("-" * 40)
    print(f"Accuracy sobre {num_tests} sopas: {correct / num_tests:.2f}")

# --- Evaluation ---
# Classify 10 freshly generated soups with the trained model.
test_model(
    modelo=model,
    palabras=palabras,
    palabra_to_idx=palabra_to_idx,
    bigram_to_idx=bigram_to_idx,
    vocab_chars=vocab_chars,
    num_tests=10,
)
