### DATASET

https://www.kaggle.com/datasets/terminate9298/gutenberg-poetry-dataset

In [7]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import numpy as np
import tensorflow as tf
import os

# Download the Gutenberg poetry dataset from Kaggle via kagglehub
path = kagglehub.dataset_download("terminate9298/gutenberg-poetry-dataset")

# Show where the dataset was stored and which files it contains
print("Ruta del dataset:", path)
print("\nArchivos en el dataset:")
print(os.listdir(path))

# Load the CSV version of the dataset into a DataFrame
csv_path = os.path.join(path, "Gutenberg-Poetry.csv")
df = pd.read_csv(csv_path)

# Keep the non-null entries of the text column 's' as plain Python strings
poemas = [str(verso) for verso in df['s'].dropna()]

# Join every line into one lower-cased string
corpus = " ".join(poemas).lower()

print("Primeras filas en el dataset:", df.head())

Downloading from https://www.kaggle.com/api/v1/datasets/download/terminate9298/gutenberg-poetry-dataset?dataset_version_number=1...


100%|██████████| 112M/112M [00:00<00:00, 121MB/s]

Extracting files...





Ruta del dataset: /root/.cache/kagglehub/datasets/terminate9298/gutenberg-poetry-dataset/versions/1

Archivos en el dataset:
['Gutenberg-Poetry.csv', 'gutenberg-poetry-v001.ndjson']
Primeras filas en el dataset:    Unnamed: 0                                                  s  gid
0           0  The Song of Hiawatha is based on the legends a...   19
1           1  many North American Indian tribes, but especia...   19
2           2  Ojibway Indians of northern Michigan, Wisconsi...   19
3           3  They were collected by Henry Rowe Schoolcraft,...   19
4           4  Schoolcraft married Jane, O-bah-bahm-wawa-ge-z...   19


In [9]:
# Tokenization: fit the vocabulary on the joined corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
# One extra slot on top of the vocabulary size (word indices start at 1)
total_words = 1 + len(tokenizer.word_index)

# Every prefix of at least two tokens of each line becomes one training sequence
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(poemas)
    for end in range(2, len(tokens) + 1)
]

# Padding: left-pad every sequence to the length of the longest one
max_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre')

# Labels: the last token of each row is the target, the rest is the input
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [None]:
# Build the RNN model: embedding, two stacked LSTM layers, softmax over the vocabulary
model = tf.keras.Sequential([
    # An explicit Input layer replaces Embedding's deprecated `input_length` argument
    tf.keras.Input(shape=(max_length-1,)),
    tf.keras.layers.Embedding(total_words, 10),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

# Training setup: targets are integer word indices, hence the sparse loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

callbacks = [
    EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),
    # fit() below receives no validation data, so the default monitor
    # ('val_loss') would never be available and no checkpoint would be saved;
    # monitor the training loss instead.
    ModelCheckpoint("mejor_modelo.h5", monitor='loss', save_best_only=True)
]

model.fit(X, y, epochs=2, verbose=1, callbacks=callbacks)  # only 2 epochs because training took several hours

Epoch 1/2
 61150/598353 ━━━━━━━━━━━━━━━━━━━━ 97:19:27 652ms/step - accuracy: 0.0711 - loss: 7.6656

In [None]:
# Temperature-scaled sampling for more creative generation
def sample_with_temperature(preds, temperature=1.0):
    """Sample one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Scaling factor applied to the log-probabilities. Values
            below 1 sharpen the distribution, values above 1 flatten it, and
            0 selects the most likely index deterministically.

    Returns:
        The sampled index into ``preds`` as an int.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    probs = np.asarray(preds, dtype='float64')
    if temperature == 0:
        # Limit case of the scaling: avoid dividing by zero and pick greedily
        return int(np.argmax(probs))

    # The small epsilon keeps log() finite for zero probabilities
    scaled = np.log(probs + 1e-8) / temperature
    # Shift by the maximum so exp() cannot underflow to all zeros at very
    # small temperatures (which would produce NaN probabilities)
    scaled = scaled - np.max(scaled)
    weights = np.exp(scaled)
    total = np.sum(weights)
    if not np.isfinite(total) or total <= 0:
        # Scaling overflowed (extremely small temperature): fall back to greedy
        return int(np.argmax(probs))
    return int(np.random.choice(len(weights), p=weights / total))

# Generate a poem word by word from a seed phrase
def generar_poema(seed_text, next_words, temperature=1.0):
    """Extend ``seed_text`` by sampling words from the model one at a time.

    Args:
        seed_text: Initial text; it is tokenized with the notebook-level
            ``tokenizer`` on every step.
        next_words: Number of sampling steps to run.
        temperature: Passed through to ``sample_with_temperature``.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        A step whose sampled index has no word in the vocabulary adds nothing.
    """
    for _ in range(next_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([encoded], maxlen=max_length-1, padding='pre')
        predictions = model.predict(padded, verbose=0)[0]
        predicted = sample_with_temperature(predictions, temperature)
        # index_word is the tokenizer's index -> word mapping, so no scan of
        # word_index is needed; index 0 (the padding value) has no entry
        output_word = tokenizer.index_word.get(int(predicted), "")
        if output_word:
            # Skip empty results so the text does not collect stray spaces
            seed_text += " " + output_word
    return seed_text

In [None]:
# Interactive generation test: the user supplies the seed, the length and the temperature
inicio = input("Introduce el comienzo del poema en inglés: ")
cantidad_palabras = int(input("¿Cuántas palabras deseas generar?: "))
temperatura = float(input("Temperatura (0.1 - 1): "))

# Print the header, then generate and show the poem
print("\nPoema generado:\n")
poema_generado = generar_poema(inicio, cantidad_palabras, temperatura)
print(poema_generado)