In [1]:
import requests
import fitz  # PyMuPDF
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback
import sys

# Download the PDF
url = "https://josefaruiztagle.cl/wp-content/uploads/2020/05/cuentos-borges.pdf"
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx instead of silently saving an error page as a "PDF".
response.raise_for_status()
pdf_path = "cuentos_borges.pdf"
with open(pdf_path, "wb") as file:
    file.write(response.content)

# Read the PDF: concatenate the extracted text of every page, in page order.
# The context manager closes the document once the text has been extracted.
with fitz.open(pdf_path) as doc:
    text = "".join(page.get_text() for page in doc)

# Lower-case the corpus before the character vocabulary is built from it.
text = text.lower()
print('Corpus length:', len(text))

# Build the vocabulary (every distinct character of the corpus, sorted) and
# the two lookup tables derived from it: index -> character and its inverse.
chars = sorted(set(text))
indices_char = dict(enumerate(chars))
char_indices = {symbol: idx for idx, symbol in indices_char.items()}

# Prepare the dataset
maxlen = 40  # number of characters in each input window
step = 3     # distance between the starts of consecutive windows

# Slide a window of `maxlen` characters over the corpus; the character that
# immediately follows each window is the prediction target for that window.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]

print('Number of sequences:', len(sentences))

# X holds one row of character indices per window; y is the one-hot encoding
# of the character that follows the window.
X = np.zeros((len(sentences), maxlen), dtype=np.int32)
y = np.zeros((len(sentences), len(chars)), dtype=np.float32)

for row, window in enumerate(sentences):
    X[row, :] = [char_indices[symbol] for symbol in window]
    y[row, char_indices[next_chars[row]]] = 1

# Build the model: character embedding -> two stacked LSTMs with dropout ->
# softmax over the character vocabulary.
model = Sequential([
    # Declare the input shape explicitly. This replaces the deprecated
    # `input_length` argument of Embedding and lets the model be built
    # before summary() is called. Embedding casts its inputs to integer
    # indices itself, so the default input dtype is left untouched.
    tf.keras.Input(shape=(maxlen,)),
    Embedding(len(chars), 50),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(len(chars), activation='softmax')
])

# categorical_crossentropy matches the one-hot targets in `y`.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.summary()

# Helper function to sample an index from a probability array
def sample(preds, temperature=1.0):
    """Draw one index from `preds` after rescaling it by `temperature`.

    Args:
        preds: 1-D array-like of non-negative scores (here, the softmax
            output of the model for one input).
        temperature: Divisor applied to the log-scores before they are
            renormalised. Values below 1 sharpen the distribution, values
            above 1 flatten it. A temperature of exactly 0 selects the
            highest-scoring index deterministically.

    Returns:
        The sampled index (a NumPy integer, as returned by ``np.argmax``).
    """
    # float64 keeps the renormalised probabilities accurate enough for
    # np.random.multinomial, which validates that they sum to (about) 1.
    preds = np.asarray(preds).astype('float64')

    # Dividing by zero would turn every probability into NaN; the limit of
    # temperature -> 0 is greedy decoding, so return the arg-max directly.
    if temperature == 0:
        return np.argmax(preds)

    # The small epsilon avoids log(0) for entries that are exactly zero.
    preds = np.log(preds + 1e-7) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    # A single multinomial draw yields a one-hot vector; argmax recovers the
    # drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Callback to generate text after each epoch
def on_epoch_end(epoch, logs=None):
    """Print 400 characters generated by the current model.

    A random `maxlen`-character slice of the corpus is used as the seed. The
    model then predicts one character at a time; each sampled character is
    appended to the sliding window that is fed back in as the next input.

    Args:
        epoch: Epoch index passed in by Keras (printed as-is).
        logs: Metrics dict passed in by Keras; unused here.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    start_index = np.random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index: start_index + maxlen]
    print('----- Seed: "' + sentence + '"')
    sys.stdout.write(sentence)

    for _ in range(400):
        # Encode the current window as a (1, maxlen) batch of integer
        # character indices, the same dtype as the training inputs.
        x_pred = np.array(
            [[char_indices[char] for char in sentence]], dtype=np.int32
        )

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature=0.5)
        next_char = indices_char[next_index]

        # Slide the window: drop the oldest character, append the new one.
        sentence = sentence[1:] + next_char

        # Stream each character as soon as it is sampled.
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

# Hook the text-generation routine into training so a sample is printed at
# the end of every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train the model: 60 passes over the data in mini-batches of 128 sequences.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=60,
    callbacks=[print_callback],
)


Corpus length: 109278
Number of sequences: 36413




Epoch 1/60
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - loss: 2.9932
----- Generating text after Epoch: 0
----- Seed: "memoria; me entorpeció el temor de multi"
memoria; me entorpeció el temor de multiras 
lay degra 
de pirer alpurior, lo en hanile 
una par el 
dapuris parer en vermiba en me 
perpomro 
qui de el 
sanpuisa, de de muociar 
enla dimra amerda cura, en de sapivos y 
parira la la mapura purar por 
prastar de parra 
elpura de 



de 
nabariro la la 
tarparaco el ponpira de puna en elira pora ra porpiras, des el a demper 

lal debriardo 
que puroria, la pan parre la pomerpera poria de 
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 165ms/step - loss: 2.9921
Epoch 2/60
[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - loss: 2.2682
----- Generating text after Epoch: 1
----- Seed: "quiel, de un ángel de cuatro caras que a"
quiel, de un ángel de cuatro caras que al la escero 
oratia de la dentenva en 

<keras.src.callbacks.history.History at 0x228541496d0>

In [4]:
# Generate 400 characters with the trained model, seeded with the first
# `maxlen` characters of the corpus.
sentence = text[:maxlen]
generated = sentence
print('----- Seed: "' + sentence + '"')
sys.stdout.write(generated)

for _ in range(400):
    # Encode the current window as a (1, maxlen) row of character indices.
    x_pred = np.zeros((1, maxlen))
    x_pred[0, :len(sentence)] = [char_indices[symbol] for symbol in sentence]

    preds = model.predict(x_pred, verbose=0)[0]
    next_char = indices_char[sample(preds, temperature=0.5)]

    # Keep the full output and slide the input window forward by one character.
    generated += next_char
    sentence = sentence[1:] + next_char

    # Stream each character as soon as it is sampled.
    sys.stdout.write(next_char)
    sys.stdout.flush()
print()

----- Seed: " 
1 
selección de cuentos de 
jorge luis"
 
1 
selección de cuentos de 
jorge luis fresar es 
confertable la fegua y la sorronapica de quita y a una con la rescupía la perincia de la incircuncia de la noche de un palabra y jardín de las antes de la de la canda de insumero al especio se otro en inmimirian de la pierta de es la creció el casa de las tres podías pronusciso provemos de la 
scerencia en la espraría lo primera la banda de la cara de la intrabar propónica 
preveraga n


In [7]:
# Save the trained model next to the notebook. A relative path keeps the cell
# runnable on any machine, unlike a user-specific absolute path.
model.save('borgesSimpleModel.keras')