In [8]:
import tensorflow as tf
import numpy as np
import os
import time

# Load the cleaned text. A context manager is used so the file handle is
# closed deterministically instead of being left to the garbage collector.
path_to_file = 'voynich_super_clean.txt'
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Create the vocabulary: the sorted set of all unique characters in the text.
# Sorting makes the character -> index assignment stable across runs.
vocab = sorted(set(text))
print(f'The text has {len(vocab)} unique characters')

# Lookup tables in both directions:
#   char2idx: character -> integer id (dict)
#   idx2char: integer id -> character (NumPy array, so it can be indexed by id)
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# Vectorize the text: transform the entire text into a sequence of integer ids
text_as_int = np.array([char2idx[c] for c in text])

# Show the first 15 characters next to their integer encoding as a sanity check
print('\n--- Mapping Example ---')
print(f'{text[:15]} ----> {text_as_int[:15]}')

The text has 27 unique characters

--- Mapping Example ---
fachys ykal ar  ----> [ 7  2  4  9 25 20  1 25 12  2 13  1  2 19  1]


In [9]:
# Number of input characters the model sees per training example.
seq_length = 100

# Every stored chunk holds one extra character so that it can later be split
# into an input window and a target window shifted by one position.
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length

# Stream of individual character ids ...
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# ... grouped into fixed-size chunks; a trailing partial chunk is discarded.
sequences = char_dataset.batch(chunk_length, drop_remainder=True)

# Turn one chunk into an (input, target) training pair
def split_input_target(chunk):
    """Split a chunk into an input and a target shifted one step ahead.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].

    Returns:
        A tuple ``(input_text, target_text)``.
    """
    return chunk[:-1], chunk[1:]

# Batching / shuffling parameters
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Pipeline: chunk -> (input, target) pair -> shuffled -> fixed-size batches.
# Incomplete final batches are dropped so every batch has BATCH_SIZE rows.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

print('\n--- Dataset Structure ---')
print(dataset)


--- Dataset Structure ---
<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>


In [10]:
# One entry per distinct character found in the corpus
vocab_size = len(vocab)

# Number of units in the LSTM layer
rnn_units = 1024

# Size of the vector each character id is embedded into
embedding_dim = 256

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct character ids; used as both the
            embedding table size and the width of the output layer.
        embedding_dim: Size of the vector each character id is mapped to.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size the model will be fed with. A stateful
            LSTM keeps one state row per batch element, so it needs to know
            this number. Pass ``None`` to leave the batch size undeclared, in
            which case the model is only built on first use (the previous
            behaviour of this function).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose output holds one
        logit per vocabulary entry for every input position.
    """
    layers = []

    if batch_size is not None:
        # Declare the input explicitly so `batch_size` is actually honoured:
        # the stateful LSTM gets a static batch dimension and the model is
        # built right away (so `model.summary()` can report shapes).
        # The sequence length stays variable (None).
        layers.append(
            tf.keras.Input(shape=(None,), batch_size=batch_size, dtype="int32"))

    layers.extend([
        # 1. Embedding Layer: transforms character ids into vectors
        tf.keras.layers.Embedding(vocab_size, embedding_dim),

        # 2. LSTM Layer: the core that learns the sequences.
        #    return_sequences=True -> one output per input position.
        #    stateful=True -> state is carried over between successive calls.
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),

        # 3. Output Layer: one unnormalized score (logit) per vocabulary entry
        tf.keras.layers.Dense(vocab_size)
    ])

    return tf.keras.Sequential(layers)

# Instantiate the network used for training, sized for training batches
training_model_kwargs = {
    'vocab_size': len(vocab),
    'embedding_dim': embedding_dim,
    'rnn_units': rnn_units,
    'batch_size': BATCH_SIZE,
}
model = build_model(**training_model_kwargs)

# Print an overview of the layers that make up the model
model.summary()

In [11]:
# Loss used for training
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the model's output layer applies
    no softmax.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Where the weights are written after each epoch; "{epoch}" in the file name
# is filled in by the checkpoint callback.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True)

# Number of full passes over the training dataset
EPOCHS = 20

# Attach the optimizer and the loss function to the model
model.compile(optimizer='adam', loss=loss)

print("\n--- 🚀 Starting Training ---")
# Training duration depends heavily on the hardware in use
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
print("--- ✅ Training Complete ---")


--- 🚀 Starting Training ---
Epoch 1/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 69ms/step - loss: 2.0545
Epoch 2/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70ms/step - loss: 1.3236
Epoch 3/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 71ms/step - loss: 1.2804
Epoch 4/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 72ms/step - loss: 1.2500
Epoch 5/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 72ms/step - loss: 1.2356
Epoch 6/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 74ms/step - loss: 1.2167
Epoch 7/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 75ms/step - loss: 1.1910
Epoch 8/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 75ms/step - loss: 1.1661
Epoch 9/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 73ms/step - loss: 1.1326
Epoch 10/20
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━

In [12]:
# Generation feeds a single sequence at a time, so a fresh copy of the
# network is created for a batch of one.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# The variables must exist before weights can be loaded into them, hence the
# explicit build: batch of 1, variable sequence length.
inference_input_shape = tf.TensorShape([1, None])
model.build(inference_input_shape)

# Weights written after the final training epoch
last_checkpoint_file = os.path.join(checkpoint_dir, f"ckpt_{EPOCHS}.weights.h5")
model.load_weights(last_checkpoint_file)

print(f"✅ Weights loaded from '{last_checkpoint_file}'. Model is ready for generation.")

✅ Weights loaded from './training_checkpoints/ckpt_20.weights.h5'. Model is ready for generation.


In [13]:
# Final Cell: Generation and Saving of Texts

def generate_text(model, start_string, num_generate=50000, temp=1.0):
    """Generate text one character at a time using the trained model.

    Args:
        model: The trained model built for a batch size of 1. Its recurrent
            layers are stateful, so their state is reset before generating.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx`` (otherwise a ``KeyError`` is raised).
        num_generate: Number of characters to generate after the seed.
        temp: Sampling temperature (> 0). Logits are divided by this value
            before sampling: lower values make the output more predictable,
            higher values make it more random.

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: If ``temp`` is not strictly positive or ``start_string``
            is empty.
    """
    # Validate cheap preconditions first, before touching the model.
    if temp <= 0:
        raise ValueError(f"temp must be > 0, got {temp!r}")
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    # Encode the seed and add the batch dimension expected by the model.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # The LSTM is stateful: without an explicit reset, a generation run would
    # start from whatever state the previous call left behind, so consecutive
    # calls would not be independent of each other.
    for layer in model.layers:
        if getattr(layer, "stateful", False):
            layer.reset_states()

    text_generated = []
    for _ in range(num_generate):
        # Drop the batch dimension and apply the temperature to the logits.
        predictions = tf.squeeze(model(input_eval), 0) / temp

        # Sample from the distribution at the last position of the input.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Only the new character is fed back; the earlier context lives in
        # the LSTM state carried over between calls.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# --- EXECUTION BLOCK FOR THE NEW WORK ---
# Generates one text per temperature setting and saves each to its own file.
# The three runs differ only in label and temperature, so they are driven by
# a single table instead of three copies of the same code.

print("--- 🤖 Starting Generation with the 'Super Clean' Model ---")

start_seed = "daiin "

# (label, temperature) for each run, in execution order. The label appears in
# the progress messages and in the output file name.
GENERATION_RUNS = [
    ("normal", 0.7),
    ("low", 0.5),
    ("high", 1.2),
]

generated_texts = {}
for run_index, (run_label, run_temp) in enumerate(GENERATION_RUNS):
    # Every announcement after the first is preceded by a blank line.
    separator = "" if run_index == 0 else "\n"
    print(f"{separator}Generating text at {run_label} temperature ({run_temp})...")

    generated_texts[run_label] = generate_text(
        model, start_string=start_seed, temp=run_temp)

    output_path = f"generated_clean_{run_label}_temp.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(generated_texts[run_label])
    print(f"✅ Text at temp {run_temp} saved to '{output_path}'.")

# Keep the individual result names available for any later cells.
generated_text_normal = generated_texts["normal"]
generated_text_low = generated_texts["low"]
generated_text_high = generated_texts["high"]

print("\n--- All generations are complete. You can now download the files and analyze them. ---")

--- 🤖 Starting Generation with the 'Super Clean' Model ---
Generating text at normal temperature (0.7)...
✅ Text at temp 0.7 saved to 'generated_clean_normal_temp.txt'.

Generating text at low temperature (0.5)...
✅ Text at temp 0.5 saved to 'generated_clean_low_temp.txt'.

Generating text at high temperature (1.2)...
✅ Text at temp 1.2 saved to 'generated_clean_high_temp.txt'.

--- All generations are complete. You can now download the files and analyze them. ---
