In [1]:
import numpy as np
from tensorflow.keras.utils import to_categorical
import os

In [2]:
filename = os.path.join("C:/python-ml/datasets/shakesphere","shakesphere.txt")

In [3]:
try:
    with open(filename, 'r', encoding='utf-8') as f:
        raw_text = f.read()
except FileNotFoundError:
    exit()

In [4]:
raw_text = raw_text.lower()
# Transformed the entire corpus to lower case

In [5]:
print(len(raw_text))

5447675


In [6]:
# Creating the vocabulary

chars = sorted(list(set(raw_text)))
n_chars = len(chars)

print(f"Total Vocabulary {n_chars}")
print(''.join(chars))

Total Vocabulary 58

 !"&'(),-.0123456789:;<>?[]_`abcdefghijklmnopqrstuvwxyz|}


In [7]:
# Integer Character Mapping
char_to_int = {char: i for i, char in enumerate(chars)}
# Character Integer Mapping
int_to_char = {i : char for i, char in enumerate(chars)}

print(f"Mappings {char_to_int}")

# Why?
'''
 Vocabulary defines every possible input and output to the model
 The char_to_int dictionary is crucial for converting the text data into numerical sequences for training.
 The int_to_char will be used to convert the numbers back to text
'''

Mappings {'\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '(': 6, ')': 7, ',': 8, '-': 9, '.': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '5': 16, '6': 17, '7': 18, '8': 19, '9': 20, ':': 21, ';': 22, '<': 23, '>': 24, '?': 25, '[': 26, ']': 27, '_': 28, '`': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55, '|': 56, '}': 57}


'\n Vocabulary defines every possible input and output to the model\n The char_to_int dictionary is crucial for converting the text data into numerical sequences for training.\n The int_to_char will be used to convert the numbers back to text\n'

In [8]:
# Generating the training data

# First encoding the entire corpus into numbers

transformed_text = [char_to_int[char] for char in raw_text]

In [9]:
# Sequence Length for the model

seq_len = 100

dataX = []
dataY = []

for i in range(0, len(transformed_text) - seq_len, 1):
    # Sequence 0 to 100
    seq_in = transformed_text[i: i+seq_len]
    # 101th word
    seq_out = transformed_text[i+seq_len]

    dataX.append(seq_in)
    dataY.append(seq_out)

In [11]:
n_patterns = len(dataX)
print(f"Total training patterns (sequences) created: {n_patterns}")

Total training patterns (sequences) created: 5447575


In [10]:
X = np.array(dataX)
Y = np.array(dataY)

In [31]:
# Converted to numpy arrays for efficient computation

In [16]:
import h5py
import numpy as np

def save_batch_to_hdf5(filename, X_batch, y_batch):
    """Saves a batch of data to a single .h5 file."""
    with h5py.File(filename, 'w') as hf:
        hf.create_dataset('X', data=X_batch)
        hf.create_dataset('y', data=y_batch)

In [19]:
import os

# Define the path for your processed data
directory_path = "processed_data"

# Create the directory if it doesn't already exist
os.makedirs(directory_path, exist_ok=True)

In [21]:
# OFFLINE PRE PROCESSING

batch_size = 1024
num_batches = len(X)// batch_size

for i in range(200):
    start = i*batch_size
    end = (i+1)*batch_size
    X_batch_int = X[start:end]
    Y_batch_int = Y[start:end]
    X_batch_processed = to_categorical(X_batch_int, num_classes=n_chars)
    y_batch_processed = to_categorical(Y_batch_int, num_classes=n_chars)
    save_batch_to_hdf5(f'processed_data/batch_{i}.h5', X_batch_processed, y_batch_processed)

In [24]:
# Creating the RNN model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

model = Sequential()

# 100 sequences, each of one hot vectors
model.add(SimpleRNN(256, input_shape=(seq_len, n_chars)))
model.add(Dense(n_chars, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [22]:
import tensorflow as tf
list_of_files = tf.io.gfile.glob('processed_data/*.h5')

dataset = tf.data.Dataset.from_tensor_slices(list_of_files)

dataset = dataset.shuffle(buffer_size=100).batch(128).prefetch(tf.data.AUTOTUNE)

In [28]:
def read_h5_file(filepath):
    """Opens an h5 file and yields its X and y data."""
    with h5py.File(filepath, 'r') as hf:
        X_batch = np.array(hf['X'])
        y_batch = np.array(hf['y'])
    return X_batch, y_batch

In [29]:
# --- Training Parameters ---
batch_size = 128 # The size of the batches you want to feed the model

# 1. Create a dataset of the filenames
dataset = tf.data.Dataset.from_tensor_slices(file_list)

# 2. For each filename, read the actual data from the file
#    - from_generator creates a dataset from a Python generator.
#    - It's a clean way to load data that doesn't fit in memory.
dataset = dataset.interleave(
    lambda filepath: tf.data.Dataset.from_generator(
        lambda: read_h5_file(filepath.numpy()),
        output_signature=(
            tf.TensorSpec(shape=(None, seq_len, n_chars), dtype=tf.float32),
            tf.TensorSpec(shape=(None, n_chars), dtype=tf.float32)
        )
    ),
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE
)


# 3. Shuffle, Batch, and Prefetch for optimal performance
#    - .shuffle() randomizes the order of the sequences to improve learning.
#    - .batch() groups the individual sequences into batches.
#    - .prefetch() prepares the next batch while the GPU is busy with the current one.
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)


NameError: name 'file_list' is not defined

In [35]:
import tensorflow as tf
import h5py
import numpy as np
import random

# --- 1. Get the List of Files (Same as before) ---
file_list = tf.io.gfile.glob('processed_data/*.h5')
if not file_list:
    print("Error: No .h5 files found in 'processed_data'.")
else:
    # Pick the first file to inspect
    filepath_to_check = file_list[0]
    print(f"--- Checking file: {filepath_to_check} ---")

    try:
        with h5py.File(filepath_to_check, 'r') as hf:
            # Check if the 'X' and 'y' keys exist
            if 'X' not in hf or 'y' not in hf:
                print("Error: File is missing 'X' or 'y' datasets.")
            else:
                X_data = hf['X']
                y_data = hf['y']
                print(f"Found 'X' dataset with shape: {X_data.shape}")
                print(f"Found 'y' dataset with shape: {y_data.shape}")
                print("\nFile appears to be valid.")

    except Exception as e:
        print(f"\nAn error occurred while reading the file: {e}")

random.shuffle(file_list)

# --- 2. Define the Parsing Function with the Correct Wrapper ---

# This is the required "blueprint" for the function's output
output_types = (tf.float32, tf.float32)

@tf.py_function(Tout=output_types)
def parse_h5_file(filepath):
    """
    Reads the X and y tensors from a single HDF5 file.
    The Tout argument above tells TensorFlow what dtypes to expect.
    """
    # filepath is a tf.Tensor, so we need .numpy() to get the string value
    with h5py.File(filepath.numpy().decode(), 'r') as hf:
        X_batch = np.array(hf['X'], dtype=np.float32)
        y_batch = np.array(hf['y'], dtype=np.float32)
    return X_batch, y_batch

# --- 3. Build the Simplified tf.data Pipeline ---
FULL_BATCH_SIZE = 128 # The full batch size you used in pre-processing

output_signature = (
    tf.TensorSpec(shape=(None, seq_len, n_chars), dtype=tf.float32),
    tf.TensorSpec(shape=(None, n_chars), dtype=tf.float32)
)



# Create the dataset from our Python generator
dataset = tf.data.Dataset.from_generator(
    lambda: h5_file_generator(file_list),
    output_signature=output_signature
)

# Filter out any batches that are not the full size
dataset = dataset.filter(lambda x, y: tf.shape(x)[0] == FULL_BATCH_SIZE)

# Prefetch the next batch
dataset = dataset.prefetch(tf.data.AUTOTUNE)


# --- 4. Train the Model ---
print("Starting training...")
model.fit(
    dataset,
    epochs=20
)

--- Checking file: processed_data\batch_0.h5 ---
Found 'X' dataset with shape: (1024, 100, 58)
Found 'y' dataset with shape: (1024, 58)

File appears to be valid.
Starting training...
Epoch 1/20


ValueError: Attr 'Toutput_types' of 'OptionalFromValue' Op passed list of length 0 less than minimum 1.