In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example training data
words = [
    "mist", "recycle", "initiative", "texture", "retired", "retirement", "woman", "medicine", "ambition", "shark",
    "consulation", "flawed", "equal", "expenditure", "practice", "dinner", "tendency", "thrust", "taste", "blow",
    "disappointment", "arrangement", "seminar", "penetrate", "battle", "harmony", "import", "captain", "prove", "chalk",
    "window", "knowledge", "orange", "beautiful", "computer", "university", "language", "river", "mountain", "forest",
    "music", "painting", "village", "country", "freedom", "teacher", "student", "pencil", "garden", "library",
    "engine", "science", "future", "memory", "danger", "courage", "ocean", "planet", "galaxy", "universe",
    "holiday", "season", "family", "friendship", "happiness", "darkness", "lightning", "shadow", "morning", "evening",
    "midnight", "thunder", "rainbow", "sunset", "sunrise", "calendar", "history", "government", "president", "village",
    "castle", "palace", "kingdom", "fortune", "wisdom", "justice", "reality", "illusion", "mystery", "puzzle",
    "poetry", "fiction", "character", "dialogue", "chapter", "sentence", "paragraph", "dictionary", "philosophy", "biology"
]

syllables = [
    1, 3, 5, 2, 3, 4, 2, 3, 3, 1,
    4, 1, 2, 4, 2, 2, 3, 1, 1, 1,
    4, 3, 3, 3, 2, 3, 2, 2, 1, 1,
    2, 2, 2, 3, 3, 5, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
    2, 2, 2, 3, 2, 2, 2, 2, 3, 3,
    3, 2, 3, 2, 3, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 3, 3, 3, 3, 2,
    2, 2, 2, 2, 2, 2, 4, 3, 3, 2,
    3, 2, 3, 3, 2, 2, 3, 4, 4, 4
]

# Tokenize words at character level
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(words)
sequences = tokenizer.texts_to_sequences(words)

# Pad sequences
maxlen = max(len(w) for w in words)
X = pad_sequences(sequences, maxlen=maxlen, padding='post')

# Define max syllable count (important for classification range)
max_syllables = max(syllables)

# Convert to categorical (one-hot labels)
y = tf.keras.utils.to_categorical(syllables, num_classes=max_syllables + 1)

# Define model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1,
              output_dim=32,
              input_length=maxlen),
    LSTM(64, return_sequences=True),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(max_syllables + 1, activation='softmax')  # classification output
])

# Compile model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train model
model.fit(X, y, epochs=200, verbose=1)

2025-09-09 15:08:37.169008: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-09 15:08:42.648504: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-09 15:08:42.652367: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7726a7067760>

In [2]:
# Test prediction
test_words = ["goat", "hardware", "benefit", "tree", "unit", "sight", "touch", "voter", "network", "party"]
test_seq = tokenizer.texts_to_sequences(test_words)
test_X = pad_sequences(test_seq, maxlen=maxlen, padding='post')
preds = model.predict(test_X)

# Convert softmax to integer syllable predictions
pred_classes = np.argmax(preds, axis=1)

for w, p in zip(test_words, pred_classes):
    print(f"{w} → predicted syllables: {p}")

goat → predicted syllables: 2
hardware → predicted syllables: 2
benefit → predicted syllables: 2
tree → predicted syllables: 2
unit → predicted syllables: 2
sight → predicted syllables: 2
touch → predicted syllables: 1
voter → predicted syllables: 1
network → predicted syllables: 2
party → predicted syllables: 2
