In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import random
from tensorflow.keras.models import load_model

In [None]:
# read the data file and normalise it to lower case in a single step
with open("Data//names.txt", mode='r', encoding='utf-8') as names_file:
    mytext = names_file.read().lower()

In [None]:
# what the name data looks like in raw format: the first 100 characters of the
# lower-cased text, one name per line
mytext[:100]

'mary\nannie\nanna\nmargaret\nhelen\nelsie\nlucy\ndorothy\nmary\nmargaret\nruth\nannie\nelizabeth\nhelen\nmary\nelsi'

In [None]:
# create a mapping of letters to integers: 'a' -> 1 ... 'z' -> 26, followed by
# the end-of-name marker; ids start at 1, so id 0 is not assigned to any token
letter_tokens = {chr(ord('a') + offset): offset + 1 for offset in range(26)}
letter_tokens['<end>'] = len(letter_tokens) + 1

# vocabulary size: every token id plus the unassigned id 0
total_letters = len(letter_tokens) + 1
print(total_letters)

28


In [None]:
# function to convert a word to a sequence of integers + end token
def word_to_sequence(word):
    """Encode ``word`` as a list of token ids terminated by the end token.

    Characters that have no entry in ``letter_tokens`` (spaces, digits,
    punctuation, ...) are skipped silently.

    Args:
        word: the text to encode, one character per token.

    Returns:
        A list of integer ids, always ending with ``letter_tokens['<end>']``;
        an empty ``word`` yields a list holding only the end token.
    """
    # look the end id up instead of hard-coding 27 so it cannot drift away
    # from the mapping if the vocabulary changes
    end_id = letter_tokens['<end>']
    return [letter_tokens[char] for char in word if char in letter_tokens] + [end_id]

# quick check: encode a sample name (the trailing 27 is the <end> token)
word_to_sequence("anushka")

[1, 14, 21, 19, 8, 11, 1, 27]

In [None]:
# create input sequences and corresponding labels:
# for every name keep each prefix of length >= 2 of its encoded form
# (the <end> token included), e.g. [m, a], [m, a, r], [m, a, r, y], ...
my_input_sequences = []
for name in mytext.split('\n'):
    encoded = word_to_sequence(name)
    my_input_sequences.extend(
        encoded[:stop] for stop in range(2, len(encoded) + 1)
    )

In [None]:
# example: the second generated prefix sequence (a list of token ids)
my_input_sequences[1]

[13, 1, 18]

In [None]:
print("Total sequences before:", len(my_input_sequences))

# Shuffle and keep only a subset of sequences.
# A dedicated, seeded generator makes the selected subset identical on every
# Restart & Run All without touching the global `random` state.
# NOTE: the shuffle is in place, so re-running this cell on its own reshuffles
# the already shuffled list; re-run the sequence-building cell first.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(my_input_sequences)
max_samples = 1_000_000
my_input_sequences_rand = my_input_sequences[:max_samples]

print("Total sequences after:", len(my_input_sequences_rand))

Total sequences before: 6246979
Total sequences after: 1000000


In [None]:
# pad sequences to have the same length: zeros are added on the left until
# every sequence is as long as the longest one in the sampled subset
max_sequence_len = max(map(len, my_input_sequences_rand))
input_sequences = np.array(
    pad_sequences(
        my_input_sequences_rand,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [12]:
# report the padded sequence length (the longest sequence in the sampled subset)
print(f'Max length of sequences: {max_sequence_len}')

Max length of sequences: 16


In [None]:
# example: one padded row, with zeros on the left and token ids on the right
input_sequences[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2, 18, 25,  1],
      dtype=int32)

In [None]:
# create predictors and label: every token but the last is the model input,
# the last token of each row is the target to predict
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [15]:
# predictors for row 1: the padded sequence without its final token
X[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2, 18, 25],
      dtype=int32)

In [16]:
# label for row 1: the final token id of the padded sequence
y[1]

1

In [None]:
# one-hot encode the labels
# The integer labels are read straight from the last column of input_sequences
# rather than from y, so re-running this cell never one-hot encodes an array
# that is already one-hot encoded. to_categorical already returns a NumPy
# array, so no extra np.array copy is made.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_letters)
y[1]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# build the model: embedding -> LSTM -> dropout -> softmax over the vocabulary
model = Sequential([
    Embedding(total_letters, 100),
    LSTM(150),
    Dropout(0.2),
    Dense(total_letters, activation='softmax'),
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# the input width is one less than the padded length because the last token
# of every sequence is split off as the label
model.build(input_shape=(None, max_sequence_len - 1))
model.summary()

2025-12-28 17:35:34.751303: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-12-28 17:35:34.751331: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-12-28 17:35:34.751337: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.92 GB
2025-12-28 17:35:34.751357: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-12-28 17:35:34.751369: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# train the model
# epochs=50 is an upper bound: training stops once val_loss has failed to
# improve for 3 consecutive epochs, and the weights from the best epoch are
# restored, so the model is not left in a later, worse state.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    X,
    y,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/50


2025-12-28 17:35:43.280849: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 44ms/step - accuracy: 0.3634 - loss: 2.0167 - val_accuracy: 0.5528 - val_loss: 1.3838
Epoch 2/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 43ms/step - accuracy: 0.5437 - loss: 1.3979 - val_accuracy: 0.5992 - val_loss: 1.2191
Epoch 3/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 44ms/step - accuracy: 0.5762 - loss: 1.2790 - val_accuracy: 0.6103 - val_loss: 1.1654
Epoch 4/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 44ms/step - accuracy: 0.5888 - loss: 1.2268 - val_accuracy: 0.6145 - val_loss: 1.1362
Epoch 5/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 44ms/step - accuracy: 0.5954 - loss: 1.1979 - val_accuracy: 0.6212 - val_loss: 1.1154
Epoch 6/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 44ms/step - accuracy: 0.6002 - loss: 1.1780 - val_accuracy: 0.6213 - val_loss: 1.1035
Epoch 7/5

In [None]:
# function to generate names
PAD_ID = 0
END_ID = letter_tokens['<end>']
index_to_char = {token_id: letter for letter, token_id in letter_tokens.items()}

def name_generator(first_letters, min_length=4, max_length=12):
    """Extend ``first_letters`` one predicted character at a time.

    Each step feeds the current prefix to the trained ``model`` and appends
    the most likely next letter. Generation is greedy, so the same prefix
    always produces the same name for a given set of model weights.

    Args:
        first_letters: starting characters of the name (any case).
        min_length: while the name is shorter than this, the padding and
            end tokens are not accepted as the next token; the best-scoring
            real letter is used instead.
        max_length: generation stops once the name has this many characters.

    Returns:
        The generated name, capitalised via ``str.capitalize``.
    """
    name = first_letters.lower()
    stop_ids = (PAD_ID, END_ID)

    while len(name) < max_length:
        # encode the current prefix and drop the trailing <end> token
        prefix_ids = word_to_sequence(name)[:-1]
        model_input = pad_sequences(
            [prefix_ids],
            maxlen=max_sequence_len - 1,
            padding='pre',
            value=PAD_ID,
        )
        preds = model.predict(model_input, verbose=0)[0]

        if len(name) >= min_length:
            # long enough: take the top prediction and finish if it is a
            # padding or end token
            next_id = int(np.argmax(preds))
            if next_id in stop_ids:
                break
        else:
            # still too short: walk the predictions from most to least likely
            # and take the first one that is an actual letter
            ranked_ids = np.argsort(preds)[::-1]
            next_id = next(
                (int(candidate) for candidate in ranked_ids if candidate not in stop_ids),
                None,
            )
            if next_id is None:
                break

        name += index_to_char[next_id]

    return name.capitalize()

In [48]:
# example: complete the prefix 'Sk', aiming for a name of 5 to 8 letters
name_generator('Sk', min_length=5, max_length=8)

'Skylar'

In [None]:
# save the model to a single .keras file in the current working directory
MODEL_PATH = "baby_name_lstm_v1.keras"

# writes the model held in `model` to MODEL_PATH
model.save(MODEL_PATH)
print("Model saved to:", MODEL_PATH)

Model saved to: baby_name_lstm_v1.keras
