# Baby name generator that considers the baby's sex
##### This notebook generates baby names using LSTM neural networks. The sex of the baby is taken into account by training one model on male names and a separate model on female names.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import random
from tensorflow.keras.models import load_model

In [2]:
# Read one file per year (yob1900.txt ... yob2024.txt) and collect every
# line, lower-cased, into a single flat list of records.
year_range = range(1900, 2025)
names_data = []

for year in year_range:
    file_path = f'Data/names/yob{year}.txt'
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        names_data.extend(file.read().lower().splitlines())

len(names_data)

2097213

In [3]:
# Records containing ',m,' are treated as male; of the remaining records,
# those containing ',f,' are treated as female. The name is the text before
# the first comma of the record.
male_names = [record.partition(',')[0] for record in names_data if ',m,' in record]
female_names = [
    record.partition(',')[0]
    for record in names_data
    if ',m,' not in record and ',f,' in record
]

print(f"Male names count: {len(male_names)}")
print(f"Female names count: {len(female_names)}")

Male names count: 862990
Female names count: 1234223


In [4]:
# Vocabulary: the letters a-z map to ids 1-26 and id 27 marks the end of a name.
letter_tokens = {chr(ord('a') + offset): offset + 1 for offset in range(26)}
letter_tokens['<end>'] = 27

# One more than the number of tokens, because ids start at 1 and id 0 is
# not assigned to any token.
total_letters = 1 + len(letter_tokens)

print(total_letters)

28


In [5]:
# function to convert a word to a sequence of integers + end token
def word_to_sequence(word):
    """Encode a word as a list of token ids terminated by the end token.

    Args:
        word: The text to encode. It is lower-cased first, so upper-case
            letters are encoded like their lower-case form instead of
            being dropped.

    Returns:
        A list with one id from ``letter_tokens`` per recognised character,
        followed by the id of ``'<end>'``. Characters that are not keys of
        ``letter_tokens`` are skipped.
    """
    # Look the end id up instead of repeating the literal 27, so the
    # function stays correct if the vocabulary table changes.
    end_id = letter_tokens['<end>']
    return [letter_tokens[char] for char in word.lower() if char in letter_tokens] + [end_id]

word_to_sequence("anushka")

[1, 14, 21, 19, 8, 11, 1, 27]

In [6]:
def sequence_generator(names_list):
    """Expand every name into all of its token prefixes of length two or more.

    Each name is encoded with ``word_to_sequence`` (letter ids plus the end
    token). For the encoded list ``tokens`` the prefixes ``tokens[:2]``,
    ``tokens[:3]``, ... up to the complete list are emitted.

    Args:
        names_list: Iterable of names to encode.

    Returns:
        A flat list of prefix lists, in the order the names were given.
    """
    prefixes = []
    for name in names_list:
        tokens = word_to_sequence(name)
        prefixes.extend(tokens[:stop] for stop in range(2, len(tokens) + 1))
    return prefixes

input_sequences_male = sequence_generator(male_names)
input_sequences_female = sequence_generator(female_names)

In [7]:
# Spot-check the second generated prefix sequence (a list of token ids).
input_sequences_male[1]

[10, 15, 8]

In [8]:
print("Total male sequences before:", len(input_sequences_male))
print("Total female sequences before:", len(input_sequences_female))

# Draw a random subset of at most max_samples sequences per sex.
# A dedicated, seeded generator makes the subset reproducible across runs,
# and sample() returns a new list, so the full sequence lists are left in
# their original order and this cell can be re-run safely.
max_samples = 1_000_000
subsample_rng = random.Random(42)

input_sequences_male_rand = subsample_rng.sample(
    input_sequences_male, min(max_samples, len(input_sequences_male))
)
input_sequences_female_rand = subsample_rng.sample(
    input_sequences_female, min(max_samples, len(input_sequences_female))
)

print("\nTotal male sequences after:", len(input_sequences_male_rand))
print("Total female sequences after:", len(input_sequences_female_rand))

Total male sequences before: 5171393
Total sequences before: 7775908

Total male sequences after: 1000000
Total female sequences after: 1000000


In [9]:
# pad sequences to have the same length
# The longest prefix in each subset determines the padded width.
max_sequence_len_male = max(len(seq) for seq in input_sequences_male_rand)
max_sequence_len_female = max(len(seq) for seq in input_sequences_female_rand)

# pad_sequences already returns a NumPy array, so wrapping the result in
# np.array(...) would only create a second copy of the data.
# padding='pre' puts the padding at the front, so the last column of every
# row is always the final token of that prefix.
input_sequences_male_final = pad_sequences(
    input_sequences_male_rand, maxlen=max_sequence_len_male, padding='pre'
)
input_sequences_female_final = pad_sequences(
    input_sequences_female_rand, maxlen=max_sequence_len_female, padding='pre'
)

In [10]:
# Report the padded width used for each sex. name_generator defaults to
# max_sequence_len=16, so pass the value shown here explicitly if it differs.
print(f'Max length of male sequences: {max_sequence_len_male}')
print(f'Max length of female sequences: {max_sequence_len_female}')

Max length of male sequences: 16
Max length of female sequences: 16


In [11]:
# create predictors and labels
# Every padded row is split into the model input (all columns except the
# last) and the target (the last column, i.e. the next token to predict).
X_male, y_male = input_sequences_male_final[:, :-1], input_sequences_male_final[:, -1]
X_female, y_female = input_sequences_female_final[:, :-1], input_sequences_female_final[:, -1]

In [12]:
# one-hot encode the labels
# The integer targets are read straight from the last column of the padded
# arrays rather than from y_male / y_female, so re-running this cell never
# one-hot encodes labels that are already one-hot encoded.
# to_categorical returns a NumPy array, so no extra np.array(...) copy is made.
y_male = tf.keras.utils.to_categorical(
    input_sequences_male_final[:, -1], num_classes=total_letters
)
y_female = tf.keras.utils.to_categorical(
    input_sequences_female_final[:, -1], num_classes=total_letters
)

In [13]:
# build the model for male names
# Stack: token embedding -> LSTM -> dropout -> softmax over all token ids.
model_male = Sequential()
model_male.add(Embedding(total_letters, 100))
model_male.add(LSTM(150))
model_male.add(Dropout(0.2))
model_male.add(Dense(total_letters, activation='softmax'))

model_male.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Input width is one less than the padded length because the last column of
# each padded row is used as the label, not as input.
male_input_width = max_sequence_len_male - 1
model_male.build(input_shape=(None, male_input_width))
model_male.summary()

2025-12-28 21:44:36.003494: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-12-28 21:44:36.003532: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-12-28 21:44:36.003567: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.92 GB
2025-12-28 21:44:36.003589: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-12-28 21:44:36.003600: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [14]:
# build the model for female names
# Stack: token embedding -> LSTM -> dropout -> softmax over all token ids.
model_female = Sequential()
model_female.add(Embedding(total_letters, 100))
model_female.add(LSTM(150))
model_female.add(Dropout(0.2))
model_female.add(Dense(total_letters, activation='softmax'))

model_female.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Input width is one less than the padded length because the last column of
# each padded row is used as the label, not as input.
female_input_width = max_sequence_len_female - 1
model_female.build(input_shape=(None, female_input_width))
model_female.summary()

In [15]:
# train the male model
# Stop once the validation loss has not improved for 3 consecutive epochs and
# roll back to the best weights seen, so `epochs` is an upper bound rather
# than a fixed budget and the saved model is the best one, not the last one.
early_stop_male = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
history_male = model_male.fit(
    X_male,
    y_male,
    epochs=40,
    batch_size=256,
    validation_split=0.1,
    callbacks=[early_stop_male],
)
model_male.save("baby_name_male_lstm_v1.keras")

Epoch 1/40


2025-12-28 21:44:36.800420: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 44ms/step - accuracy: 0.3326 - loss: 2.1973 - val_accuracy: 0.4325 - val_loss: 1.7914
Epoch 2/40
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 44ms/step - accuracy: 0.4329 - loss: 1.7914 - val_accuracy: 0.4731 - val_loss: 1.6506
Epoch 3/40
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 43ms/step - accuracy: 0.4627 - loss: 1.6844 - val_accuracy: 0.4929 - val_loss: 1.5801
Epoch 4/40
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 44ms/step - accuracy: 0.4803 - loss: 1.6208 - val_accuracy: 0.5050 - val_loss: 1.5401
Epoch 5/40
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 45ms/step - accuracy: 0.4908 - loss: 1.5820 - val_accuracy: 0.5133 - val_loss: 1.5096
Epoch 6/40
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 44ms/step - accuracy: 0.4992 - loss: 1.5533 - val_accuracy: 0.5184 - val_loss: 1.4890
Epoch 7/4

In [16]:
# train the female model
# Stop once the validation loss has not improved for 3 consecutive epochs and
# roll back to the best weights seen, so `epochs` is an upper bound rather
# than a fixed budget and the saved model is the best one, not the last one.
early_stop_female = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
history_female = model_female.fit(
    X_female,
    y_female,
    epochs=50,
    batch_size=256,
    validation_split=0.1,
    callbacks=[early_stop_female],
)
model_female.save("baby_name_female_lstm_v1.keras")

Epoch 1/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 46ms/step - accuracy: 0.3478 - loss: 2.0517 - val_accuracy: 0.4472 - val_loss: 1.7013
Epoch 2/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 47ms/step - accuracy: 0.4439 - loss: 1.6992 - val_accuracy: 0.4780 - val_loss: 1.5948
Epoch 3/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 45ms/step - accuracy: 0.4699 - loss: 1.6131 - val_accuracy: 0.4909 - val_loss: 1.5460
Epoch 4/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 45ms/step - accuracy: 0.4812 - loss: 1.5689 - val_accuracy: 0.5003 - val_loss: 1.5140
Epoch 5/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 44ms/step - accuracy: 0.4900 - loss: 1.5405 - val_accuracy: 0.5048 - val_loss: 1.4953
Epoch 6/50
[1m3516/3516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 44ms/step - accuracy: 0.4961 - loss: 1.5192 - val_accuracy: 0.5058 - val_loss: 1.482

In [17]:
# Required globals (same as before – keep these defined once in the notebook)
PAD_ID = 0
END_ID = letter_tokens['<end>']
index_to_char = {idx: ch for ch, idx in letter_tokens.items()}

# Updated function
def name_generator(first_letters, min_length, max_length, model, max_sequence_len=16):
    """Extend a prefix into a name, one greedily predicted character at a time.

    Args:
        first_letters: Starting characters of the name; lower-cased before use.
            Characters that ``word_to_sequence`` does not recognise stay in
            the returned name but are not part of the model input.
        min_length: While the name is shorter than this, the padding and end
            tokens are not allowed as the next prediction.
        max_length: Generation stops once the name has this many characters.
        model: Object whose ``predict`` returns one score per token id for a
            batch of padded id sequences.
        max_sequence_len: Padded sequence length the model was built with;
            the model input is ``max_sequence_len - 1`` ids wide.

    Returns:
        The generated name, capitalized. If ``first_letters`` already has
        ``max_length`` or more characters it is returned capitalized as is.
    """
    name = first_letters.lower()

    while len(name) < max_length:
        # Convert name into token ids (drop the auto-added <end>)
        token_list = word_to_sequence(name)[:-1]

        # Pad for model input
        token_list = pad_sequences(
            [token_list],
            maxlen=max_sequence_len - 1,
            padding='pre',
            value=PAD_ID,
        )

        # Predict next character probs
        preds = model.predict(token_list, verbose=0)[0]

        if len(name) < min_length:
            # Too short to stop: rule out the padding and end ids on a copy
            # of the scores, then take the best remaining real character.
            candidates = np.array(preds, dtype=float)
            candidates[[PAD_ID, END_ID]] = -np.inf
            next_id = int(np.argmax(candidates))
        else:
            # Normal prediction behavior: the name ends as soon as the
            # padding or end id is the most likely next token.
            next_id = int(np.argmax(preds))
            if next_id in (PAD_ID, END_ID):
                break

        # Append predicted character
        name += index_to_char[next_id]

    return name.capitalize()

In [18]:
# Sample generations; the arguments are (first_letters, min_length, max_length, model).
print(name_generator("jo", 4, 10, model_male))
print(name_generator("an", 5, 10, model_male))
print(name_generator("el", 4, 8, model_female))

Josephus
Antonio
Elisabet
