In [None]:
!pip install kagglehub


In [3]:
import kagglehub
import pandas as pd
import os

# Fetch the "US Baby Names" dataset through kagglehub; the returned value is
# used below as the local directory that holds the downloaded files.
path = kagglehub.dataset_download("kaggle/us-baby-names")
print("Path to dataset files:", path)

# Load the national-level CSV shipped with the dataset.
file_path = os.path.join(path, "NationalNames.csv")
df = pd.read_csv(file_path)
print("Total nombres:", len(df))

# Distinct lower-cased names, with missing entries removed.
name_column = df['Name']
names = name_column.str.lower().dropna().unique()

# Persist them as plain text, one name per line.
with open("names.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{name}\n" for name in names)

print(f"{len(names)} nombres únicos guardados en names.txt")


Downloading from https://www.kaggle.com/api/v1/datasets/download/kaggle/us-baby-names?dataset_version_number=2...


100%|██████████| 173M/173M [00:05<00:00, 36.1MB/s] 

Extracting files...





Path to dataset files: C:\Users\gerar\.cache\kagglehub\datasets\kaggle\us-baby-names\versions\2
Total nombres: 1825433
93889 nombres únicos guardados en names.txt


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import matplotlib.pyplot as plt

In [5]:
# Read the exported names back (one per line), lower-cased.
with open("names.txt", "r", encoding='utf-8') as f:
    raw_text = f.read()
names = raw_text.lower().splitlines()

# Wrap every name with a start marker '^' and an end marker '$'.
names = [f"^{name}$" for name in names]

# Character vocabulary: every distinct character seen, in sorted order,
# with lookup tables in both directions.
# NOTE(review): the lowest-sorting character receives index 0, which is also
# the default padding value of pad_sequences used later in this notebook --
# confirm this overlap is intended.
chars = sorted({ch for name in names for ch in name})
char2idx = dict(zip(chars, range(len(chars))))
idx2char = dict(enumerate(chars))

vocab_size = len(chars)
print("Número de caracteres únicos:", vocab_size)


Número de caracteres únicos: 28


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode each name as the list of its character indices.
sequences = [[char2idx[c] for c in name] for name in names]

# Build (prefix, next character) training pairs: for every position after the
# first, the input is everything before it and the target is the character at it.
X = []
y = []
for seq in sequences:
    for cut, target in enumerate(seq[1:], start=1):
        X.append(seq[:cut])
        y.append(target)

# Left-pad the prefixes so they all share the same length.
X = pad_sequences(X, padding='pre')

# One-hot encode the targets over the vocabulary.
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)


In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Next-character classifier: padded index sequence -> embedding -> LSTM ->
# softmax over the vocabulary.
input_layer = Input(shape=(X.shape[1],))
embedded = Embedding(input_dim=vocab_size, output_dim=32)(input_layer)
encoded = LSTM(128)(embedded)
output = Dense(vocab_size, activation='softmax')(encoded)

model = Model(inputs=input_layer, outputs=output)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [11]:
# Train on the (prefix, next-character) pairs; `history` keeps the returned
# training record.
history = model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=5,
)

Epoch 1/5
[1m5525/5525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 21ms/step - accuracy: 0.4075 - loss: 1.8438
Epoch 2/5
[1m5525/5525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 21ms/step - accuracy: 0.4071 - loss: 1.8426
Epoch 3/5
[1m5525/5525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 25ms/step - accuracy: 0.4084 - loss: 1.8389
Epoch 4/5
[1m5525/5525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 23ms/step - accuracy: 0.4069 - loss: 1.8435
Epoch 5/5
[1m5525/5525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 23ms/step - accuracy: 0.4086 - loss: 1.8382


In [14]:
def generate_name(model, start_char='^', max_len=20, temperature=1.0):
    """Sample one name from the character-level model, one character at a time.

    Relies on the notebook globals ``char2idx``, ``idx2char``, ``vocab_size``,
    ``X`` (only its second dimension, used as the padded input length) and
    ``pad_sequences``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns, for each
            row, a vector of ``vocab_size`` next-character probabilities.
        start_char: Character that seeds the input sequence. It is fed to the
            model but is not included in the returned name.
        max_len: Maximum number of characters to sample.
        temperature: Sampling temperature (> 0). ``1.0`` samples from the
            model's distribution as-is; lower values sharpen it, higher
            values flatten it.

    Returns:
        The sampled characters joined together and passed through
        ``str.capitalize``. Sampling stops early when ``'$'`` is drawn.

    Raises:
        KeyError: If ``start_char`` is not a key of ``char2idx``.
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_seq = [char2idx[start_char]]
    context_len = X.shape[1]  # loop-invariant: padded length the model expects
    generated = []

    for _ in range(max_len):
        padded = pad_sequences([input_seq], maxlen=context_len, padding='pre')

        # Work in float64 and renormalise: the model emits float32, whose
        # rounding can make np.random.choice reject the vector with
        # "probabilities do not sum to 1".
        probs = np.asarray(model.predict(padded, verbose=0)[0], dtype=np.float64)
        if temperature != 1.0:
            # p ** (1/T), renormalised, is equivalent to softmax(logits / T).
            probs = np.power(probs, 1.0 / temperature)
        probs = probs / probs.sum()

        next_idx = int(np.random.choice(vocab_size, p=probs))
        next_char = idx2char[next_idx]
        if next_char == '$':
            break
        generated.append(next_char)
        input_seq.append(next_idx)

    return "".join(generated).capitalize()


In [15]:
# Draw ten names from the trained model and print each on its own line.
n_samples = 10
for sample_no in range(n_samples):
    generated = generate_name(model)
    print(generated)

Duzzin
Leonise
Aubriannia
Ordonte
Lilly
Maulynn
Berimia
Ayma
Latajah
Getney
