In [1]:
import pandas as pd

# Fixed seed so the shuffle below is identical on every Restart & Run All.
# The row order matters: the training cell uses validation_split, which holds
# out rows from the end of the arrays built from this frame.
SHUFFLE_SEED = 42


def load_names(path):
    """Read a one-name-per-line text file and return the names lower-cased.

    keep_default_na=False stops pandas from converting literal names such as
    "nan" or "null" into missing values; a NaN entry would break the later
    "".join(...) used to build the character vocabulary.
    """
    return pd.read_csv(path, header=None, keep_default_na=False)[0].str.lower()


male_names = load_names('data/names/male.txt')
female_names = load_names('data/names/female.txt')

# Gender encoding used throughout the notebook: 0 = male, 1 = female
df_m = pd.DataFrame({'name': male_names, 'gender': 0})
df_f = pd.DataFrame({'name': female_names, 'gender': 1})

# Concatenate, shuffle deterministically, and renumber the rows
df = (
    pd.concat([df_m, df_f])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
print(df.head())

       name  gender
0  petronia       1
1    bambie       1
2      hali       1
3      cain       0
4     hanni       1


In [38]:
import numpy as np

# Every name in the dataset, as a plain Python list
all_names = df['name'].tolist()

# Special tokens: padding, sequence delimiters, and one conditioning token per gender
PAD = "<PAD>"
START = "<S>"
END = "<E>"
GENDER_M = "<M>"
GENDER_F = "<F>"

# Vocabulary layout: the special tokens come first (so PAD is index 0),
# followed by every distinct character found in the names, in sorted order.
chars = sorted({ch for name in all_names for ch in name})
vocab = [PAD, START, END, GENDER_M, GENDER_F, *chars]

# Forward (token -> index) and reverse (index -> token) lookup tables
idx2char = dict(enumerate(vocab))
char2idx = {token: index for index, token in idx2char.items()}

vocab_size = len(vocab)
print(f"🧠 Vocabulary size: {vocab_size}")
print(f"🔤 Example char2idx: {dict(list(char2idx.items())[:10])}")


🧠 Vocabulary size: 34
🔤 Example char2idx: {'<PAD>': 0, '<S>': 1, '<E>': 2, '<M>': 3, '<F>': 4, ' ': 5, "'": 6, '-': 7, 'a': 8, 'b': 9}


In [39]:
def encode_name(name, gender_label):
    """
    Turn a name into the list of vocabulary indices used for training.

    name: string, e.g. "maria"
    gender_label: 0 selects the male token; any other value selects the female token

    Returns the indices for: gender token, START, each character of `name`, END.
    Raises KeyError if a character is not present in char2idx.
    """
    if gender_label == 0:
        prefix = GENDER_M
    else:
        prefix = GENDER_F
    tokens = [prefix, START, *name, END]
    return [char2idx[token] for token in tokens]

# Encode every (name, gender) row of the dataframe
encoded = list(map(encode_name, df['name'], df['gender']))

# Next-token pairs: the input drops the final token, the target drops the first
input_seqs, target_seqs = [], []
for seq in encoded:
    input_seqs.append(seq[:-1])
    target_seqs.append(seq[1:])

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Longest encoded sequence; inputs and targets are one token shorter than that
max_len = max(map(len, encoded))
pad_options = dict(maxlen=max_len - 1, padding='post', value=char2idx[PAD])

# Right-pad with the PAD index and store the results as numpy arrays
X = np.array(pad_sequences(input_seqs, **pad_options))
y = np.array(pad_sequences(target_seqs, **pad_options))

print(f"✅ Input shape: {X.shape}")
print(f"🎯 Target shape: {y.shape}")


✅ Input shape: (7944, 17)
🎯 Target shape: (7944, 17)


In [40]:
# Helper function to turn a sequence of indices back into a string
def decode_sequence(seq):
    """Join the tokens for `seq`, skipping PAD entries; unknown indices render as '?'."""
    tokens = []
    for idx in seq:
        if idx == char2idx[PAD]:
            continue  # padding carries no text
        tokens.append(idx2char.get(idx, '?'))
    return ''.join(tokens)

# Print the first five training pairs: the model input and its shifted target
for i in range(5):
    input_seq, target_seq = decode_sequence(X[i]), decode_sequence(y[i])
    print(f"{i+1}. Input : {input_seq}")
    print(f"   Target: {target_seq}\n")

1. Input : <F><S>petronia
   Target: <S>petronia<E>

2. Input : <F><S>bambie
   Target: <S>bambie<E>

3. Input : <F><S>hali
   Target: <S>hali<E>

4. Input : <M><S>cain
   Target: <S>cain<E>

5. Input : <F><S>hanni
   Target: <S>hanni<E>



In [41]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed

# Width of the learned character embeddings. No visible cell assigns
# `embedding_dim`, so this cell raised NameError on a fresh kernel.
# NOTE(review): 64 is a placeholder — the value used for the recorded run is
# not shown anywhere in the notebook; adjust if the original is known.
embedding_dim = 64

# Define model: embedding -> LSTM emitting one state per timestep ->
# per-timestep softmax over the vocabulary (next-token prediction).
model = Sequential([
    # Explicit input width taken from the training matrix. The previous
    # `input_length=max_len` did not match X (which is max_len-1 wide) and the
    # argument is deprecated in Keras 3.
    tf.keras.Input(shape=(X.shape[1],)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(units=128, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

# Compile
# NOTE(review): no masking is configured, so PAD positions count towards the
# loss and the accuracy metric — consider mask_zero=True on the Embedding.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [42]:
# Callbacks that shorten training: stop after 3 epochs without improvement in
# the monitored metric (restoring the best weights), and reduce the learning
# rate after 2 such epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(patience=2)
callbacks = [early_stopping, reduce_lr]

# Train for at most 30 epochs, holding out 10% of the rows for validation
fit_options = dict(
    batch_size=64,
    epochs=30,
    validation_split=0.1,
    callbacks=callbacks,
)
model.fit(X, y, **fit_options)

Epoch 1/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step - accuracy: 0.5125 - loss: 2.1321 - val_accuracy: 0.6347 - val_loss: 1.3484 - learning_rate: 0.0010
Epoch 2/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.6669 - loss: 1.2913 - val_accuracy: 0.6894 - val_loss: 1.0770 - learning_rate: 0.0010
Epoch 3/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.6917 - loss: 1.0458 - val_accuracy: 0.6944 - val_loss: 0.9951 - learning_rate: 0.0010
Epoch 4/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.6954 - loss: 0.9865 - val_accuracy: 0.6977 - val_loss: 0.9685 - learning_rate: 0.0010
Epoch 5/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.7009 - loss: 0.9585 - val_accuracy: 0.7023 - val_loss: 0.9440 - learning_rate: 0.0010
Epoch 6/30
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x277f8bbafe0>

In [43]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sampling helper
def sample_from_probs(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    preds: 1-D sequence of probabilities (e.g. one softmax output row).
    temperature: values below 1 sharpen the distribution, values above 1
        flatten it; 0 means greedy decoding (the arg-max index is returned
        without sampling).

    Returns the chosen index as a Python int.
    """
    preds = np.asarray(preds).astype("float64")
    if temperature == 0:
        # Dividing by zero would produce inf/NaN weights; the greedy pick is
        # the limiting case of an ever-smaller temperature.
        return int(np.argmax(preds))

    # The epsilon keeps log() finite for zero-probability entries.
    logits = np.log(preds + 1e-10) / temperature
    # Shift by the maximum before exponentiating. The normalised result is
    # mathematically unchanged, but very small temperatures can no longer
    # underflow every weight to zero (which yielded NaN probabilities).
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return int(np.random.choice(len(probs), p=probs))

# Generate a single name, one character at a time
def generate_name(model, char2idx, idx2char,
                  gender_label, temperature=1.0, max_len=None):
    """Sample one name from the trained model, conditioned on gender.

    model: trained next-token model; model.predict must return an array
        indexable as [batch, timestep, vocab].
    char2idx / idx2char: token <-> index lookup tables used at training time.
    gender_label: 0 selects the male token; any other value selects the
        female token.
    temperature: forwarded to sample_from_probs.
    max_len: width the running sequence is padded to; defaults to the width
        of the global training matrix X.

    Returns the generated characters joined and passed through
    str.capitalize().
    """
    if max_len is None:
        max_len = X.shape[1]

    pad_idx = char2idx[PAD]
    # Tokens that must never appear inside a name. Previously a sampled
    # PAD/START/gender token was appended verbatim (e.g. "<PAD>") to the
    # output string and fed back into the model.
    blocked = {pad_idx, char2idx[START], char2idx[GENDER_M], char2idx[GENDER_F]}

    # Start sequence with gender token + START
    gender_token = GENDER_M if gender_label == 0 else GENDER_F
    input_seq = [char2idx[gender_token], char2idx[START]]
    letters = []

    for _ in range(max_len - 2):  # the sequence already holds 2 tokens
        padded = pad_sequences([input_seq],
                               maxlen=max_len,
                               padding='post',
                               value=pad_idx)
        # Distribution for the token that follows the last real (non-pad) position
        preds = np.asarray(model.predict(padded, verbose=0)[0, len(input_seq) - 1])

        # Sample only among characters and END; the sampler renormalises the
        # remaining probabilities itself.
        allowed = [i for i in range(len(preds)) if i not in blocked]
        next_idx = allowed[sample_from_probs(preds[allowed], temperature)]
        next_char = idx2char[next_idx]

        if next_char == END:
            break
        letters.append(next_char)
        input_seq.append(next_idx)

    return ''.join(letters).capitalize()

def generate_names(model, char2idx, idx2char, gender_label, n=10, temperature=1.0):
    """Return a list of `n` names, each produced by a separate generate_name call."""
    names = []
    for _ in range(n):
        names.append(
            generate_name(model, char2idx, idx2char, gender_label, temperature)
        )
    return names


In [44]:
# Draw five names per gender using the same sampling settings
sampling = dict(n=5, temperature=0.8)

# gender_label=0 -> male
male_samples = generate_names(model, char2idx, idx2char, gender_label=0, **sampling)
print("Male:", male_samples)

# gender_label=1 -> female
female_samples = generate_names(model, char2idx, idx2char, gender_label=1, **sampling)
print("Female:", female_samples)


Male: ['Riddy', 'Tabrey', 'Gordon', 'Lean', 'Abnaviah']
Female: ['Sandre', 'Bettee', 'Kaila', 'Andrise', 'Susa']


In [45]:
# 📦 Save the trained model
import os

# Create the output folder first: writing the file fails when "models/"
# does not exist yet (e.g. on a fresh checkout).
os.makedirs("models", exist_ok=True)
model.save("models/namegen_lstm.h5")




In [46]:
import os
import pickle

# open(..., "wb") raises FileNotFoundError when the folder is missing,
# so make sure it exists before writing.
os.makedirs("models", exist_ok=True)

# Persist both lookup tables so that generation code loading the saved model
# can reproduce the exact token <-> index mapping used for training.
with open("models/char_mappings.pkl", "wb") as f:
    pickle.dump({'char2idx': char2idx, 'idx2char': idx2char}, f)
