In [1]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
import pandas as pd

# Load your dataset
data = pd.read_csv("name_gender_dataset.csv")

# Use relevant columns
data = data[["Name", "Gender"]]

# Encode Gender (Female: 0, Male: 1)
label_encoder = LabelEncoder()
data["Gender"] = label_encoder.fit_transform(data["Gender"])  # Female -> 0, Male -> 1

# Tokenize Names at character level
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(data["Name"])
sequences = tokenizer.texts_to_sequences(data["Name"])

# Pad sequences to ensure uniform length
max_length = max(len(name) for name in data["Name"])  # Maximum name length in the dataset
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Save tokenizer for later use in the CLI app
with open("tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)

# Split the data into training, validation, and test sets (80-10-10 split)
X_train, X_temp, y_train, y_temp = train_test_split(
    padded_sequences, data["Gender"], test_size=0.2, random_state=42, stratify=data["Gender"]
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Tokenizer saved and data preprocessing complete!")
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")


Tokenizer saved and data preprocessing complete!
Training set: (117815, 25), Validation set: (14727, 25), Test set: (14727, 25)


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1  # Total unique characters + 1 for padding
embedding_dim = 16  # Size of the character embeddings
lstm_units = 32  # Number of units in the LSTM layer

# Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    LSTM(units=lstm_units, return_sequences=False),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification (0: Female, 1: Male)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,  # Number of epochs
    batch_size=32  # Size of training batches
)

# Save the model for use in the CLI app
model.save("name_gender_model.h5")

print("Model training complete and saved!")




Epoch 1/10
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 14ms/step - accuracy: 0.7005 - loss: 0.5688 - val_accuracy: 0.7788 - val_loss: 0.4806
Epoch 2/10
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 17ms/step - accuracy: 0.7770 - loss: 0.4789 - val_accuracy: 0.7849 - val_loss: 0.4662
Epoch 3/10
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 17ms/step - accuracy: 0.7794 - loss: 0.4735 - val_accuracy: 0.7856 - val_loss: 0.4646
Epoch 4/10
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 15ms/step - accuracy: 0.7831 - loss: 0.4654 - val_accuracy: 0.7900 - val_loss: 0.4596
Epoch 5/10
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 16ms/step - accuracy: 0.7849 - loss: 0.4620 - val_accuracy: 0.7919 - val_loss: 0.4521
Epoch 6/10
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 14ms/step - accuracy: 0.7883 - loss: 0.4546 - val_accuracy: 0.7955 - val_loss: 0.4452
Epoc



Model training complete and saved!
