In [6]:
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

In [7]:
# Load the names dataset from a CSV expected in the working directory.
# Later cells read its "Name" and "Gender" columns.
data = pd.read_csv("NationalNames.csv")

In [8]:
# Preview the first rows to check which columns were loaded
# (the output lists Id, Name, Year, Gender and Count).
data.head()

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [9]:
# Model inputs are the raw name strings; targets are the raw gender labels.
# Both are pulled out of the frame as plain NumPy arrays.
X = data["Name"].values
y = data["Gender"].values

In [10]:
# Turn the gender labels into integer class ids for the binary classifier.
# LabelEncoder numbers classes in sorted order, so presumably 'F' -> 0 and
# 'M' -> 1 -- confirm with gender_encoder.classes_ before interpreting outputs.
gender_encoder = LabelEncoder()
y = gender_encoder.fit_transform(y)

In [11]:
# Character vocabulary: each lowercase ASCII letter gets an integer id.
# Ids start at 3 because the lower ids are reserved by the encoding used in
# this notebook: 0 is the padding value, 1 marks the start of a name and
# 2 marks its end. Letters therefore occupy ids 3 ('a') through 28 ('z').
vocab = {
    letter: token_id
    for token_id, letter in enumerate(string.ascii_lowercase, start=3)
}


In [12]:
# Display the letter -> id mapping to verify it runs from 'a': 3 to 'z': 28.
vocab

{'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [18]:
# Peek at the raw name strings (NumPy abbreviates the repr of a long array).
X

array(['Mary', 'Anna', 'Emma', ..., 'Zymiere', 'Zyran', 'Zyrin'],
      dtype=object)

In [22]:
# Encode every name as a list of token ids: 1 (start marker), then one id per
# lowercased character looked up in `vocab`, then 2 (end marker).
# A character that is not a key of `vocab` raises KeyError.
X_mod = [
    [1] + [vocab[letter] for letter in raw_name.lower()] + [2]
    for raw_name in X
]

In [24]:
# Hold out 33% of the encoded rows for evaluation; the fixed random_state
# makes the shuffle reproducible across runs.
# NOTE(review): the CSV has a Year column, so the same name presumably occurs
# in many rows. A row-level split would then put identical names in both
# splits -- confirm, and consider de-duplicating names before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X_mod, y, test_size=0.33, random_state=42
)

In [25]:
# Inspect the first encoded training sample (start marker 1 ... end marker 2).
X_train[0]

[1, 25, 11, 14, 14, 27, 2]

In [26]:
# Inspect the first encoded held-out sample (start marker 1 ... end marker 2).
X_test[0]

[1, 22, 20, 3, 16, 2]

In [27]:
# Bring every encoded name to a fixed length of 15 ids so the rows stack into a
# 2-D array. With pad_sequences' defaults, shorter rows are filled with 0 at
# the front; rows longer than 15 ids are cut from the front as well, which
# would drop the start marker for names longer than 13 letters.
X_train_padded, X_test_padded = (
    sequence.pad_sequences(encoded_split, maxlen=15)
    for encoded_split in (X_train, X_test)
)

In [28]:
# Check the first padded training row: zeros fill the front up to length 15.
X_train_padded[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  1, 25, 11, 14, 14, 27,  2],
      dtype=int32)

In [29]:
# Check the first padded held-out row: zeros fill the front up to length 15.
X_test_padded[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 22, 20,  3, 16,  2],
      dtype=int32)

In [30]:
# Network definition (Keras functional API).
# Input: one row of 15 token ids per name (the padded length used above).
in_layer = Input(shape=(15,))

# Layers, created in the order they are applied:
#   - Embedding: ids 0..28 (padding, start, end, 26 letters) -> 3-d vectors
#   - SimpleRNN: reads the 15 vectors and returns a single 60-d state
#   - Dense + sigmoid: one probability for the class encoded as 1
char_embedding = Embedding(input_dim=29, output_dim=3)
recurrent = SimpleRNN(units=60)
classifier = Dense(1, activation="sigmoid")

# Wire the layers together.
embedding = char_embedding(in_layer)
rnn_layer = recurrent(embedding)
out_layer = classifier(rnn_layer)

In [31]:
# Wrap the layer graph from in_layer to out_layer into a trainable Keras Model.
model = Model(inputs=in_layer, outputs=out_layer)


In [32]:
# Configure training: binary cross-entropy matches the single sigmoid output,
# RMSprop is the optimizer, and accuracy is reported alongside the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)


In [33]:
# Train for two passes over the training split using large batches.
# The held-out split is supplied as validation data so each epoch also reports
# loss and accuracy on rows that were not used for the weight updates;
# validation data does not influence training itself.
# The returned History object is kept so the per-epoch metrics
# (history.history) can be inspected or plotted afterwards.
history = model.fit(
    X_train_padded,
    y_train,
    batch_size=10000,
    epochs=2,
    validation_data=(X_test_padded, y_test),
)

Train on 1223040 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f899c7946d0>

In [34]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 15, 3)             87        
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 60)                3840      
_________________________________________________________________
dense (Dense)                (None, 1)                 61        
Total params: 3,988
Trainable params: 3,988
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Score a single, hand-picked name with the trained model.
name = "scarlet"

# Encode it exactly like the training rows: start marker 1, one id per
# lowercased character from `vocab`, end marker 2. A character that is not a
# key of `vocab` raises KeyError.
gen = [1] + [vocab[ch] for ch in name.lower()] + [2]

# pad_sequences expects a batch, hence the one-element list; the result is a
# single row of 15 ids, matching the model's input length.
data_padded = sequence.pad_sequences([gen], maxlen=15)

# Sigmoid output: the predicted probability of the class encoded as 1.
model.predict(data_padded)

array([[0.5524772]], dtype=float32)