In [1]:
import string
import pandas as pd
import numpy as np
from pprint import pprint

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
# Vocabulary: every character of string.printable except the lowercase ASCII
# letters, kept in string.printable order.
chars_allowed = list(
    filter(lambda ch: ch not in string.ascii_lowercase, string.printable)
)
# decoder maps index -> character; encoder is its inverse (character -> index).
decoder = dict(enumerate(chars_allowed))
encoder = {ch: idx for idx, ch in decoder.items()}
# Width of a one-hot character vector (number of distinct allowed characters).
enc_space = len(decoder)

In [30]:
def encode_char(c):
    """Return the one-hot encoding of character ``c`` as a plain list of ints.

    The list has ``enc_space`` entries: a single 1 at the index that
    ``encoder`` assigns to ``c`` and 0 everywhere else.

    Raises:
        KeyError: if ``c`` is not a key of ``encoder``.
    """
    hot_index = encoder[c]
    return [int(position == hot_index) for position in range(enc_space)]

def encode_name(n):
    """Encode the string ``n`` as a NumPy array of one-hot character rows.

    Each character of ``n`` is passed through ``encode_char``; the resulting
    rows are stacked in order, one row per character.
    """
    one_hot_rows = list(map(encode_char, n))
    return np.array(one_hot_rows)

In [31]:
# Load the raw table and replace every missing cell with the string '0'.
df_dataset = pd.read_csv("dataset.csv").fillna('0')

# One model input per row: "<Lastname>\t<Firstname>" (tab-separated).
input_string = [
    '\t'.join((str(last), str(first)))
    for last, first in df_dataset[['Lastname', 'Firstname']].values.tolist()
]
df_dataset.columns

Index(['Lastname', 'Firstname', 'Fullname', 'Firstname.1', 'Gendername',
       'Incomplete'],
      dtype='object')

In [32]:
# Targets: the four label columns as a 2-D array, one row per sample
# (presumably one-hot class indicators -- TODO confirm against dataset.csv).
y = df_dataset[
    ['Fullname', 'Firstname.1', 'Gendername', 'Incomplete']
].values

# Inputs: one-hot encode each string, then pad / truncate at the end so that
# every sample has exactly 40 timesteps.
input_vector_unpadded = list(map(encode_name, input_string))
X = pad_sequences(
    input_vector_unpadded,
    maxlen=40,
    padding='post',
    truncating='post',
)

In [33]:
# Convert the padded one-hot tensor to 32-bit floats before feeding the model.
X = X.astype(np.float32)

In [34]:
# Sanity check: expected layout is (n_samples, 40 timesteps, enc_space).
X.shape

(80000, 40, 74)

In [35]:
# Character-level classifier: a bidirectional LSTM reads the one-hot sequence,
# followed by a ReLU hidden layer and a 4-way softmax output.
#
# The LSTM is intentionally NOT stateful. Every row of X is an independent
# sample, so the recurrent state must be reset for each one; a stateful layer
# would carry the state of sample i in one batch over to sample i in the next
# batch and would also tie the model to a single fixed batch size. The former
# `batch_size=1` argument is dropped with it, since fit/evaluate run with
# their own (default) batch size.
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(enc_space)),
    tf.keras.layers.Dense(enc_space, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax'),
])

In [36]:
# Train with plain SGD (learning rate 0.01) on categorical cross-entropy;
# mean squared error and accuracy are reported as additional metrics.
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-2),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['mean_squared_error', 'accuracy'],
)

In [37]:
# Hold out 25% of the samples (scikit-learn's default split) for evaluation;
# the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=52
)

In [38]:
# Fit on the training split for 5 passes over the data.
model.fit(x=X_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa8fc4bae80>

In [39]:
# Score the held-out split; returns [loss, mean_squared_error, accuracy].
model.evaluate(x=X_test, y=y_test)



[0.009937566705048084, 0.0004112022870685905, 0.9995499849319458]

## Save Model

In [40]:
import pickle

In [41]:
# Persist the character -> index mapping so that inference code can encode
# inputs exactly as they were encoded for training.
with open("encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

In [42]:
# Write the trained model to a single HDF5 file.
model.save(filepath="saved_text_classifer.h5")