In [1]:
# Train the model in a Jupyter notebook for later deployment in C++ with frugally-deep

In [2]:
# use the PlaidML backend to accelerate Keras
# requires the plaidml-keras package to be installed, followed by running its setup
# import os
# os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [3]:
import tensorflow.keras as keras
import tensorflow as tf
print(keras.__version__)
#print(tf.__version__)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

2.4.0


# Prepare the dataset (load the data pre-processed in R)

In [4]:
# Load the pre-processed samples: the file holds one JSON document per line.
import json

# Decode explicitly as UTF-8 (the encoding JSON text is exchanged in) so that
# names with non-ASCII characters are read the same way on every platform;
# without it open() falls back to the locale-dependent default encoding.
with open('fullname_2.txt', 'r', encoding='utf-8') as f:
    # json.loads ignores the trailing newline of each line. Blank lines
    # (e.g. a stray empty line at the end of the file) are skipped instead of
    # aborting the whole load with a JSONDecodeError.
    lines = [json.loads(line) for line in f if line.strip()]

In [5]:
# Split every record into its input (record[0]) and its label.
# Only the ethnicity label -- the first entry of record[1] -- is used for now;
# gender will be handled later.
X, y = [], []
for record in lines:
    X.append(record[0])
    y.append(int(record[1][0]))

In [6]:
# sanity check: there must be exactly one label per input sequence
len(X) == len(y)

True

In [7]:
# import the mapping between tokens and their integer ids
# keep_default_na=False: the tokens are arbitrary short character sequences, so
# a token that happens to spell "NA", "nan", "null", ... must stay a string.
# With the pandas defaults it would be parsed as a missing value (NaN) and the
# token -> id lookup built from this table would then fail for that token.
df = pd.read_csv('fullname_2_vocab.csv', keep_default_na=False)
df

Unnamed: 0,token,id
0,Sa,1
1,ar,2
2,ra,3
3,ah,4
4,h,5
...,...,...
1490,Bd,1491
1491,W.,1492
1492,Sd,1493
1493,.p,1494


In [8]:
# Lookup tables in both directions, kept for later use:
#   id2token: integer id -> token
#   token2id: token      -> integer id
id2token = df.set_index('id')['token'].to_dict()
token2id = df.set_index('token')['id'].to_dict()

In [9]:
# inspect the token -> id mapping
token2id

{'Sa': 1,
 'ar': 2,
 'ra': 3,
 'ah': 4,
 'h ': 5,
 ' R': 6,
 'Ra': 7,
 'hi': 8,
 'im': 9,
 'Am': 10,
 'my': 11,
 'y ': 12,
 ' M': 13,
 'Ma': 14,
 'ai': 15,
 ' T': 16,
 'Ts': 17,
 'sa': 18,
 'an': 19,
 'ng': 20,
 'Ne': 21,
 'ee': 22,
 'el': 23,
 'li': 24,
 'ma': 25,
 'a ': 26,
 ' G': 27,
 'Go': 28,
 'op': 29,
 'pa': 30,
 'al': 31,
 'ri': 32,
 'ia': 33,
 ' L': 34,
 'Lu': 35,
 'un': 36,
 'na': 37,
 'Hu': 38,
 'uy': 39,
 'ye': 40,
 'en': 41,
 'n ': 42,
 'Ta': 43,
 'Al': 44,
 'is': 45,
 'sh': 46,
 'ha': 47,
 ' S': 48,
 'Sh': 49,
 'he': 50,
 'et': 51,
 'th': 52,
 'ie': 53,
 'tt': 54,
 'ta': 55,
 ' D': 56,
 'De': 57,
 'la': 58,
 'ap': 59,
 'az': 60,
 'ro': 61,
 'on': 62,
 'e ': 63,
 'Le': 64,
 'eo': 65,
 'Ga': 66,
 'ay': 67,
 'ya': 68,
 'at': 69,
 'hr': 70,
 'i ': 71,
 'Su': 72,
 'nd': 73,
 'da': 74,
 'aj': 75,
 'ja': 76,
 'us': 77,
 'ak': 78,
 'ka': 79,
 'am': 80,
 'md': 81,
 'as': 82,
 'ss': 83,
 'Re': 84,
 'eh': 85,
 ' A': 86,
 'Ak': 87,
 'kt': 88,
 'te': 89,
 'er': 90,
 'Us': 91,
 'sm': 9

In [10]:
# convert the input (X) from tokens to ids
# (a token that is missing from the vocabulary raises KeyError)
X = [list(map(token2id.__getitem__, tokens)) for tokens in X]

In [11]:
# shift the output (y) from the raw codes 2-5 down to the zero-based range 0-3
y = [code - 2 for code in y]

In [12]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Train the model

In [13]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM, Bidirectional

# Size of the embedding vocabulary: the ids in the vocabulary table start at 1,
# so one extra slot is reserved for index 0 (the value pad_sequences fills in
# by default).
num_words = 1 + len(id2token)
# every sequence is padded / truncated to exactly this many tokens
feature_len = 25
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Turn the ragged lists of ids into fixed-width integer matrices.
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels are zero-based, so the largest training label + 1 is the class count.
num_classes = 1 + np.max(y_train)
print(num_classes, 'classes')

# One-hot encode the labels, as required by the categorical_crossentropy loss.
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

669644 train sequences
167412 test sequences
Pad sequences (samples x time)
X_train shape: (669644, 25)
X_test shape: (167412, 25)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (669644, 4)
y_test shape: (167412, 4)


In [14]:
# build the model and run hyperparameter tuning
# (uses slightly older versions of tensorflow and keras-tuner)
import kerastuner as kt

def model_builder(hp):
    """Build and compile the bidirectional-LSTM classifier for one tuner trial.

    Hyperparameters registered on ``hp``:
      * ``emdb_depth``    -- embedding width, 32 to 128 in steps of 32
      * ``lstm_depth``    -- units of each LSTM, 32 to 128 in steps of 32
      * ``learning_rate`` -- Adam learning rate, one of 1e-2, 1e-3, 1e-4

    Reads the notebook-level ``num_words``, ``feature_len`` and
    ``num_classes`` set in the data-preparation cell.

    Args:
        hp: the keras-tuner hyperparameter container passed in by the tuner.

    Returns:
        A compiled ``Sequential`` model: Embedding -> two stacked
        Bidirectional LSTM layers -> softmax Dense with ``num_classes`` units.
    """
    print('Build model...')

    model = Sequential()

    # choose the depth of embedding
    hp_emdb_depth = hp.Int('emdb_depth', min_value=32, max_value=128, step=32)
    model.add(Embedding(num_words, hp_emdb_depth, input_length=feature_len))

    hp_lstm_depth = hp.Int('lstm_depth', min_value=32, max_value=128, step=32)
    # Two stacked bi-directional LSTMs. The first one returns the full sequence
    # so that the second one has a time axis to consume.
    # No input_shape is given here: the layer is not the first one of the model,
    # so its input shape is inferred from the Embedding output. (An explicit
    # shape built from the LSTM width would not describe that input, whose last
    # dimension is the embedding width.)
    model.add(Bidirectional(LSTM(hp_lstm_depth, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Bidirectional(LSTM(hp_lstm_depth, dropout=0.2, recurrent_dropout=0.2)))

    model.add(Dense(num_classes, activation='softmax'))

    # choose between learning rates
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  metrics=['accuracy'])
    return model

# init the tuner
# Hyperband search over model_builder, ranking trials by validation accuracy.
# Trial state is kept under <directory>/<project_name>.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    project_name='rethnicity',
    directory='tmp_bilstm',
)

# early stopping: end a fit once val_loss has not improved for 5 epochs in a row
stop_early = tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss')

INFO:tensorflow:Reloading Oracle from existing project tmp_bilstm/rethnicity/oracle.json
Build model...


In [15]:
# Run the hyperparameter search, validating each trial on 40% of the training data.
# `epochs` is deliberately not passed: Hyperband sets the epoch count of every
# trial itself (at most the tuner's max_epochs), so a value given here is ignored.
tuner.search(X_train, y_train, validation_split=0.4, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 18 Complete [02h 02m 47s]
val_accuracy: 0.7612167596817017

Best val_accuracy So Far: 0.7649463415145874
Total elapsed time: 14h 47m 06s

Search: Running Trial #19

Hyperparameter    |Value             |Best Value So Far 
emdb_depth        |96                |128               
lstm_depth        |96                |128               
learning_rate     |0.01              |0.001             
tuner/epochs      |4                 |10                
tuner/initial_e...|0                 |4                 
tuner/bracket     |1                 |2                 
tuner/round       |0                 |2                 

Build model...
Epoch 1/4
Epoch 2/4
Epoch 3/4

KeyboardInterrupt: 

In [None]:
# Rebuild the network from the best hyperparameters and fit it for 50 epochs.
# The per-epoch validation accuracy of this run tells us how long to train.
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, validation_split=0.4, epochs=50)

val_acc_per_epoch = history.history['val_accuracy']
# list positions are 0-based while epochs are counted from 1;
# on ties the earliest epoch with the top accuracy is chosen
best_epoch = 1 + val_acc_per_epoch.index(max(val_acc_per_epoch))
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Final model: a fresh network with the best hyperparameters ...
hypermodel = tuner.hypermodel.build(best_hps)

# ... retrained for exactly the number of epochs found above
hypermodel.fit(X_train, y_train, validation_split=0.4, epochs=best_epoch)

In [None]:
# run the model on test data to see the accuracy
# (evaluate returns the loss followed by the compiled metrics, here accuracy)
eval_result = hypermodel.evaluate(X_test, y_test)
print("[test loss, test accuracy]:", eval_result)

In [None]:
# TODO: check the metric of performance
from sklearn.metrics import classification_report

# class probabilities for every test sample ...
y_pred = hypermodel.predict(X_test, verbose=1, batch_size=32)
# ... reduced to the index of the most probable class
y_pred_bool = y_pred.argmax(axis=1)
# the one-hot test labels converted back to class indices
y_true = np.argmax(y_test, axis=1)

print(classification_report(y_true, y_pred_bool))

In [None]:
# ethnicity code reference
# 2: asian or pacific islander
# 3: black
# 4: hispanic
# 5: white
# we also need to subtract 2 from these codes to get the class indices used in the predicted result

In [None]:
# TODO: save the model and then load it into frugally-deep later

In [None]:
# write the trained model to an .h5 file, leaving out the optimizer state
hypermodel.save('fullname_2_ethnicity_bilstm.h5', include_optimizer=False)

In [None]:
# print the layer-by-layer summary of the final model
hypermodel.summary()

In [None]:
# show the configuration dictionary of the final model
hypermodel.get_config()

In [None]:
# show the optimizer object attached to the in-memory model
hypermodel.optimizer

In [None]:
# Read the optimizer's learning rate as a plain number.
# The backend is taken from tensorflow.keras -- the API the model was built
# with -- rather than from the standalone `keras` package, which may be
# missing or configured for a different backend.
from tensorflow.keras import backend as K
# `learning_rate` is the name the optimizer was constructed with (`lr` is its legacy alias)
K.eval(hypermodel.optimizer.learning_rate)