In [1]:
from packages import *

In [2]:
def _clean_name(raw_name):
    """Reduce one raw name to the lowercase a-z letters of its first word.

    Missing values (None/NaN or any non-string) and empty or whitespace-only
    strings yield an empty string instead of raising.
    """
    if not isinstance(raw_name, str):
        return ''

    words = raw_name.split()
    if not words:
        # Nothing to take a first word from.
        return ''

    # Lowercase a second time after transliteration: unidecode may introduce
    # capital letters (e.g. when romanising non-Latin scripts), and a capital
    # would otherwise fall outside the a-z range encoded below.
    first_word = unidecode.unidecode(words[0].lower()).lower()

    # Keep letters only; after transliteration and lowercasing that is a-z.
    return ''.join(char for char in first_word if 'a' <= char <= 'z')


def _encode_name(name, name_length):
    """Encode a cleaned name as exactly ``name_length`` floats.

    'a' maps to 1.0, 'b' to 2.0, ... 'z' to 26.0; names longer than
    ``name_length`` are truncated and shorter ones are right-padded with 0.0.
    """
    codes = [ord(char) - 96.0 for char in name[:name_length]]
    return codes + [0.0] * (name_length - len(codes))


def preprocess(names_df, train=True, name_length=20):
    """Turn a frame with a 'name' column into fixed-length numeric sequences.

    Each name is lowercased, cut to its first word, transliterated to ASCII,
    stripped of non-letters and encoded as a list of ``name_length`` floats
    (padding = 0.0, 'a' = 1.0, ..., 'z' = 26.0).

    Parameters
    ----------
    names_df : DataFrame
        Must contain a 'name' column; when ``train`` is true it must also
        contain a 'gender' column. The input frame is not modified.
    train : bool, default True
        True: rows containing NaN are dropped, duplicate rows (compared after
        name cleaning) are dropped, and 'gender' is encoded as 0 for 'F' and
        1 for anything else.
        False: every input row is kept, in order, so the output stays
        positionally aligned with the caller's list of names; missing or
        blank names become an all-padding sequence.
    name_length : int, default 20
        Length of every encoded sequence.

    Returns
    -------
    DataFrame
        A new frame with a fresh 0..n-1 index whose 'name' column holds the
        encoded lists.
    """
    # Drop incomplete rows up front (training only) so the string handling
    # below never sees NaN.
    frame = names_df.dropna() if train else names_df

    # ``assign`` returns a new frame, so the caller's data is left untouched.
    frame = frame.assign(name=[_clean_name(raw) for raw in frame['name']])

    if train:
        # Deduplicate on the cleaned names, before they become (unhashable) lists.
        frame = frame.drop_duplicates()
    frame = frame.reset_index(drop=True)

    # Encode characters to numbers and pad/truncate to a common length.
    frame = frame.assign(
        name=[_encode_name(name, name_length) for name in frame['name']]
    )

    # Encode gender to numbers.
    if train:
        frame = frame.assign(
            gender=[0 if gender == 'F' else 1 for gender in frame['gender']]
        )

    return frame

In [3]:
# Load the trained gender classifier from disk.
MODEL_PATH = 'models/BiLSTM_GC.h5'
pred_model = load_model(MODEL_PATH)

In [4]:
# Names to classify (edit this list to score other names).
names = [
    'Joe',
    'Kamala',
]

In [5]:
# Wrap the raw names in a single-column frame, the input shape preprocess() expects.
pred_df = pd.DataFrame(dict(name=names))

In [6]:
# Clean and encode the names; train=False because there is no 'gender' column to encode.
pred_df = preprocess(names_df=pred_df, train=False)

In [7]:
# Predictions
# Stack the per-name encoded lists into one array for the model.
name_matrix = np.asarray(pred_df['name'].values.tolist())

# squeeze(axis=1) requires the model output to have a size-1 second axis and
# flattens it to one score per name.
raw_scores = pred_model.predict(name_matrix)
result = raw_scores.squeeze(axis=1)

In [8]:
# Format the output
# The table is rebuilt from `names` and `result` only, so re-running this cell
# always yields the same frame (no in-place edits that would stack a second
# 'Name' column on a repeated run).
scores = np.asarray(result)

if len(scores) != len(names):
    raise ValueError(
        f"Got {len(scores)} predictions for {len(names)} input names; "
        "preprocessing must keep exactly one row per input name."
    )

# Scores are thresholded at 0.5: above it the label is 'Male' (gender is
# encoded as 1 for non-'F' during training), otherwise 'Female'.
is_male = scores > 0.5

pred_df = pd.DataFrame({
    'Name': names,
    'Predicted Gender': np.where(is_male, 'Male', 'Female'),
    # Report the score of the predicted class: p for 'Male', 1 - p for 'Female'.
    'Probability': np.where(is_male, scores, 1.0 - scores).round(2),
}).drop_duplicates()

pred_df.head()

Unnamed: 0,Name,Predicted Gender,Probability
0,Joe,Male,0.57
1,Kamala,Female,0.99
