## Objective
通过姓氏来预测性别


#### 源文章
https://towardsdatascience.com/choosing-the-right-hyperparameters-for-a-simple-lstm-using-keras-f8e9ed76f046

#### 源代码
https://github.com/R4h4/Firstname_gender_prediction/blob/master/Article_Gender_Prediction.ipynb

In [1]:
import pandas as pd
import numpy as np
from numpy import array
from numpy import argmax
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Prepare data

In [10]:
# Load the names CSV; only the `first_name` and `gender` columns are read
filepath = 'name_test.csv'
max_rows = 500000  # upper bound on the rows kept, because of memory limitations

df = (
    pd.read_csv(filepath, usecols=['first_name', 'gender'])
    .dropna(subset=['first_name', 'gender'])
    .assign(first_name=lambda frame: frame['first_name'].str.strip())
    .head(max_rows)
    # When a middle name is present, keep only the part before the first space
    .assign(first_firstname=lambda frame: frame['first_name'].apply(
        lambda full_name: str(full_name).split(' ', 1)[0]))
    # Some people only put the first letter of their name into the field,
    # so every name shorter than 3 characters is discarded
    .loc[lambda frame: ~(frame['first_firstname'].str.len() < 3)]
)


In [12]:
# Preprocessing setup: every (first) name is turned into a fixed-size matrix
# with one one-hot encoded row per character.

# Column holding the model input and column holding the target
predictor_col = 'first_firstname'
result_col = 'gender'

# Alphabet known to the encoder; each character gets its own one-hot slot
accepted_chars = 'abcdefghijklmnopqrstuvwxyzöäü-'

# Rows of the input matrix: length of the longest name, capped at 25
word_vec_length = min(df[predictor_col].map(len).max(), 25)

# Columns of the input matrix: one per accepted character
char_vec_length = len(accepted_chars)

# Number of output labels
output_labels = 2

print(f"The input vector will have the shape {word_vec_length}x{char_vec_length}.")

# Lookup tables between a character and its position in accepted_chars.
# enumerate() pairs every element of a sequence with its index (starting at 0).
char_to_int = {char: position for position, char in enumerate(accepted_chars)}
int_to_char = dict(enumerate(accepted_chars))

The input vector will have the shape 15x30.


In [30]:
#test
# char_to_int
# int_to_char
# for i,c in enumerate(accepted_chars):
#     print(i,c)

In [None]:
def normalize(line):
    """Lower-case `line` and drop every character that is not accepted.

    Args:
        line: iterable of single-character strings, typically a name.

    Returns:
        List with the lower-cased characters that occur in `accepted_chars`,
        in their original order.
    """
    kept = []
    for raw_char in line:
        lowered = raw_char.lower()
        if lowered in accepted_chars:
            kept.append(lowered)
    return kept

In [None]:
def name_encoding(name):
    """One-hot encode a normalized name into a fixed-size list of rows.

    Args:
        name: sized iterable of characters that are all keys of `char_to_int`.

    Returns:
        List of one-hot rows, each of length `char_vec_length`. Only the first
        `word_vec_length` characters are encoded; shorter names are padded
        with all-zero rows up to `word_vec_length` rows.
    """
    encoded_rows = []

    # zip() against the range stops after word_vec_length characters
    for _, char in zip(range(word_vec_length), name):
        row = [0] * char_vec_length
        row[char_to_int[char]] = 1  # e.g. a -> slot 0, z -> slot 25
        encoded_rows.append(row)

    # Pad with empty rows: all encodings need the same length to be stacked into one array
    missing_rows = word_vec_length - len(name)
    encoded_rows.extend([0] * char_vec_length for _ in range(missing_rows))

    return encoded_rows

In [None]:
def lable_encoding(gender_series):
    """One-hot encode the gender labels.

    'M' is encoded as [1, 0]; every other value is encoded as [0, 1].

    Args:
        gender_series: iterable of gender labels (e.g. a pandas Series).

    Returns:
        Float numpy array of shape (number of labels, 2); shape (0, 2) for
        empty input.
    """
    # Collect all rows first and convert once: np.append copies the whole
    # array on every call, so appending row by row is quadratic in the
    # number of labels.
    rows = [[1, 0] if gender == 'M' else [0, 1] for gender in gender_series]
    # reshape keeps the two-column layout even when there are no rows
    return np.array(rows, dtype=float).reshape(-1, 2)

In [11]:
    
# Shuffle once, then cut into 60% train, 20% validation and 20% test.
# The fixed random_state makes the split reproducible between runs. Positional
# iloc slicing replaces np.split, which on a DataFrame relies on
# DataFrame.swapaxes (deprecated in recent pandas versions).
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
validate_end = int(.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Convert the input names and the output labels into the machine readable
# vector format; all three splits are encoded the same way
train_x = np.asarray([name_encoding(normalize(name)) for name in train[predictor_col]])
train_y = lable_encoding(train.gender)

validate_x = np.asarray([name_encoding(normalize(name)) for name in validate[predictor_col]])
validate_y = lable_encoding(validate.gender)

test_x = np.asarray([name_encoding(normalize(name)) for name in test[predictor_col]])
test_y = lable_encoding(test.gender)

The input vector will have the shape 15x30.


## Build Neural Network

In [31]:
# Size of the LSTM layer: two thirds of the number of cells in one input matrix
hidden_nodes = int((word_vec_length * char_vec_length) * (2/3))
print(f"The number of hidden nodes is {hidden_nodes}.")

The number of hidden nodes is 300.


### Build model

In [35]:
print('Building the model ...')

# One LSTM layer over the (word_vec_length x char_vec_length) name matrix,
# followed by dropout and a softmax over the output labels
model = Sequential([
    LSTM(hidden_nodes, return_sequences=False, input_shape=(word_vec_length, char_vec_length)),
    # Dropout ignores the output of randomly chosen neurons during training, which
    # makes single neurons less sensitive to specific weights and reduces
    # overfitting to some extent; the dropout rate is commonly set to 20%
    Dropout(0.2),
    Dense(units=output_labels),
    Activation('softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])


Building the model ...


### Train model

In [38]:
# Number of samples per gradient update
batch_size = 1000

# Train for 10 epochs; the validation split is passed so it is scored alongside training
model.fit(
    x=train_x,
    y=train_y,
    batch_size=batch_size,
    epochs=10,
    validation_data=(validate_x, validate_y),
)

Train on 517 samples, validate on 173 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x117a41be0>

In [44]:
# Attach the predicted label to every validation row: 'M' when the first output
# score is strictly larger than the second one, otherwise 'F'.
# `validate` is a slice of the shuffled frame, so assigning a column in place can
# trigger pandas' SettingWithCopyWarning; assign() returns a new frame instead.
validate = validate.assign(
    predicted_gender=[
        'M' if male_score > female_score else 'F'
        for male_score, female_score in model.predict(validate_x)
    ]
)

# First rows where the prediction differs from the true gender
validate[validate['gender'] != validate['predicted_gender']].head()

Unnamed: 0,first_name,gender,first_firstname,predicted_gender
290,Addicks,M,Addicks,F
371,Ebel,M,Ebel,F
374,Eberhardt,M,Eberhardt,F
754,Stoiber,M,Stoiber,F
452,Engelhardt,M,Engelhardt,F
