This notebook demonstrates the basic functions for using machine-learning models trained with scikit-learn to predict a person's gender and age interval from their name.

In [1]:
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Set up plotly's offline notebook mode so the later `iplot` calls can render in the notebook.
init_notebook_mode(connected=True)
import warnings
import numpy as np
# Suppress FutureWarning messages for the remainder of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data preprocessing

In [2]:
import pandas as pd

# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
name_df = pd.read_csv(
    r"D:\code\Behind_Names\Taiwan_Name_Experiment\name_data\name_sample.csv",
    encoding='big5',
)
# First names whose length is exactly 2 pass through unchanged; every other
# first name gets a single leading space prepended.
name_df["FirstName"] = [
    first_name if len(first_name) == 2 else f" {first_name}"
    for first_name in name_df["FirstName"]
]
name_df

Unnamed: 0,id,name_1,name_2,name_3,LastName,FirstName
0,郭*叡,郭,*,叡,郭,叡
1,葉*宣,葉,*,宣,葉,宣
2,許*凡,許,*,凡,許,凡
3,許*欣,許,*,欣,許,欣
4,賴*延,賴,*,延,賴,延
5,方平和,方,平,和,方,平和
6,蕭清彥,蕭,清,彥,蕭,清彥
7,王建邦,王,建,邦,王,建邦
8,陳依萱,陳,依,宣,陳,依萱
9,張家豪,張,家,豪,張,家豪


In [3]:
import pandas as pd
from gensim.models import word2vec
from name_module.preprocess import (get_first_name, get_last_name,
                                    turn_all_word_in_w2v_model_to_dataframe,
                                    add_w2v_feature, load_saved_files, add_word_vector,
                                    add_phonetic_feature, get_vowel_consonant, get_x_feature,
                                    add_fortune_map_feature, add_radical_feature, add_zodiac_feature)

# Load the persisted word2vec model and the lookup tables returned by load_saved_files().
print("Add W2V feature")
w2v_model = word2vec.Word2Vec.load("./w2v_data/wiki_moe_100_model.bin")
(
    all_character_in_name,
    moe_data_dict,
    synonyms_dict,
    consonants,
    vowels,
    special_word_dict,
) = load_saved_files()

# Column-wise mean of the model's vocabulary frame; passed to add_w2v_feature as `w2v_mean`.
w2v_mean = turn_all_word_in_w2v_model_to_dataframe(w2v_model).mean()
name_df, w2v_feature = add_w2v_feature(
    w2v_model,
    name_df,
    w2v_vector_number=100,
    synonyms=synonyms_dict,
    w2v_mean=w2v_mean,
)


Add W2V feature
w2v_feature len 200


## Gender prediction (w2v features)

In [4]:
import numpy as np
import pickle

# NOTE: pickle.load can execute code embedded in the file -- only load model
# files that come from a trusted source.
with open(
    r"D:\code\Behind_Names\Taiwan_Name_Experiment\TrainedModel\W2V_feature_gnder_RFC_model.pkl",
    "rb",
) as model_file:
    W_gnder_RFC_model = pickle.load(model_file)

In [5]:

# Class probabilities for every row of name_df, using the w2v feature columns.
# Column 1 is treated as "Male" and column 0 as "Female", as in the original cell.
# NOTE(review): confirm this order against W_gnder_RFC_model.classes_.
raw_probs = W_gnder_RFC_model.predict_proba(name_df[w2v_feature])

# The displayed probabilities are rounded to one decimal place.
predicts = np.round(raw_probs, 1)
name_df["Male_prob"] = predicts[:, 1]
name_df["Female_prob"] = predicts[:, 0]

# Decide the label from the UNROUNDED probabilities: after rounding, a
# 0.52 / 0.48 split becomes 0.5 / 0.5 and would wrongly fall through to "Female".
# An exact tie still resolves to "Female", as before.
name_df["Gender"] = np.where(raw_probs[:, 1] > raw_probs[:, 0], "Male", "Female")
name_df.head()

Unnamed: 0,id,name_1,name_2,name_3,LastName,FirstName,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,...,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99,Male_prob,Female_prob,Gender
0,郭*叡,郭,*,叡,郭,叡,-0.248495,-1.967829,0.290753,2.909606,...,1.758259,-0.098471,-0.553964,0.024966,-0.022386,0.090483,-2.088785,1.0,0.0,Male
1,葉*宣,葉,*,宣,葉,宣,-0.248495,2.577486,0.290753,10.592291,...,-1.239423,-0.098471,-4.751159,0.024966,-1.369206,0.090483,-9.444345,0.0,1.0,Female
2,許*凡,許,*,凡,許,凡,-0.248495,-0.344027,0.290753,-1.717562,...,-2.241606,-0.098471,0.578485,0.024966,-1.428055,0.090483,-4.951863,0.0,1.0,Female
3,許*欣,許,*,欣,許,欣,-0.248495,-2.39828,0.290753,1.651978,...,-2.361263,-0.098471,5.832688,0.024966,5.729279,0.090483,-0.815454,0.0,1.0,Female
4,賴*延,賴,*,延,賴,延,-0.248495,0.602578,0.290753,-2.990433,...,-0.079125,-0.098471,-1.431928,0.024966,-0.36886,0.090483,-1.560106,0.9,0.1,Male


# Predict gender

In [6]:
def predict_gender_for_single_name(model, x_feature, firstname, lastname="蕭", demo=None):
    """Predict gender probabilities for a single first name.

    A one-row frame is built for ``lastname + firstname``, the fortune-map,
    radical, phonetic and w2v feature builders are applied to it, and the
    model's ``predict_proba`` is evaluated on the ``x_feature`` columns.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    x_feature : iterable of column names fed to the model. Columns that the
        feature builders did not produce are filled with 0.
    firstname : str, the given name to classify.
    lastname : str, surname prepended to ``firstname`` (default "蕭").
        Previously this argument was ignored and "蕭" was always used.
        NOTE(review): get_last_name/get_first_name decide how the full name is
        split; confirm they handle multi-character surnames.
    demo : bool or None. When truthy, the guessed gender is printed. When None
        (default), a module-level ``demo`` variable is used if one exists,
        otherwise nothing is printed (previously a missing global raised
        NameError).

    Returns
    -------
    numpy.ndarray
        ``predict_proba`` output rounded to one decimal place. Column 1 is
        treated as Male and column 0 as Female (NOTE(review): confirm against
        ``model.classes_``).

    Relies on notebook-level names: ``moe_data_dict``, ``special_word_dict``,
    ``vowels``, ``w2v_model``, ``synonyms_dict`` and ``w2v_mean``.
    """
    if demo is None:
        # Backward compatibility: honour the notebook-level `demo` flag when it is set.
        demo = globals().get("demo", False)

    name = f'{lastname}{firstname}'
    name_df = pd.DataFrame({'name': [name]})

    name_df['LastName'] = name_df.name.apply(get_last_name)
    name_df['FirstName'] = name_df.name.apply(get_first_name)

    # Fortune map - one-hot encoding
    name_df, _ = add_fortune_map_feature(
        name_df, moe_data_dict, special_word_dict)

    # Radical - one-hot encoding
    name_df, _ = add_radical_feature(name_df, moe_data_dict)

    # Phonetic - one-hot encoding
    name_df, _ = add_phonetic_feature(
        name_df, vowels, moe_data_dict, special_word_dict)

    # W2V
    name_df, _ = add_w2v_feature(
        w2v_model, name_df, w2v_vector_number=100,
        synonyms=synonyms_dict, w2v_mean=w2v_mean)

    # A single name only produces a subset of the one-hot columns; any column
    # the model expects but this name did not produce is set to 0.
    for feature in x_feature:
        if feature not in name_df.columns:
            name_df[feature] = 0

    raw_probs = model.predict_proba(name_df[x_feature])
    predicts = np.round(raw_probs, 1)

    if demo:
        # Compare the unrounded probabilities: rounding first can turn a narrow
        # Male majority (e.g. 0.52 vs 0.48) into a 0.5/0.5 tie reported as Female.
        if raw_probs[0][1] > raw_probs[0][0]:
            print('我猜', firstname, '是:男生')
        else:
            print('我猜', firstname, '是:女生')

    return predicts

In [7]:
# Enable the printed verdict inside predict_gender_for_single_name.
demo = True
prob = predict_gender_for_single_name(
    W_gnder_RFC_model,
    w2v_feature,
    '佩琪',
)
# Row 0 is the only name; column 1 is plotted as Male and column 0 as Female.
labels, values = ['Male', 'Female'], [prob[0][1], prob[0][0]]

trace = go.Pie(values=values, labels=labels)
iplot([trace], filename='probability of Chinese first name')

len on fortune_map_feature_list: 6
len of Radical_feature_list:  2
phonetic_feature len: 4
w2v_feature len 200
我猜 佩琪 是:女生


# Predict age interval

In [8]:

# Step 1: fortune-map columns (one-hot encoded)
print("Add fortune map feature")
name_df, fortune_map_feature = add_fortune_map_feature(name_df, moe_data_dict, special_word_dict)

# Step 2: radical columns (one-hot encoded)
print("Add radical feature")
name_df, radical_feature = add_radical_feature(name_df, moe_data_dict)

# Step 3: phonetic columns (one-hot encoded)
print("Add phonetic feature")
name_df, phonetic_feature = add_phonetic_feature(name_df, vowels, moe_data_dict, special_word_dict)


Add fortune map feature
len on fortune_map_feature_list: 16
Add radical feature
len of Radical_feature_list:  16
Add phonetic feature
phonetic_feature len: 25


In [11]:

# NOTE: both files are unpickled -- only load them from a trusted source.
# Feature-name list that accompanies the age model.
with open(
    r"D:\code\Behind_Names\Taiwan_Name_Experiment\TrainedModel\Taiwan_name_RFC_WPFZR_feature.pkl",
    "rb",
) as feature_file:
    Taiwan_name_RFC_WPFZR_feature = pickle.load(feature_file)

# The age-interval classifier itself.
with open(
    r"D:\code\Behind_Names\Taiwan_Name_Experiment\TrainedModel\Taiwan_name_RFC_WPFZR_model.pkl",
    "rb",
) as model_file:
    Taiwan_name_RFC_WPFZR_model = pickle.load(model_file)

# Any expected feature that name_df does not have yet is created and set to 0.
for feature in Taiwan_name_RFC_WPFZR_feature:
    if feature in name_df.columns:
        continue
    name_df[feature] = 0


In [15]:
def predict_age_for_single_name(model, x_feature, name, name_type, demo=None, reference_year=2022):
    """Predict the age-interval probabilities for a single full name.

    A one-row frame is built for ``name``, the fortune-map, radical, phonetic
    and w2v feature builders are applied to it, and the model's
    ``predict_proba`` is evaluated on the ``x_feature`` columns.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    x_feature : iterable of column names fed to the model. Columns that the
        feature builders did not produce are filled with 0.
    name : str, full name (surname + given name).
    name_type : str. ``'real'`` labels the first class as birth years
        1944-1948; any other value labels it as 1978-1982. Each following
        class is shifted 5 years later.
    demo : bool or None. When truthy, one line per class is printed with the
        age range and its probability. When None (default), a module-level
        ``demo`` variable is used if one exists, otherwise nothing is printed
        (previously a missing global raised NameError).
    reference_year : int, year against which ages are computed (default 2022,
        the value that was hard-coded before).

    Returns
    -------
    numpy.ndarray
        The unrounded ``predict_proba`` output; row 0 is the given name.

    Relies on notebook-level names: ``moe_data_dict``, ``special_word_dict``,
    ``vowels``, ``w2v_model``, ``synonyms_dict`` and ``w2v_mean``.
    """
    if demo is None:
        # Backward compatibility: honour the notebook-level `demo` flag when it is set.
        demo = globals().get("demo", False)

    name_df = pd.DataFrame({'name': [name]})

    name_df['LastName'] = name_df.name.apply(get_last_name)
    name_df['FirstName'] = name_df.name.apply(get_first_name)

    # Fortune map - one-hot encoding
    name_df, _ = add_fortune_map_feature(
        name_df, moe_data_dict, special_word_dict)

    # Radical - one-hot encoding
    name_df, _ = add_radical_feature(name_df, moe_data_dict)

    # Phonetic - one-hot encoding
    name_df, _ = add_phonetic_feature(
        name_df, vowels, moe_data_dict, special_word_dict)

    # W2V
    name_df, _ = add_w2v_feature(
        w2v_model, name_df, w2v_vector_number=100,
        synonyms=synonyms_dict, w2v_mean=w2v_mean)

    # A single name only produces a subset of the one-hot columns; any column
    # the model expects but this name did not produce is set to 0.
    for feature in x_feature:
        if feature not in name_df.columns:
            name_df[feature] = 0

    predicts = model.predict_proba(name_df[x_feature])

    # Print the prediction result.
    if demo:
        # Birth year at which the first class starts; every class spans 5 years.
        # NOTE(review): one line is printed per model class. If the model has
        # more classes than the label scheme was designed for, the later lines
        # can show negative ages.
        first_bin_start = 1944 if name_type == 'real' else 1978
        for index, probability in enumerate(predicts[0]):
            bin_start = first_bin_start + index * 5
            oldest_age = reference_year - bin_start
            youngest_age = reference_year - (bin_start + 4)
            print(f"{oldest_age} ~ {youngest_age} 歲的機率：{round(probability*100, 2)}%")

    return predicts
    

In [16]:
def make_probability_of_birth_year_graph(pro_array, name, name_type):
    """Plot the birth-year probability curve for one name with plotly.

    ``pro_array[0]`` supplies the per-class probabilities. For
    ``name_type == 'real'`` the first 10 values are plotted against the years
    1945, 1950, ..., 1990; for any other ``name_type`` the first 7 values are
    plotted against 1975, 1980, ..., 2005. The figure is rendered with
    ``iplot``; nothing is returned.
    """
    # (first x-axis year, number of classes plotted) for each naming scheme.
    first_year, bin_count = (1945, 10) if name_type == 'real' else (1975, 7)

    birth_years = [first_year + 5 * step for step in range(bin_count)]
    probabilities = [pro_array[0][step] for step in range(bin_count)]
    data = [go.Scatter(x=birth_years, y=probabilities)]

    def axis_style(axis_title):
        # Both axes share the same title font; a fresh dict is built per axis.
        # NOTE(review): `titlefont` is a legacy plotly attribute -- confirm the
        # installed plotly version still accepts it.
        return dict(
            title=axis_title,
            titlefont=dict(
                family='Arial, sans-serif',
                size=18,
                color='lightgrey'
            ))

    layout = {
        'title': name + '年齡分布的機率',
        'font': dict(size=16),
        'xaxis': axis_style('birth year'),
        'yaxis': axis_style('prob'),
    }
    iplot({'data': data, 'layout': layout}, filename='probability_of_birth_year')

In [17]:
# Print the per-interval probabilities while predicting.
demo = True
name = '馬英九'
probability_of_age = predict_age_for_single_name(
    Taiwan_name_RFC_WPFZR_model,
    Taiwan_name_RFC_WPFZR_feature,
    name,
    name_type='real',
)
make_probability_of_birth_year_graph(probability_of_age, name, name_type='real')

len on fortune_map_feature_list: 6
len of Radical_feature_list:  2
phonetic_feature len: 4
w2v_feature len 200
78 ~ 74 歲的機率：15.62%
73 ~ 69 歲的機率：12.5%
68 ~ 64 歲的機率：7.81%
63 ~ 59 歲的機率：10.94%
58 ~ 54 歲的機率：6.25%
53 ~ 49 歲的機率：17.19%
48 ~ 44 歲的機率：7.81%
43 ~ 39 歲的機率：6.25%
38 ~ 34 歲的機率：9.38%
33 ~ 29 歲的機率：6.25%


In [18]:
# Same name and model as the 'real' demo, but labelled with the 'FB' scheme.
# NOTE(review): this reuses Taiwan_name_RFC_WPFZR_model, which yields 10 classes,
# while the 'FB' graph only plots 7 bins; the printed list therefore runs into
# negative ages. Confirm whether a separate FB-trained model should be loaded here.
demo = True
name = '馬英九'
probability_of_age = predict_age_for_single_name(
    Taiwan_name_RFC_WPFZR_model,
    Taiwan_name_RFC_WPFZR_feature,
    name,
    name_type='FB',
)
make_probability_of_birth_year_graph(probability_of_age, name, name_type='FB')

len on fortune_map_feature_list: 6
len of Radical_feature_list:  2
phonetic_feature len: 4
w2v_feature len 200
44 ~ 40 歲的機率：15.62%
39 ~ 35 歲的機率：12.5%
34 ~ 30 歲的機率：7.81%
29 ~ 25 歲的機率：10.94%
24 ~ 20 歲的機率：6.25%
19 ~ 15 歲的機率：17.19%
14 ~ 10 歲的機率：7.81%
9 ~ 5 歲的機率：6.25%
4 ~ 0 歲的機率：9.38%
-1 ~ -5 歲的機率：6.25%


# Name fortune telling

In [19]:
from name_module.fortune_map_calculate import FortuneMapCalculater

# Build the calculator from the loaded dictionaries and run the fortune report for one name.
# `fc` holds whatever name_fortune_telling returns, not the calculator instance.
fc = FortuneMapCalculater(
    moe_data_dict,
    special_word_dict,
).name_fortune_telling('馬英九')

名字： 馬英九
五格
天格： 11 吉
地格： 11 吉
人格： 21 吉
外格： 3 吉
總格： 23 吉
三才
天才: 木
人才: 木
地才: 木
三才格局： 大吉

