(1) Import libraries

In [1]:
from collections import Counter
import numpy as np
import pandas as pd
from pathlib import Path as path_lib
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn import metrics

from name_module.preprocess import *
from name_module.share_lib import reduce_mem_usage, restore_df_dtypes
from name_module.training_module import *

# Display floats in rendered DataFrames with thousands separators and 4 decimal places.
pd.options.display.float_format = '{:,.4f}'.format

(2) Read files and construct features for the name classifier

(2-1A) Read raw Taiwanese dataset

In [2]:
# Load the raw Facebook name dataset from CSV and preview the first rows.
# `path_lib` is the pathlib.Path alias imported explicitly at the top of this
# notebook; using it avoids depending on a name leaked by a star import.
data_path = path_lib("./name_data/")
file_name = "FB_Name_data.csv"
name_df = pd.read_csv(data_path / file_name)
name_df.head()

Unnamed: 0,name,BirthYear,FirstName,LastName,gender,message,userID
0,黃海銘,11,海銘,黃,1,1995,1484433631589977
1,林士傑,12,士傑,林,1,2001,296012147484045
2,翁玉妹,12,玉妹,翁,0,2003,1895288067372189
3,趙凱揚,11,凱揚,趙,1,1995,1295109403905407
4,羅文泰,6,文泰,羅,1,1972,1550108411680341


In [3]:
# Run the project preprocessing pipeline on the raw frame. According to the log it
# prints, it applies several row filters and then adds W2V, phonetic and fortune-map
# features. NOTE: this rebinds `name_df`, so re-running the cell without re-running
# the load cell would preprocess an already-processed frame.
name_df = preprocess(name_df, data_path, file_name)

Drop Message is not number:  489736 -> 489736  drop: 0
Drop English name: from  489736 -> 489736  drop: 0
Drop last name is not in Taiwan last name list : 489736 -> 489736  drop: 0
Drop First name is longer than 3  : 489736 -> 489736  drop: 0
Add W2V feature
Memory usage of properties dataframe is : 777.1689453125  MB
Memory usage is:  394.1890563964844  MB
This is  50.72115384615385 % of the initial size
w2v_feature len 200
Add phonetic feature
phonetic_feature len: 329
Add fortune map feature
Error:  禤 get stroke count failed!
Error:  顔 get stroke count failed!
Error:  関 get stroke count failed!
Error:  関 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  顔 get stroke count failed!
Error:  関 get stroke count failed!
Error:  関 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 

In [None]:
# Preview the frame after preprocessing (feature columns appended to the raw ones).
name_df.head()

Unnamed: 0,name,BirthYear,FirstName,LastName,gender,message,userID,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_狗,Zodiac_猴,Zodiac_羊,Zodiac_虎,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍
0,黃海銘,11,海銘,黃,1,1995,1484433631589977,2.7937,-6.7506,-1.1792,...,0,0,0,0,0,1,0,0,0,0
1,林士傑,12,士傑,林,1,2001,296012147484045,-2.3642,-3.4724,1.0797,...,0,0,0,0,1,0,0,0,0,0
2,翁玉妹,12,玉妹,翁,0,2003,1895288067372189,-1.2031,3.1973,5.6458,...,0,0,1,0,0,0,0,0,0,0
3,趙凱揚,11,凱揚,趙,1,1995,1295109403905407,-0.1228,-3.4841,3.4411,...,0,0,0,0,0,1,0,0,0,0
4,羅文泰,6,文泰,羅,1,1972,1550108411680341,2.7781,2.4125,0.4082,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# Load the pre-trained gender classifier and the feature list saved next to it.
# NOTE(security): unpickling can execute arbitrary code; only load files from a
# trusted source.
trained_model_dir = './TrainedModel'

gender_model_name = "WPF_gender_RFC_model.pkl"
with open('{}/{}'.format(trained_model_dir, gender_model_name), 'rb') as handle:
    # pickle.load streams from the file instead of first reading it all into bytes.
    gender_model = pickle.load(handle)

gender_x_feature_name = "WPF_gender_RFC_feature.pkl"
with open('{}/{}'.format(trained_model_dir, gender_x_feature_name), 'rb') as handle:
    gender_x_feature = pickle.load(handle)

In [5]:
# Add the gender model's outputs as features; the preview below shows the new
# Male_prob / Female_prob columns at the end of the frame.
name_df = add_gender_feature(name_df, gender_model, gender_x_feature)
name_df.head()

Unnamed: 0,name,BirthYear,FirstName,LastName,gender,message,userID,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍,FN1_Vowel_,FN2_Vowel_ǘ,Male_prob,Female_prob
0,黃海銘,11,海銘,黃,1,1995,1484433631589977,2.7937,-6.7506,-1.1792,...,0,1,0,0,0,0,0,0,0.5469,0.4531
1,林士傑,12,士傑,林,1,2001,296012147484045,-2.3642,-3.4724,1.0797,...,1,0,0,0,0,0,0,0,0.4062,0.5938
2,翁玉妹,12,玉妹,翁,0,2003,1895288067372189,-1.2031,3.1973,5.6458,...,0,0,0,0,0,0,0,0,0.4688,0.5312
3,趙凱揚,11,凱揚,趙,1,1995,1295109403905407,-0.1228,-3.4841,3.4411,...,0,1,0,0,0,0,0,0,0.5156,0.4844
4,羅文泰,6,文泰,羅,1,1972,1550108411680341,2.7781,2.4125,0.4082,...,0,0,0,0,1,0,0,0,0.5,0.5


In [6]:
# Summarise row count, column count, dtypes and memory usage of the feature frame.
name_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 489736 entries, 0 to 489735
Columns: 998 entries, name to Female_prob
dtypes: float32(200), float64(2), int64(2), object(3), uint16(3), uint64(1), uint8(787)
memory usage: 777.6+ MB


(2-1B) Read sampled baseline dataset for comparing methods 

In [2]:
# Load the sampled baseline dataset together with its saved train/test index split.
# NOTE(security): unpickling can execute arbitrary code; only load files from a
# trusted source.
baseline_dir = './NameData/thesis_experiment/TaiwanNames_on_FB'

with open('{}/final_FB_name_df'.format(baseline_dir), 'rb') as handle:
    # pickle.load streams from the file instead of first reading it all into bytes.
    name_df = pickle.load(handle)
with open('{}/test_index'.format(baseline_dir), 'rb') as handle:
    test_index = pickle.load(handle)
with open('{}/train_index'.format(baseline_dir), 'rb') as handle:
    train_index = pickle.load(handle)

In [6]:
# Preview the unpickled baseline frame.
name_df.head()

Unnamed: 0,name,message,userID,FirstName,FN1,FN2,FN1_muin,FN2_muin,BirthYear,GuessedGender,...,FirstName1_ratio_9,FirstName2_ratio_9,FirstName1_ratio_10,FirstName2_ratio_10,FirstName1_ratio_11,FirstName2_ratio_11,FirstName1_ratio_12,FirstName2_ratio_12,FirstName1_ratio_13,FirstName2_ratio_13
0,張家琪,1984,1216132381818443,家琪,29,384,iā,í,8,0,...,0.0082,0.0031,0.0083,0.0033,0.0101,0.0029,0.008,0.0031,0.0078,0.0037
1,涂銘峰,1983,1381624175192111,銘峰,44,233,íng,ēng,8,1,...,0.0056,0.002,0.0042,0.0025,0.0047,0.0021,0.0037,0.0012,0.0021,0.0014
2,歐昱成,1980,1442601205791441,昱成,97,113,ù,éng,8,1,...,0.002,0.0017,0.0039,0.0021,0.005,0.0018,0.0052,0.0019,0.0052,0.0009
5,張耕耀,1983,1592555917430952,耕耀,1679,12,ēng,ào,8,1,...,0.0001,0.0017,0.0002,0.0017,0.0002,0.0014,0.0001,0.0015,0.0001,0.0008
6,胡小芳,1984,1384729524883556,小芳,53,246,iǎo,āng,8,0,...,0.0251,0.0037,0.0081,0.0035,0.0025,0.0034,0.0023,0.0026,0.0047,0.0036


In [7]:
# Inspect the saved test split (row index labels into name_df).
test_index

Int64Index([133037, 259987,  77026, 266255, 270063,  35689, 130950,  47773,
            104126, 173102,
            ...
             78307, 146922, 169120, 103887, 152845, 246760, 219474, 134475,
            218389, 297738],
           dtype='int64', length=90730)

In [3]:
# Re-apply compact dtypes to the unpickled frame, then report dtypes and memory usage.
one_hot_columns = get_x_feature(['Phonetic','Fortune_map','Zodiac','Radical'], name_df)
# NOTE(review): "BirthYear " (with a trailing space) is listed next to 'BirthYear' —
# confirm this is an intentional legacy column name and not a typo.
integer_columns = ['BirthYear', 'message', 'gender', "BirthYear "]
w2v_columns = get_x_feature(['W2V'], name_df)

name_df = restore_df_dtypes(
    df=name_df,
    int8_col=one_hot_columns,
    int64_col=integer_columns,
    float32_col=w2v_columns,
)
name_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152788 entries, 0 to 164821
Columns: 3762 entries, name to FirstName2_ratio_13
dtypes: float32(200), float64(16), int64(6), int8(30), object(8), uint8(3502)
memory usage: 667.4+ MB


In [4]:
# Add zodiac features; the helper returns the updated frame and the list of
# zodiac feature names (its log line reports 12 entries).
name_df, zodiac_feature_list = add_zodiac_feature(name_df)

len of Zodiac_feature_list:  12


In [5]:
# NOTE(review): presumably maps legacy column names of the old pickled frame to the
# current naming scheme — confirm against name_module.
name_df = rename_old_name_df_dict(name_df)

(3) Merge birth-year categories

In [7]:
# Dataset (2-1 A): frequency of each birth-year bucket before merging, most common first.
birth_year_counts = Counter(name_df["BirthYear"].tolist())
birth_year_counts.most_common()

[(11, 146675),
 (12, 123568),
 (10, 83463),
 (9, 44631),
 (13, 31770),
 (8, 28405),
 (7, 14218),
 (6, 8110),
 (5, 4842),
 (4, 2582),
 (3, 930),
 (2, 378),
 (1, 122),
 (0, 42)]

In [8]:
# BirthYear is a 5-year bucket index counted from BIRTH_YEAR_BASE.
BIRTH_YEAR_BASE = 1940
head = 7  # FB Base
tail = 13
# Calendar year at which the `head` bucket starts.
birth_year_base = BIRTH_YEAR_BASE + 5 * head
# Collapse sparse buckets: the counts shown after this cell have every bucket
# below `head` folded into `head`.
name_df["BirthYear"] = name_df["BirthYear"].apply(
    lambda bucket: merge_birth_year(bucket, head, tail)
)

In [9]:
# Bucket frequencies after merging, most common first.
merged_birth_year_counts = Counter(name_df["BirthYear"].tolist())
merged_birth_year_counts.most_common()

[(11, 146675),
 (12, 123568),
 (10, 83463),
 (9, 44631),
 (13, 31770),
 (7, 31224),
 (8, 28405)]

In [29]:
# Preview the frame after merging birth-year buckets.
# NOTE(review): the saved output of this cell has an out-of-sequence execution count
# and shows BirthYear values that do not match the merged buckets listed above —
# it looks stale; re-run after Restart & Run All.
name_df.head()

Unnamed: 0,name,message,userID,FirstName,FN1,FN2,FN1_muin,FN2_muin,BirthYear,GuessedGender,...,Zodiac_狗,Zodiac_猴,Zodiac_羊,Zodiac_虎,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍
0,張家琪,1984,1216132381818443,家琪,29,384,iā,í,2.0,0,...,0,0,0,0,0,0,0,0,1,0
1,涂銘峰,1983,1381624175192111,銘峰,44,233,íng,ēng,2.0,1,...,0,0,0,0,0,1,0,0,0,0
2,歐昱成,1980,1442601205791441,昱成,97,113,ù,éng,2.0,1,...,0,1,0,0,0,0,0,0,0,0
5,張耕耀,1983,1592555917430952,耕耀,1679,12,ēng,ào,2.0,1,...,0,0,0,0,0,1,0,0,0,0
6,胡小芳,1984,1384729524883556,小芳,53,246,iǎo,āng,2.0,0,...,0,0,0,0,0,0,0,0,1,0


(4) Prepare for model training

(4-1) Make Feature Combinations

In [10]:
# Feature groups used for the birth-year model.
feature_list = ['W2V', 'Phonetic', 'Fortune_map', 'Zodiac', 'Radical']
# Same groups plus the uni-gram group, kept for the gender model.
feature_list_gender = feature_list + ['uni-gram']
# Feature-group combinations to train on, as produced by the project helper.
feature_combinations = get_all_combinations(feature_list)

(4-2) Normalize w2v

In [11]:
# Normalise the word2vec columns and show summary statistics before and after.
# The "after" table reports mean 0 and std 1 for every W2V column.
# NOTE(review): statistics are computed over the whole frame, i.e. before any
# train/test split — confirm this is acceptable for the experiment.
w2v_feature = get_x_feature(['W2V'], name_df.columns)

print("Unnormalized W2V feature")
w2v_stats_before = name_df[w2v_feature].describe()
display(w2v_stats_before)

name_df = w2v_normalize(name_df, w2v_feature)

print("Normalized W2V feature")
w2v_stats_after = name_df[w2v_feature].describe()
display(w2v_stats_after)

Unnormalized W2V feature


Unnamed: 0,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,FN1_wv_2,FN2_wv_2,FN1_wv_3,FN2_wv_3,FN1_wv_4,FN2_wv_4,...,FN1_wv_95,FN2_wv_95,FN1_wv_96,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99
count,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,...,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0
mean,-1.3831,-1.3533,0.9865,1.5856,-1.3374,-1.3298,1.1203,1.6334,0.9461,0.5497,...,0.8996,1.003,-0.7276,-0.8914,0.8519,0.4904,1.018,0.6859,-1.1867,-1.1705
std,3.2859,2.8826,3.5006,3.2423,3.1615,3.1668,3.4363,3.1633,3.4101,2.9424,...,3.5171,3.1125,3.213,3.1911,3.6667,2.9826,3.0727,2.7631,3.2623,3.2422
min,-14.5524,-14.5524,-15.1554,-15.1554,-12.1204,-12.1204,-13.6414,-13.6414,-19.8135,-19.8135,...,-15.676,-15.676,-13.2748,-13.2748,-12.2678,-12.2678,-13.1344,-14.7658,-11.9364,-11.9364
25%,-3.4724,-3.2382,-1.1152,-0.4328,-3.3948,-3.5147,-0.7009,-0.2491,-0.9913,-1.2926,...,-1.2608,-0.9547,-2.8725,-3.0075,-1.2119,-1.1962,-1.1005,-1.0386,-3.4092,-3.323
50%,-1.2031,-1.4518,1.3743,1.9974,-1.6844,-1.6666,1.4775,1.7365,0.9882,0.6071,...,1.2164,1.3655,-0.6982,-1.1253,0.6738,0.3033,0.7062,0.4593,-0.9327,-1.181
75%,0.5761,0.3718,3.5734,3.7775,0.4163,0.4154,3.2359,3.871,3.0144,2.3595,...,3.2027,3.1912,1.5215,1.2681,2.9024,2.3264,3.2944,2.6557,0.7628,0.819
max,13.8095,13.8095,13.2787,14.0238,13.6129,13.6129,15.601,15.601,12.6033,12.6033,...,13.2789,13.2789,17.0671,17.0671,15.0147,15.0147,13.1543,13.4563,14.5199,14.5199


Normalized W2V feature


Unnamed: 0,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,FN1_wv_2,FN2_wv_2,FN1_wv_3,FN2_wv_3,FN1_wv_4,FN2_wv_4,...,FN1_wv_95,FN2_wv_95,FN1_wv_96,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99
count,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,...,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0,489736.0
mean,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,...,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-4.0079,-4.579,-4.6111,-5.1633,-3.4107,-3.4074,-4.2958,-4.8287,-6.0878,-6.9205,...,-4.7129,-5.3587,-3.9051,-3.8806,-3.5781,-4.2775,-4.6058,-5.5921,-3.2952,-3.3206
25%,-0.6358,-0.6539,-0.6004,-0.6225,-0.6508,-0.6899,-0.53,-0.5951,-0.5682,-0.6261,...,-0.6143,-0.629,-0.6676,-0.6631,-0.5628,-0.5655,-0.6894,-0.6241,-0.6813,-0.6639
50%,0.0548,-0.0342,0.1108,0.127,-0.1097,-0.1064,0.1039,0.0326,0.0123,0.0195,...,0.0901,0.1165,0.0091,-0.0733,-0.0486,-0.0627,-0.1015,-0.082,0.0779,-0.0032
75%,0.5962,0.5985,0.739,0.676,0.5547,0.5511,0.6157,0.7074,0.6065,0.6151,...,0.6549,0.703,0.7,0.6767,0.5592,0.6156,0.7408,0.7129,0.5976,0.6137
max,4.6236,5.2602,3.5114,3.8362,4.7289,4.7185,4.2141,4.4155,3.4185,4.0964,...,3.5198,3.944,5.5383,5.6276,3.8626,4.8696,3.9497,4.6217,4.8146,4.8395


(5) Experiment - Train feature combinations for RFC (random forest classifier)

(5-1 A) Training for all feature combinations - FB resampled dataset

In [None]:
# Train the birth-year model for every feature combination on the FB dataset and
# write the results under ./Training/FB/.
# `head`, `tail` and BIRTH_YEAR_BASE come from the merge cell in section (3).
birth_year_base = BIRTH_YEAR_BASE + head * 5
# Bucket indices head..tail; with head=7, tail=13 and base 1940 these are the
# 5-year buckets starting at 1975 ... 2005.
birth_years = list(range(head, tail + 1))
# Size of the smallest birth-year class (presumably used to balance classes — confirm
# in train_birth_year_model).
sample_number = name_df["BirthYear"].value_counts().min()
# One label per bucket: the first calendar year of each 5-year bucket.
target_names = [str(year) for year in range(birth_year_base, BIRTH_YEAR_BASE + tail * 5 + 5, 5)]
# `path_lib` is the pathlib.Path alias imported explicitly at the top of this notebook.
save_path = path_lib("./Training/FB/")
validation_times = 3
do_first_name_augmentation = False
model_name = "FB_RFC"
estimators_num = 16
min_samples_leaf_num = 1

train_birth_year_model(name_df, do_first_name_augmentation, validation_times,
                       feature_combinations, BIRTH_YEAR_BASE, target_names,
                       save_path, model_name, birth_years, sample_number,
                       estimators_num, min_samples_leaf_num)

(5-1 B) Training for all feature combinations - FB base sampled dataset

In [None]:
# Train the birth-year model for every feature combination on the FB base sampled
# dataset and write the results under ./Training/FB_Base/.
# `head`, `tail` and BIRTH_YEAR_BASE come from the merge cell in section (3).
birth_year_base = BIRTH_YEAR_BASE + head * 5
# Bucket indices head..tail; with head=7, tail=13 and base 1940 these are the
# 5-year buckets starting at 1975 ... 2005.
birth_years = list(range(head, tail + 1))
# Size of the smallest birth-year class (presumably used to balance classes — confirm
# in train_birth_year_model).
sample_number = name_df["BirthYear"].value_counts().min()
# One label per bucket: the first calendar year of each 5-year bucket.
target_names = [str(year) for year in range(birth_year_base, BIRTH_YEAR_BASE + tail * 5 + 5, 5)]
# `path_lib` is the pathlib.Path alias imported explicitly at the top of this notebook.
save_path = path_lib("./Training/FB_Base/")
validation_times = 3
do_first_name_augmentation = False
model_name = "FB_RFC"
estimators_num = 64
min_samples_leaf_num = 1

train_birth_year_model(name_df, do_first_name_augmentation, validation_times,
                       feature_combinations, BIRTH_YEAR_BASE, target_names,
                       save_path, model_name, birth_years, sample_number,
                       estimators_num, min_samples_leaf_num)