(1) Import libraries

In [1]:
from collections import Counter
import numpy as np
import pandas as pd
from pathlib import Path as path_lib
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn import metrics

from name_module.preprocess import *
from name_module.share_lib import reduce_mem_usage, restore_df_dtypes
from name_module.training_module import *


# Render every float in DataFrame output with thousands separators and four decimals.
pd.set_option("display.float_format", "{:,.4f}".format)

(2) Read files and construct features for the name classifier

(2-1A) Read raw Taiwanese dataset

In [2]:
# Branch (2-1A): locate and load the raw Taiwanese real-name CSV, then preview it.
# `path_lib` is the pathlib.Path alias imported in the first cell. The previous
# `plib_path` name is not imported explicitly anywhere in this notebook and only
# resolved if one of the wildcard imports happened to export it.
data_path = path_lib("./name_data/")
file_name = "Real_Name_data.csv"
name_df = pd.read_csv(data_path / file_name)
name_df.head()

Unnamed: 0,name,BirthYear,FirstName,LastName,gender,message,userID
0,丁承先,0,承先,丁,1,1940,
1,丁昞原,0,昞原,丁,1,1940,
2,方九龍,0,九龍,方,1,1940,
3,方大錚,0,大錚,方,1,1940,
4,方文隆,0,文隆,方,1,1940,


In [3]:
# Clean the raw frame and attach the engineered features. Per the log printed below,
# this step filters on a numeric `message`, English names, surnames outside the Taiwan
# surname list and first names longer than 3, then adds W2V, phonetic and fortune-map
# features.
# NOTE(review): `name_df` is rebound to the result, so re-running this cell without
# re-running the read cell feeds already-preprocessed data back into `preprocess`.
name_df = preprocess(name_df, data_path, file_name)

Drop Message is not number:  1660848 -> 1660848  drop: 0
Drop English name: from  1660848 -> 1653057  drop: 7791
Drop last name is not in Taiwan last name list : 1653057 -> 1653057  drop: 0
Drop First name is longer than 3  : 1653057 -> 1653057  drop: 0
Add W2V feature
Memory usage of properties dataframe is : 2623.2593994140625  MB
Memory usage is:  1330.547435760498  MB
This is  50.72115384615385 % of the initial size
w2v_feature len 200
Add phonetic feature
phonetic_feature len: 322
Add fortune map feature
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  嘺 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  厙 get stroke count failed!
Error:  厙 get stroke count failed!
Error:  禤 get stroke count failed!
Error:  禢 get stroke count fai

In [4]:
# Load the pre-trained gender model and the feature object saved alongside it
# (presumably the list of input column names the model expects - confirm).
# SECURITY: unpickling can execute arbitrary code; only load files under
# ./TrainedModel/ that were produced by this project.
gender_model_name = "WPF_gender_RFC_model.pkl"
with open('./TrainedModel/{}'.format(gender_model_name), 'rb') as handle:
    # pickle.load reads straight from the handle instead of first copying the
    # entire file into an intermediate bytes object.
    gender_model = pickle.load(handle)
gender_x_feature_name = "WPF_gender_RFC_feature.pkl"
with open('./TrainedModel/{}'.format(gender_x_feature_name), 'rb') as handle:
    gender_x_feature = pickle.load(handle)

In [5]:
# Score the names with the pre-trained gender model. The preview below shows two
# trailing columns, `Male_prob` and `Female_prob`, after this call.
name_df = add_gender_feature(name_df, gender_model, gender_x_feature)
name_df.head()

Unnamed: 0,name,BirthYear,FirstName,LastName,gender,message,userID,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_鼠,Zodiac_龍,FN1_Vowel_,FN1_Vowel_e,FN1_Vowel_en,FN1_Vowel_uǎ,FN2_Vowel_en,FN2_Vowel_uǎi,Male_prob,Female_prob
0,丁承先,0,承先,丁,1,1940,,-0.0828,-5.7964,4.7822,...,0,1,0,0,0,0,0,0,0.5938,0.4062
1,丁昞原,0,昞原,丁,1,1940,,-1.4421,0.382,1.2154,...,0,1,0,0,0,0,0,0,0.6094,0.3906
2,方九龍,0,九龍,方,1,1940,,3.7893,-2.9803,-1.8931,...,0,1,0,0,0,0,0,0,0.4688,0.5312
3,方大錚,0,大錚,方,1,1940,,3.8134,-4.7197,-0.9371,...,0,1,0,0,0,0,0,0,0.5,0.5
4,方文隆,0,文隆,方,1,1940,,2.7781,3.8124,0.4082,...,0,1,0,0,0,0,0,0,0.4688,0.5312


(2-1B) Read sampled dataset as base for comparing methods 

In [12]:
# Branch (2-1B): load the previously sampled frame together with its saved
# train/test index objects. This overwrites any `name_df` built in branch (2-1A).
# SECURITY: unpickling can execute arbitrary code; only load files created by this project.
# pickle.load reads from the handle directly; the former loads(handle.read()) held the
# raw bytes and the rebuilt object in memory at once (the frame alone is reported as
# 1.2+ GB by name_df.info() further down).
with open('./NameData/thesis_experiment/TaiwanNames_real_Name/final_real_name_df', 'rb') as handle:
    name_df = pickle.load(handle)
with open('./NameData/thesis_experiment/TaiwanNames_real_Name/test_index', 'rb') as handle:
    test_index = pickle.load(handle)
with open('./NameData/thesis_experiment/TaiwanNames_real_Name/train_index', 'rb') as handle:
    train_index = pickle.load(handle)

In [13]:
# Preview the sampled frame (the rendered output is truncated to the first and last rows).
name_df

Unnamed: 0,FirstName,message,name,FN1,FN2,FN1_muin,FN2_muin,BirthYear,GuessedGender,BFN12P,...,FirstName1_ratio_6,FirstName2_ratio_6,FirstName1_ratio_7,FirstName2_ratio_7,FirstName1_ratio_8,FirstName2_ratio_8,FirstName1_ratio_9,FirstName2_ratio_9,FirstName1_ratio_10,FirstName2_ratio_10
0,秀珠,1947,吳秀珠,70,596,iù,ū,1,0,1,...,0.0148,0.0050,0.0052,0.0010,0.0044,0.0006,0.0021,0.0002,0.0016,0.0001
1,豐博,1945,林豐博,277,72,ēng,ó,1,1,6,...,0.0024,0.0011,0.0008,0.0010,0.0010,0.0022,0.0007,0.0025,0.0007,0.0033
2,義勝,1945,康義勝,650,156,ì,èng,1,1,1,...,0.0032,0.0032,0.0017,0.0010,0.0015,0.0023,0.0009,0.0017,0.0007,0.0012
3,麗,1946,吳麗,-1,203,-1,ì,1,0,6,...,0.0027,0.0151,0.0015,0.0022,0.0043,0.0022,0.0073,0.0013,0.0125,0.0009
5,世慶,1947,林世慶,537,269,ì,ìng,1,1,6,...,0.0046,0.0041,0.0042,0.0014,0.0033,0.0024,0.0022,0.0018,0.0016,0.0019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320775,月鳳,1974,陳月鳳,706,598,uè,èng,6,0,6,...,0.0040,0.0051,0.0011,0.0012,0.0008,0.0016,0.0005,0.0009,0.0003,0.0005
320776,麗君,1974,吳麗君,203,153,ì,ūn,6,0,6,...,0.0151,0.0062,0.0022,0.0080,0.0022,0.0131,0.0013,0.0098,0.0009,0.0077
320777,玉淑,1974,陳玉淑,211,348,ù,ú,6,0,6,...,0.0149,0.0225,0.0070,0.0078,0.0060,0.0060,0.0031,0.0032,0.0026,0.0015
320778,明珠,1974,陳明珠,159,596,íng,ū,6,0,1,...,0.0165,0.0050,0.0121,0.0010,0.0090,0.0006,0.0058,0.0002,0.0046,0.0001


In [87]:
# Inspect the saved test split: an Int64Index of 90730 labels in the output below
# (presumably row labels of `name_df` - confirm against the code that created it).
test_index

Int64Index([139458, 160569, 273443, 210241, 276709, 236523, 118678, 173062,
            189727, 265777,
            ...
            270402, 126974,   5332, 117560,  65924, 148607,  58561, 259018,
             89498, 194784],
           dtype='int64', length=90730)

In [14]:
# Restore column dtypes on the unpickled frame and print the dtype/memory summary.
# The keyword names indicate the target dtype for each column group: int8 for the
# Phonetic/Fortune_map/Zodiac/Radical feature columns, int64 for the listed
# label-like columns, float32 for the W2V columns.
# NOTE(review): "BirthYear " (with a trailing space) is listed next to 'BirthYear' -
# confirm whether that is a legacy column name in the pickled frame or a typo.
# NOTE(review): get_x_feature receives the DataFrame here but `name_df.columns` in the
# W2V normalisation cell - confirm both call styles are supported.
name_df = restore_df_dtypes(df=name_df,
                  int8_col=get_x_feature(['Phonetic','Fortune_map','Zodiac','Radical'], name_df),
                  int64_col=['BirthYear', 'message', 'gender', "BirthYear "],
                  float32_col=get_x_feature(['W2V'], name_df))
name_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302432 entries, 0 to 320779
Columns: 3355 entries, FirstName to FirstName2_ratio_10
dtypes: float32(200), float64(22), int64(6), int8(31), object(8), uint8(3088)
memory usage: 1.2+ GB


In [15]:
# Add the zodiac features; the call also returns a feature list whose length the
# log below reports as 12.
name_df, zodiac_feature_list = add_zodiac_feature(name_df)
# Presumably maps legacy column names in the pickled frame to the current naming
# scheme - confirm in name_module.
name_df = rename_old_name_df_dict(name_df)

len of Zodiac_feature_list:  12


(3) Resample dataset 

In [7]:
# Branch (2-1A): number of rows per BirthYear value, most frequent first.
Counter(name_df.BirthYear.tolist()).most_common()

[(9, 441855),
 (8, 384205),
 (10, 282646),
 (7, 260573),
 (3, 60274),
 (4, 56392),
 (2, 54838),
 (5, 32236),
 (1, 31419),
 (6, 24279),
 (0, 15678),
 (11, 6620),
 (12, 2037),
 (13, 5)]

In [None]:
# Branch (2-1B): number of rows per BirthYear value, most frequent first.
# NOTE(review): this cell has no execution count or output in the saved notebook.
Counter(name_df.BirthYear.tolist()).most_common()

In [8]:
# BirthYear holds bucket indices rather than calendar years; `head` and `tail` are the
# lowest and highest bucket kept. Comparing the counts printed before and after this
# cell, buckets below `head` end up in `head` (0 and 1 -> 1) and buckets above `tail`
# end up in `tail` (10..13 -> 10).
head = 1
tail = 10
BIRTH_YEAR_BASE = 1940
# Calendar year that corresponds to the first kept bucket (5 years per bucket).
birth_year_base = BIRTH_YEAR_BASE + 5 * head
# Pass head/tail as extra positional arguments rather than through a lambda wrapper.
name_df["BirthYear"] = name_df["BirthYear"].apply(merge_birth_year, args=(head, tail))

In [9]:
# Verify the merge: only buckets 1..10 remain in the output below.
name_df["BirthYear"].value_counts()

9     441855
8     384205
10    291308
7     260573
3      60274
4      56392
2      54838
1      47097
5      32236
6      24279
Name: BirthYear, dtype: int64

(4) Preparing for model training

(4-1) Make Feature Combinations

In [10]:
# Feature categories to combine for the birth-year experiments.
feature_list = ['W2V', 'Phonetic','Fortune_map','Zodiac','Radical']
# Same categories plus 'uni-gram'.
# NOTE(review): not referenced by any cell visible in this notebook.
feature_list_gender = ['W2V', 'Phonetic','Fortune_map','Zodiac','Radical','uni-gram']
# Combinations of the categories to train on. The results table further down labels
# them by initials (e.g. 'W', 'PFZR', 'WPFZR'); presumably every non-empty subset - confirm.
feature_combinations = get_all_combinations(feature_list)

(4-2) Normalize w2v

In [11]:
# Collect the W2V column names, then normalise those columns and show summary
# statistics before and after (after: mean 0 and std 1 in the output below).
# `display` is the rich-display helper that notebook kernels provide by default.
# NOTE(review): `name_df` is overwritten, so the un-normalised values are not kept.
w2v_feature = get_x_feature(['W2V'], name_df.columns)
print("Unnormalized W2V feature")
display(name_df[w2v_feature].describe())
name_df = w2v_normalize(name_df, w2v_feature)
print("Normalized W2V feature")
display(name_df[w2v_feature].describe())

Unnormalized W2V feature


Unnamed: 0,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,FN1_wv_2,FN2_wv_2,FN1_wv_3,FN2_wv_3,FN1_wv_4,FN2_wv_4,...,FN1_wv_95,FN2_wv_95,FN1_wv_96,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99
count,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,...,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0
mean,-1.4566,-1.3313,0.5142,1.2962,-1.4798,-1.5367,0.974,1.6704,0.9617,0.5972,...,0.9661,1.0064,-0.9295,-1.1563,0.8766,0.6219,1.1213,0.8172,-1.2455,-1.1376
std,3.5062,3.0327,3.5641,3.2839,3.1358,3.1219,3.3813,3.1398,3.4673,2.9523,...,3.5192,3.0769,3.3483,3.1693,3.7782,2.9574,3.0133,2.7325,3.3068,3.2475
min,-14.5524,-14.5524,-15.1554,-15.1554,-12.1204,-12.1204,-13.0599,-13.0599,-19.8135,-19.8135,...,-15.676,-15.676,-13.2748,-13.2748,-12.7216,-12.7216,-14.7658,-14.7658,-11.9364,-11.9364
25%,-3.9635,-3.2677,-1.9196,-0.8918,-3.3328,-3.6185,-1.0196,-0.2725,-0.87,-1.1289,...,-1.0049,-0.9547,-3.2001,-3.3835,-1.2209,-1.1163,-1.0604,-0.8447,-3.6159,-3.2902
50%,-1.2606,-1.4382,0.8509,1.652,-1.7088,-1.7895,1.3017,1.8901,1.0117,0.8402,...,1.3309,1.5669,-1.0051,-1.4055,0.6738,0.4797,0.9386,0.637,-1.2016,-1.2813
75%,0.6026,0.5287,3.1754,3.7534,0.1622,0.2574,3.2359,3.8895,3.0144,2.3995,...,3.3303,3.1105,1.2681,0.8089,2.9024,2.4191,3.2944,2.7267,0.6677,0.8745
max,14.4414,13.8095,14.0238,14.6112,13.6129,13.6129,15.601,15.601,12.6033,12.6033,...,13.2789,13.2789,17.0671,17.0671,15.0147,15.0147,13.4563,13.4563,14.5199,14.5199


Normalized W2V feature


Unnamed: 0,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,FN1_wv_2,FN2_wv_2,FN1_wv_3,FN2_wv_3,FN1_wv_4,FN2_wv_4,...,FN1_wv_95,FN2_wv_95,FN1_wv_96,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99
count,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,...,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0,1653057.0
mean,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.735,-4.3596,-4.3965,-5.0098,-3.3933,-3.3901,-4.1505,-4.6915,-5.9918,-6.9135,...,-4.7289,-5.4218,-3.687,-3.8237,-3.5991,-4.5119,-5.2722,-5.7029,-3.233,-3.3253
25%,-0.715,-0.6385,-0.6829,-0.6663,-0.5909,-0.6668,-0.5896,-0.6188,-0.5283,-0.5847,...,-0.5601,-0.6374,-0.6781,-0.7027,-0.5552,-0.5877,-0.724,-0.6082,-0.7168,-0.6628
50%,0.0559,-0.0353,0.0945,0.1083,-0.073,-0.081,0.0969,0.07,0.0144,0.0823,...,0.1036,0.1822,-0.0226,-0.0786,-0.0537,-0.0481,-0.0606,-0.066,0.0133,-0.0442
75%,0.5873,0.6133,0.7467,0.7483,0.5236,0.5747,0.669,0.7068,0.592,0.6105,...,0.6718,0.6838,0.6563,0.6201,0.5362,0.6077,0.7212,0.6988,0.5786,0.6196
max,4.5342,4.9926,3.7905,4.0547,4.813,4.8527,4.3259,4.4368,3.3575,4.0667,...,3.4987,3.9886,5.3748,5.7499,3.7421,4.8667,4.0935,4.6255,4.7676,4.8215


(5) Experiment - Train feature combination for RFC

(5-1A) Training for all combinations - resampled dataset

In [19]:
# (5-1A) Train one model per feature combination on a class-balanced resample of
# `name_df` and report train/test/development metrics for each.
birth_year_base = BIRTH_YEAR_BASE + head * 5
birth_years = list(range(head, tail + 1))  # bucket indices, i.e. 1945 ~ 1994
# sample_number = name_df["BirthYear"].value_counts().min()
# Rows drawn per bucket. NOTE(review): 32236 is hard-coded and is larger than the
# smallest bucket (24279) in the (2-1A) counts shown earlier - confirm it is intended.
sample_number = 32236
# Class labels for the reports: the first calendar year of each bucket, "1945" .. "1990".
target_names = [str(year) for year in range(birth_year_base, BIRTH_YEAR_BASE + tail * 5 + 5, 5)]
# `path_lib` is the pathlib.Path alias imported in the first cell; `plib_path` is not
# imported explicitly by this notebook.
save_path = path_lib("./Training/Real/")
validation_times = 1
do_first_name_augmentation = False
estimators_num = 64
min_samples_leaf_num = 1
model_name = "Taiwan_name_RFC"
# Keep the returned metrics under a name so they survive beyond the cell's Out[] entry.
resampled_results = train_birth_year_model(
    name_df, do_first_name_augmentation, validation_times,
    feature_combinations, BIRTH_YEAR_BASE, target_names,
    save_path, model_name, birth_years, sample_number,
    estimators_num, min_samples_leaf_num,
)
resampled_results

(322360, 988) 10
Combination 0 Training feature category: ['W2V']
len of x_feature: 200
estimators_num =  64 min_samples_leaf_num =  1 Training Data len =  203086
Finished training
Train metrics
Train_x len = 203086
Finished prdeiction
Accuracy ::  0.5957820824675261
macro precision_score ::  0.5975
micro precision_score ::  0.5958
macro recall_score ::  0.5958
micro recall_score ::  0.5958
macro F1_score ::  0.5958
micro F1_score ::  0.5958
dataset 有203086個名字 74315種名字
Multi_Train Accuracy:: 0.9995962301685001
平均年份絕對值誤差:: 0.00408693853835321
report:
               precision    recall  f1-score   support

        1945       0.65      0.67      0.66     20182
        1950       0.60      0.62      0.61     20245
        1955       0.61      0.58      0.60     20449
        1960       0.57      0.58      0.58     20240
        1965       0.54      0.58      0.56     20280
        1970       0.52      0.61      0.56     20327
        1975       0.57      0.53      0.55     20353
        19

  _warn_prf(average, modifier, msg_start, len(result))


macro recall_score ::  0.2812
micro recall_score ::  0.2811
macro F1_score ::  0.22
micro F1_score ::  0.2811
dataset 有203086個名字 74456種名字
Multi_Train Accuracy:: 0.48639000226505025
平均年份絕對值誤差:: 7.097938804250416


  _warn_prf(average, modifier, msg_start, len(result))


report:
               precision    recall  f1-score   support

        1945       0.00      0.00      0.00     20342
        1950       0.00      0.00      0.00     20222
        1955       0.24      0.21      0.23     20216
        1960       0.29      0.63      0.40     20277
        1965       0.27      0.52      0.36     20319
        1970       0.28      0.21      0.24     20406
        1975       0.32      0.52      0.39     20278
        1980       0.00      0.00      0.00     20430
        1985       0.25      0.19      0.22     20242
        1990       0.28      0.52      0.36     20354

    accuracy                           0.28    203086
   macro avg       0.19      0.28      0.22    203086
weighted avg       0.19      0.28      0.22    203086

Test metrics
Test_x len = 203086
Finished prdeiction
Accuracy ::  0.28072795790344446
macro precision_score ::  0.1932
micro precision_score ::  0.2807
macro recall_score ::  0.2799
micro recall_score ::  0.2807


  _warn_prf(average, modifier, msg_start, len(result))


macro F1_score ::  0.2197
micro F1_score ::  0.2807
dataset 有87038個名字 43192種名字
Multi_Test Accuracy:: 0.42668719409912914
平均年份絕對值誤差:: 8.380626852639077
report:
               precision    recall  f1-score   support

        1945       0.00      0.00      0.00      8732
        1950       0.00      0.00      0.00      8755
        1955       0.24      0.21      0.22      8750
        1960       0.30      0.63      0.40      8775
        1965       0.27      0.51      0.35      8750
        1970       0.27      0.20      0.23      8558
        1975       0.32      0.52      0.40      8806
        1980       0.00      0.00      0.00      8539
        1985       0.27      0.21      0.24      8733
        1990       0.27      0.52      0.35      8640

    accuracy                           0.28     87038
   macro avg       0.19      0.28      0.22     87038
weighted avg       0.19      0.28      0.22     87038

Development metrics
Development_x len = 203086
Finished prdeiction


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy ::  0.27940811515076314
macro precision_score ::  0.1926
micro precision_score ::  0.2794
macro recall_score ::  0.2811
micro recall_score ::  0.2794
macro F1_score ::  0.2192
micro F1_score ::  0.2794
dataset 有32236個名字 21084種名字
Multi_Development Accuracy:: 0.3618314927410349
平均年份絕對值誤差:: 10.012656657153492
report:
               precision    recall  f1-score   support

        1945       0.00      0.00      0.00      3162
        1950       0.00      0.00      0.00      3259
        1955       0.24      0.22      0.23      3270
        1960       0.28      0.63      0.39      3184
        1965       0.27      0.52      0.36      3167
        1970       0.28      0.21      0.24      3272
        1975       0.32      0.53      0.40      3152
        1980       0.00      0.00      0.00      3267
        1985       0.26      0.19      0.22      3261
        1990       0.28      0.52      0.36      3242

    accuracy                           0.28     32236
   macro avg       0.19 

  _warn_prf(average, modifier, msg_start, len(result))


(322360, 988) 10
Combination 4 Training feature category: ['Radical']
len of x_feature: 410
estimators_num =  64 min_samples_leaf_num =  1 Training Data len =  203086
Finished training
Train metrics
Train_x len = 203086
Finished prdeiction
Accuracy ::  0.2551776094856366
macro precision_score ::  0.2539
micro precision_score ::  0.2552
macro recall_score ::  0.2552
micro recall_score ::  0.2552
macro F1_score ::  0.2538
micro F1_score ::  0.2552
dataset 有203086個名字 74678種名字
Multi_Train Accuracy:: 0.6064622869129335
平均年份絕對值誤差:: 5.057000482554189
report:
               precision    recall  f1-score   support

        1945       0.29      0.33      0.31     20299
        1950       0.26      0.26      0.26     20369
        1955       0.25      0.22      0.24     20143
        1960       0.25      0.23      0.24     20444
        1965       0.25      0.25      0.25     20236
        1970       0.24      0.25      0.24     20285
        1975       0.23      0.20      0.21     20322
        

(           Type feature  lens  accuracy  multi_ans_accuracy  avg_dist  \
 0         Train       W   200    0.5958              0.9996    0.0041   
 1          Test       W   200    0.3025              0.6078    3.9646   
 2   Development       W   200    0.3002              0.4962    5.1456   
 3         Train       P   328    0.4384              0.8231    2.0349   
 4          Test       P   328    0.2490              0.5353    5.1962   
 ..          ...     ...   ...       ...                 ...       ...   
 88         Test    PFZR   777    0.5765              0.7057    3.9234   
 89  Development    PFZR   777    0.5710              0.6518    4.8197   
 90        Train   WPFZR   977    0.9642              1.0000    0.0000   
 91         Test   WPFZR   977    0.5706              0.7238    3.4305   
 92  Development   WPFZR   977    0.5673              0.6622    4.2889   
 
     macro_precision  micro_precision  macro_recall  micro_recall  macro_F1  \
 0            0.5975           

In [20]:
# Draw `sample_number` rows for each bucket in `birth_years` and report the shape;
# the printed shape matches the one logged by the training run above.
# NOTE(review): the meaning of the trailing `True` argument is not visible here -
# presumably sampling with replacement, since every bucket reaches 32236 rows; confirm.
sampled_df = sample_name_df(name_df, sample_number, birth_years, True)
print("Dataset shape", sampled_df.shape, len(birth_years))

Dataset shape (322360, 988) 10


In [25]:
# Confirm the resample is balanced: every bucket shows 32236 rows in the output below.
sampled_df['BirthYear'].value_counts()

10    32236
9     32236
8     32236
7     32236
6     32236
5     32236
4     32236
3     32236
2     32236
1     32236
Name: BirthYear, dtype: int64

(5-2B) Training for all combinations - baseline dataset

In [21]:
# (5-2B) Train one model per feature combination on the baseline (2-1B) dataset,
# sampling every bucket down to the size of the smallest one.
# NOTE(review): the saved run of this cell ended with
# "ValueError: at least one array or dtype is required". That message is typically
# produced when an estimator is handed a frame with no columns - check that every
# feature combination resolves to at least one column in the baseline frame.
birth_year_base = BIRTH_YEAR_BASE + head * 5
birth_years = list(range(head, tail + 1))  # bucket indices, i.e. 1945 ~ 1994
sample_number = name_df["BirthYear"].value_counts().min()
# Class labels for the reports: the first calendar year of each bucket, "1945" .. "1990".
target_names = [str(year) for year in range(birth_year_base, BIRTH_YEAR_BASE + tail * 5 + 5, 5)]
# `path_lib` is the pathlib.Path alias imported in the first cell; `plib_path` is not
# imported explicitly by this notebook.
save_path = path_lib("./Training/Real_Base/")
validation_times = 3
do_first_name_augmentation = False
model_name = "Taiwan_name_RFC"
estimators_num = 64
min_samples_leaf_num = 1

# Keep the returned metrics under a name so they survive beyond the cell's Out[] entry.
baseline_results = train_birth_year_model(
    name_df, do_first_name_augmentation, validation_times,
    feature_combinations, BIRTH_YEAR_BASE, target_names,
    save_path, model_name, birth_years, sample_number,
    estimators_num, min_samples_leaf_num,
)
baseline_results

(294470, 3367) 10
Combination 0 Training feature category: ['W2V']
len of x_feature: 200
estimators_num =  64 min_samples_leaf_num =  1 Training Data len =  185516
Finished training
Train metrics
Train_x len = 185516
Finished prdeiction
Accuracy ::  0.5957976670475862
macro precision_score ::  0.5963
micro precision_score ::  0.5958
macro recall_score ::  0.5958
micro recall_score ::  0.5958
macro F1_score ::  0.5958
micro F1_score ::  0.5958
dataset 有185516個名字 57623種名字
Multi_Train Accuracy:: 0.9997682140623989
平均年份絕對值誤差:: 0.004382371331852778
report:
               precision    recall  f1-score   support

        1945       0.61      0.64      0.62     18581
        1950       0.60      0.60      0.60     18488
        1955       0.58      0.58      0.58     18529
        1960       0.58      0.54      0.56     18545
        1965       0.53      0.55      0.54     18604
        1970       0.53      0.55      0.54     18645
        1975       0.59      0.57      0.58     18520
        

ValueError: at least one array or dtype is required