(1) Import library

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path as path_lib
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from name_module.preprocess import *
from name_module.share_lib import reduce_mem_usage, restore_df_dtypes
from name_module.training_module import *
pd.options.display.float_format = '{:,.4f}'.format


(2) Preprocess or Read featured Files 

(preprocess)

In [4]:
from name_module.preprocess import preprocess
data_path = plib_path("./name_data/")
file_name = "name_with_gender_data.csv"
name_df = pd.read_csv(data_path / file_name)
name_df = preprocess(name_df, data_path, file_name)

Drop Message is not number:  138778 -> 138778  drop: 0
Drop English name: from  138778 -> 138621  drop: 157
Drop last name is not in Taiwan last name list : 138621 -> 138550  drop: 71
Drop First name is longer than 3  : 138550 -> 138501  drop: 49
Add W2V feature
Memory usage of properties dataframe is : 219.7891845703125  MB
Memory usage is:  111.47961044311523  MB
This is  50.72115384615385 % of the initial size
w2v_feature len 200
Add phonetic feature
phonetic_feature len: 323
Add fortune map feature
len on fortune_map_feature_list: 27
Add radical feature
len of Radical_feature_list:  401
Add zodiac feature
len of Zodiac_feature_list:  12
Memory usage of properties dataframe is : 214.37370586395264  MB
Memory usage is:  212.78868770599365  MB
This is  99.26062846580406 % of the initial size


Read featured Files

In [5]:
path = './NameData/name_with_gender_data_featured.csv'
name_df = pd.read_csv(path, dtype='str')

In [6]:
display(name_df.head())
name_df.info()

Unnamed: 0,name,gender,message,userID,BirthYear,LastName,FirstName,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_狗,Zodiac_猴,Zodiac_羊,Zodiac_虎,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍
0,丁承先,1,1940,103845999999999,0,丁,承先,-0.08277837,-5.796417,4.7821665,...,0,0,0,0,0,0,0,0,0,1
1,丁昞原,1,1940,103845999999999,0,丁,昞原,-1.4421005,0.3819639,1.2154223,...,0,0,0,0,0,0,0,0,0,1
2,方超,1,1940,103845999999999,0,方,超,-0.24849509,-3.5968602,0.29075345,...,0,0,0,0,0,0,0,0,0,1
3,方九龍,1,1940,103845999999999,0,方,九龍,3.7893195,-2.9803402,-1.8931139,...,0,0,0,0,0,0,0,0,0,1
4,方大錚,1,1940,103845999999999,0,方,大錚,3.8133733,-4.719679,-0.9371139,...,0,0,0,0,0,0,0,0,0,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138501 entries, 0 to 138500
Columns: 972 entries, name to Zodiac_龍
dtypes: object(972)
memory usage: 1.0+ GB


In [7]:
name_df = restore_df_dtypes(df=name_df,
                  int8_col=get_x_feature(['Phonetic','Fortune_map','Zodiac','Radical'], name_df),
                  int64_col=['BirthYear', 'message', 'gender', "BirthYear "],
                  float32_col=get_x_feature(['W2V'], name_df)
                  )
name_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138501 entries, 0 to 138500
Columns: 972 entries, name to Zodiac_龍
dtypes: float32(200), int64(3), int8(763), object(6)
memory usage: 216.0+ MB


In [8]:
name_df.select_dtypes('object')

Unnamed: 0,name,userID,LastName,FirstName,FN1,FN2
0,丁承先,103845999999999,丁,承先,256,908
1,丁昞原,103845999999999,丁,昞原,3763,702
2,方超,103845999999999,方,超,-1,884
3,方九龍,103845999999999,方,九龍,987,69
4,方大錚,103845999999999,方,大錚,755,1041
...,...,...,...,...,...,...
138496,程麗庭,259329000000000,程,麗庭,203,46
138497,許家榛,505552000000000,許,家榛,29,793
138498,田崑成,138986000000000,田,崑成,1158,113
138499,筱欣,1282160000000000,筱,欣,-1,78


(3) Prepare feature for learning

In [9]:
name_df[["gender","FirstName"]].describe()

Unnamed: 0,gender
count,138501.0
mean,0.5864
std,0.4925
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


(3-1) Make Feature Combinations

In [10]:
feature_list = ['W2V', 'Phonetic','Fortune_map','Zodiac','Radical']
feature_list_gender = ['W2V', 'Phonetic','Fortune_map','Zodiac','Radical','uni-gram']

In [11]:
feature_combinations_gender = get_all_combinations(feature_list)
print(feature_combinations_gender[:8])

[['W2V'], ['Phonetic'], ['Fortune_map'], ['Zodiac'], ['Radical'], ['W2V', 'Phonetic'], ['W2V', 'Fortune_map'], ['W2V', 'Zodiac']]


(3-2) Add most gender as y feature

In [12]:
sampled_df = name_df

In [13]:
name_gender_dict = name_gender_count(sampled_df, random_gender_for_same_count=True)
# print(name_gender_dict)

In [15]:
sampled_df['mgender'] = sampled_df.FirstName.apply(lambda name: add_most_gender(name, name_gender_dict))

In [16]:
sampled_df.head()

Unnamed: 0,name,gender,message,userID,BirthYear,LastName,FirstName,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_猴,Zodiac_羊,Zodiac_虎,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍,mgender
0,丁承先,1,1940,103845999999999,0,丁,承先,-0.0828,-5.7964,4.7822,...,0,0,0,0,0,0,0,0,1,1
1,丁昞原,1,1940,103845999999999,0,丁,昞原,-1.4421,0.382,1.2154,...,0,0,0,0,0,0,0,0,1,1
2,方超,1,1940,103845999999999,0,方,超,-0.2485,-3.5969,0.2908,...,0,0,0,0,0,0,0,0,1,1
3,方九龍,1,1940,103845999999999,0,方,九龍,3.7893,-2.9803,-1.8931,...,0,0,0,0,0,0,0,0,1,1
4,方大錚,1,1940,103845999999999,0,方,大錚,3.8134,-4.7197,-0.9371,...,0,0,0,0,0,0,0,0,1,1


In [17]:
sampled_df.gender.value_counts()

1    81217
0    57284
Name: gender, dtype: int64

In [18]:
sampled_df.mgender.value_counts()

1    81501
0    57000
Name: mgender, dtype: int64

(3-3) Normalize w2v

In [19]:
w2v_feature = get_x_feature(['W2V'], name_df.columns)
print("Unnormalized W2V feature")
display(sampled_df[w2v_feature].describe())
sampled_df = w2v_normalize(sampled_df, w2v_feature)
print("Normalized W2V feature")
display(sampled_df[w2v_feature].describe())

Unnormalized W2V feature


Unnamed: 0,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,FN1_wv_2,FN2_wv_2,FN1_wv_3,FN2_wv_3,FN1_wv_4,FN2_wv_4,...,FN1_wv_95,FN2_wv_95,FN1_wv_96,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99
count,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,...,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0
mean,-1.1645,-1.1429,0.3526,0.9852,-1.2973,-1.2855,0.9782,1.6493,0.9413,0.6401,...,0.8607,0.8552,-0.8138,-1.0272,0.6335,0.4953,0.9766,0.8603,-1.1736,-1.0593
std,3.4881,3.3237,3.4739,3.3374,3.0948,3.2001,3.2964,3.2597,3.35,3.2174,...,3.3469,3.1373,3.3291,3.1589,3.5051,3.0882,2.8938,2.8273,3.3708,3.3696
min,-14.5524,-14.5524,-15.1554,-15.1554,-10.7705,-12.1204,-13.0599,-13.0599,-19.8135,-19.8135,...,-13.6717,-15.676,-11.8404,-13.2748,-10.9608,-10.6345,-13.0968,-13.0968,-11.7162,-11.9364
25%,-3.4284,-3.2677,-1.9434,-1.2865,-3.0247,-3.4903,-0.7009,-0.3147,-0.8003,-1.3216,...,-0.9263,-0.9843,-3.092,-3.061,-1.2209,-1.2209,-0.8949,-1.0184,-3.538,-3.2367
50%,-0.8499,-1.1731,0.2908,1.1854,-1.6844,-1.6695,1.1102,1.8104,0.9692,0.753,...,0.8664,1.2164,-0.6123,-1.2918,0.2708,0.4596,0.6719,0.6295,-0.9067,-1.0216
75%,0.8807,0.8378,2.884,3.4411,0.3284,0.5669,3.2359,3.9171,3.0144,2.652,...,2.9818,2.9095,1.2207,0.9917,2.5604,2.2221,2.9601,2.9601,0.6545,0.8822
max,13.8095,13.8095,13.2787,12.6276,12.5772,13.6129,15.601,15.601,12.6033,12.6033,...,13.2789,13.2789,17.0671,17.0671,15.0147,14.5679,13.1543,13.1543,14.5199,14.3817


Normalized W2V feature


Unnamed: 0,FN1_wv_0,FN2_wv_0,FN1_wv_1,FN2_wv_1,FN1_wv_2,FN2_wv_2,FN1_wv_3,FN2_wv_3,FN1_wv_4,FN2_wv_4,...,FN1_wv_95,FN2_wv_95,FN1_wv_96,FN2_wv_96,FN1_wv_97,FN2_wv_97,FN1_wv_98,FN2_wv_98,FN1_wv_99,FN2_wv_99
count,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,...,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0,138501.0
mean,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-3.8382,-4.0345,-4.4642,-4.8363,-3.061,-3.3858,-4.2586,-4.5125,-6.1955,-6.3573,...,-4.3421,-5.2693,-3.3122,-3.8772,-3.3078,-3.6039,-4.8634,-4.9366,-3.1276,-3.228
25%,-0.649,-0.6393,-0.661,-0.6807,-0.5582,-0.689,-0.5094,-0.6025,-0.5199,-0.6097,...,-0.5339,-0.5863,-0.6843,-0.6438,-0.529,-0.5557,-0.6467,-0.6645,-0.7014,-0.6462
50%,0.0902,-0.0091,-0.0178,0.06,-0.1251,-0.12,0.04,0.0494,0.0083,0.0351,...,0.0017,0.1151,0.0605,-0.0838,-0.1035,-0.0115,-0.1053,-0.0816,0.0792,0.0112
75%,0.5863,0.5959,0.7287,0.7359,0.5253,0.5788,0.6849,0.6957,0.6189,0.6253,...,0.6337,0.6548,0.6111,0.6391,0.5497,0.5592,0.6854,0.7427,0.5423,0.5762
max,4.2929,4.4987,3.7209,3.4885,4.4831,4.6556,4.4361,4.2801,3.4812,3.7183,...,3.7104,3.9601,5.3711,5.728,4.1029,4.5569,4.2082,4.3483,4.6557,4.5825


In [17]:
sampled_df.head()

Unnamed: 0,name,gender,message,userID,BirthYear,LastName,FirstName,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_猴,Zodiac_羊,Zodiac_虎,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍,mgender
0,丁承先,1,1940,103845999999999,0,丁,承先,0.3101,-1.4001,1.2751,...,0,0,0,0,0,0,0,0,1,1
1,丁昞原,1,1940,103845999999999,0,丁,昞原,-0.0796,0.4588,0.2484,...,0,0,0,0,0,0,0,0,1,1
2,方超,1,1940,103845999999999,0,方,超,0.2626,-0.7383,-0.0178,...,0,0,0,0,0,0,0,0,1,1
3,方九龍,1,1940,103845999999999,0,方,九龍,1.4202,-0.5528,-0.6465,...,0,0,0,0,0,0,0,0,1,1
4,方大錚,1,1940,103845999999999,0,方,大錚,1.4271,-1.0761,-0.3713,...,0,0,0,0,0,0,0,0,1,1


(4) Train gender RFC, try all combination for experiment

In [20]:
sampled_df[sampled_df.apply(lambda x: x['gender'] != x.mgender, axis=1)]

Unnamed: 0,name,gender,message,userID,BirthYear,LastName,FirstName,FN1_wv_0,FN2_wv_0,FN1_wv_1,...,Zodiac_猴,Zodiac_羊,Zodiac_虎,Zodiac_蛇,Zodiac_豬,Zodiac_雞,Zodiac_馬,Zodiac_鼠,Zodiac_龍,mgender
57,何可,1,1940,103845999999999,0,何,可,0.2626,-0.3360,-0.0178,...,0,0,0,0,0,0,0,0,1,0
82,吳國瑜,1,1940,103845999999999,0,吳,國瑜,1.1941,-0.1253,-0.3580,...,0,0,0,0,0,0,0,0,1,0
121,李湘渝,1,1940,103845999999999,0,李,湘渝,0.0419,0.7856,-0.1182,...,0,0,0,0,0,0,0,0,1,0
137,沈春秀,1,1940,103845999999999,0,沈,春秀,0.8364,-1.4698,-1.1377,...,0,0,0,0,0,0,0,0,1,0
162,林圓,1,1940,103845999999999,0,林,圓,0.2626,0.6256,-0.0178,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138403,陳振愷,0,2004,353054000000000,12,陳,振愷,-0.8594,0.2767,-0.8128,...,1,0,0,0,0,0,0,0,0,1
138405,元嘎,0,2004,129418000000000,12,元,嘎,0.2626,1.2589,-0.0178,...,1,0,0,0,0,0,0,0,0,1
138412,郭政豪,0,2002,268428000000000,12,郭,政豪,-0.1807,-0.7420,-1.0762,...,0,0,0,0,0,0,1,0,0,1
138413,黃微明,0,1999,913839000000000,11,黃,微明,0.0289,-3.3857,-0.8520,...,0,0,0,0,0,0,0,0,0,1


In [21]:
save_path = plib_path("./")
result_all_io = open(save_path / "Training" / "result_all.csv", "w")
result_all_io.write("Type,feature,lens,accuracy,F1,Precision,Recall\n")

result_io = open(save_path / "Training" / "result.csv", "w")
result_io.write("feature,lens,accuracy,F1,Precision,Recall\n")

do_first_name_augmentation = False
validation_times = 1
y_feature = 'mgender'
saved_model, max_acu = None, 0
saved_feature, saved_feature_category = None, None

for i, feature in enumerate(feature_combinations_gender):
    x_feature = get_x_feature(feature, name_df.columns)
    feature_category = ''.join([x[0].upper() for x in feature]).upper()

    print("Combination {} Training feature category: {}".format(i, feature))
    print("len of x_feature:", len(x_feature))
    acc_n, pres_n, rec_n, f1_N = [], [], [], []
    for test_time in range(validation_times):
        dev_df = sampled_df.sample(n=len(name_df) // 10, frac=None, replace=False, weights=None, random_state=None, axis=0)
        if do_first_name_augmentation:
            train_x, test_x, train_y, test_y = split_dataset(FN_augmentation(sampled_df.drop(dev_df.index)), 0.7, x_feature, y_feature)
        else:
            train_x, test_x, train_y, test_y = split_dataset(sampled_df.drop(dev_df.index), 0.7, x_feature, y_feature)
        # Create random forest classifier instance
        trained_model = random_forest_classifier(
            train_x, train_y.values.reshape(-1, 1).ravel(), estimators_num=64, min_samples_leaf_num=1)
        print('Finished training')
        predictions = trained_model.predict(test_x)
        print('Finished prdeiction')
        
        for item in ("Train" , "Test", "Development"):
            print("{} Metrics".format(item))
            print("{}_x len = {}".format(item, len(train_x)))
            if item == "Train":
                accuracy, precision, recall, F1 = RFC_metrics(train_x, train_y, trained_model)
            elif item == "Test":
                accuracy, precision, recall, F1 = RFC_metrics(test_x, test_y, trained_model)
            else:
                accuracy, precision, recall, F1 = RFC_metrics(dev_df[x_feature], dev_df[y_feature], trained_model)
                target_names = ['女性','男性']
                print("report:\n", classification_report(
                    dev_df[y_feature], trained_model.predict(dev_df[x_feature]), target_names=target_names))
                
            result_all_io.write("{},{},{},{},{},{},{}\n".format(
                item, feature_category, len(x_feature), accuracy, precision, recall, F1))
            print("\n")
            
        acc_n.append(accuracy)
        pres_n.append(precision)
        rec_n.append(recall)
        f1_N.append(F1)
    avg_accuracy = round(np.array(acc_n).mean(), 4)
    avg_precision = round(np.array(pres_n).mean(), 4)
    avg_recall = round(np.array(rec_n).mean(), 4)
    avg_F1 = round(np.array(f1_N).mean(), 4)
    print("Average accuracy: {}".format(avg_accuracy))
    result_io.write("{},{},{},{},{},{}\n".format(feature_category, len(x_feature), avg_accuracy, avg_precision, avg_recall, avg_F1))
    
    if saved_model is None or max_acu < avg_accuracy:
        saved_model = trained_model
        max_acu = avg_accuracy
        saved_feature = x_feature
        saved_feature_category = feature_category
result_all_io.close()
result_io.close()

if saved_model is not None:
    model_name = "{}_gender_RFC_model.pkl".format(saved_feature_category)
    feature_name = "{}_gender_RFC_feature.pkl".format(saved_feature_category)
    with open(save_path / "TrainedModel" / model_name, 'wb') as handle:
        pickle.dump(saved_model, handle)
    with open(save_path / "TrainedModel" / feature_name, 'wb') as handle:
        pickle.dump(saved_feature, handle)
    print("Output model Done.")
    

Combination 0 Training feature category: ['W2V']
len of x_feature: 200
estimators_num =  64 min_samples_leaf_num =  1 Training Data len =  87255
Finished training
Finished prdeiction
Train Metrics
Train_x len = 87255
Accuracy ::  0.9997
Precision_score ::  0.9998
Recall_score ::  0.9997
F1_score ::  0.9998


Test Metrics
Test_x len = 87255
Accuracy ::  0.9407
Precision_score ::  0.9435
Recall_score ::  0.9574
F1_score ::  0.9504


Development Metrics
Development_x len = 87255
Accuracy ::  0.9434
Precision_score ::  0.943
Recall_score ::  0.9616
F1_score ::  0.9522
report:
               precision    recall  f1-score   support

          女性       0.94      0.92      0.93      5725
          男性       0.94      0.96      0.95      8125

    accuracy                           0.94     13850
   macro avg       0.94      0.94      0.94     13850
weighted avg       0.94      0.94      0.94     13850



Average accuracy: 0.9434
Combination 1 Training feature category: ['Phonetic']
len of x_fea

(5) Save selected model

In [None]:
feature =['W2V']
x_feature = get_x_feature(feature, name_df.columns)
f = ''.join([x[0].upper() for x in feature]).upper()
fileName = f + "_gnder_RFC_model.pkl"

In [None]:
def train_and_save_model(x_feature, y_feature, model_name, feature_name, sampled_df):
    train_x, test_x, train_y, test_y = split_dataset(FN_augmentation(sampled_df.drop(dev_df.index)), 0.7, x_feature, y_feature)
    trained_model = random_forest_classifier(train_x, train_y.values.reshape(-1, 1).ravel(), estimators_num=64, min_samples_leaf_num=1)
    with open(save_path / "TrainedModel" / model_name, 'wb') as handle:
        pickle.dump(trained_model, handle)
    with open(save_path / "TrainedModel" / feature_name, 'wb') as handle:
        pickle.dump(x_feature, handle)
    print("Output model Done.")
    
    train_x, test_x, train_y, test_y = split_dataset(FN_augmentation(sampled_df.drop(dev_df.index)), 0.7, x_feature, y_feature)

    trained_model = random_forest_classifier(train_x, train_y.values.reshape(-1, 1).ravel(), estimators_num=64, min_samples_leaf_num=1)
    print('Finished training')
    predictions = trained_model.predict(test_x)
    print('Finished prdeiction')

    print("Train Metrics")
    print("train_x len ", len(train_x))
    Accuracy, precision, recall, F1 = RFC_metrics(train_x, train_y, trained_model)
    print("\nTest Metrics")
    print("Test len ", len(test_x))
    Accuracy, precision, recall, F1 = RFC_metrics(test_x, test_y, trained_model)

    with open(save_path / "TrainedModel" / model_name, 'wb') as handle:
        pickle.dump(trained_model, handle)
    with open(save_path / "TrainedModel" / feature_name, 'wb') as handle:
        pickle.dump(x_feature, handle)
    print("Output model Done.")
