In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('name.csv.gz')

In [3]:
df.head()

Unnamed: 0,state,gender,year,name,number
0,AK,M,1910,Paul,6
1,AL,F,1910,Carrie,122
2,AL,F,1910,Essie,76
3,AL,F,1910,Inez,65
4,AL,F,1910,Josephine,60


In [4]:
# slicing name to map each index
char_index = {char:index+1 for index,char in enumerate('abcdefghijklmnopqrsutvwxyz')}
def slicer(i, name):
    try:
        char = name[i].lower()
        return char_index[char]
    except:
        return 0
for i in range(20):
    df[f'name_index_{i:04d}'] = df['name'].apply(lambda x:slicer(i, x)).astype(np.int16)
df.head()

Unnamed: 0,state,gender,year,name,number,name_index_0000,name_index_0001,name_index_0002,name_index_0003,name_index_0004,...,name_index_0010,name_index_0011,name_index_0012,name_index_0013,name_index_0014,name_index_0015,name_index_0016,name_index_0017,name_index_0018,name_index_0019
0,AK,M,1910,Paul,6,16,1,20,12,0,...,0,0,0,0,0,0,0,0,0,0
1,AL,F,1910,Carrie,122,3,1,18,18,9,...,0,0,0,0,0,0,0,0,0,0
2,AL,F,1910,Essie,76,5,19,19,9,5,...,0,0,0,0,0,0,0,0,0,0
3,AL,F,1910,Inez,65,9,14,5,26,0,...,0,0,0,0,0,0,0,0,0,0
4,AL,F,1910,Josephine,60,10,15,19,5,16,...,0,0,0,0,0,0,0,0,0,0


In [11]:
state_index = { state:index for index, state in enumerate(set(df['state'].tolist())) }
df[f'state_index'] = df['state'].apply(lambda x:state_index[x]).astype(np.int16)

In [12]:
y =  df[ 'gender' ].apply(lambda x: 1.0 if x == "F" else 0.0)

In [13]:
y.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: gender, dtype: float64

In [14]:
df.drop(['state', 'gender', 'name'], axis=1).head()

Unnamed: 0,year,number,name_index_0000,name_index_0001,name_index_0002,name_index_0003,name_index_0004,name_index_0005,name_index_0006,name_index_0007,...,name_index_0011,name_index_0012,name_index_0013,name_index_0014,name_index_0015,name_index_0016,name_index_0017,name_index_0018,name_index_0019,state_index
0,1910,6,16,1,20,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,32
1,1910,122,3,1,18,18,9,5,0,0,...,0,0,0,0,0,0,0,0,0,4
2,1910,76,5,19,19,9,5,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,1910,65,9,14,5,26,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,1910,60,10,15,19,5,16,8,9,14,...,0,0,0,0,0,0,0,0,0,4


In [15]:
predictors = [f'name_index_{i:04d}' for i in range(20)] + ['year', 'number', 'state_index']
categorical = [f'name_index_{i:04d}' for i in range(20)] + ['state_index']
from sklearn.model_selection import train_test_split
xtr, xva, ytr, yva = train_test_split(df.drop(['state', 'gender', 'name'], axis=1), y, test_size=0.10, random_state=23)

In [16]:
print(xva.shape)

(555246, 23)


In [20]:
import lightgbm as lgb

lgtrain = lgb.Dataset(xtr, ytr.values,
                feature_name=predictors,
                categorical_feature = categorical)
lgvalid = lgb.Dataset(xva, yva.values,
                feature_name=predictors,
                categorical_feature = categorical)

In [21]:
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 15,
    'num_leaves': 33,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    'learning_rate': 0.9,
    'verbose': 0
}  
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    num_boost_round=16000,
    valid_sets=[lgtrain, lgvalid],
    valid_names=['train','valid'],
    early_stopping_rounds=200,
    verbose_eval=200
)

lgb_clf.save_model('model')



Training until validation scores don't improve for 200 rounds.
[200]	train's auc: 0.996698	valid's auc: 0.995835
[400]	train's auc: 0.998236	valid's auc: 0.996879
[600]	train's auc: 0.998857	valid's auc: 0.997181
[800]	train's auc: 0.999204	valid's auc: 0.997339
[1000]	train's auc: 0.999429	valid's auc: 0.997401
[1200]	train's auc: 0.999586	valid's auc: 0.997425
[1400]	train's auc: 0.999695	valid's auc: 0.997455
[1600]	train's auc: 0.999775	valid's auc: 0.99748
[1800]	train's auc: 0.999836	valid's auc: 0.997471
Early stopping, best iteration is:
[1604]	train's auc: 0.999777	valid's auc: 0.997482


In [25]:
lgb_clf = lgb.Booster(model_file='model')
ypr = lgb_clf.predict(xva) 

In [28]:
ypr = pd.DataFrame(ypr)
ypr.columns = ['predict']
ypr.head()

Unnamed: 0,predict
0,0.9904527
1,0.92052
2,6.428625e-08
3,0.9997401
4,0.002164566


In [42]:
yva = pd.DataFrame(yva).reset_index()
yva = yva.drop(['index', 'level_0'], axis=1)
yva.head()

Unnamed: 0,gender
0,1.0
1,1.0
2,0.0
3,1.0
4,0.0


In [43]:
ys = pd.concat([yva, ypr], axis=1)
ys.head()

Unnamed: 0,gender,predict
0,1.0,0.9904527
1,1.0,0.92052
2,0.0,6.428625e-08
3,1.0,0.9997401
4,0.0,0.002164566


In [44]:
ys.to_csv('ys.csv', index=None)