# Explore Categorical Feature Encodings
This notebook compares several encodings for categorical features (label, one-hot, target, and weight-of-evidence) by the cross-validated log loss and ROC-AUC of an XGBoost model trained on each.

In [24]:
# Load requirements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, roc_auc_score
from category_encoders import TargetEncoder, WOEEncoder

import xgboost as xgb
from xgboost import XGBClassifier


Start by loading the data

In [25]:

# Read the raw train/test CSVs; the `id` column is dropped right away.
df_train = pd.read_csv('data/raw/train.csv').drop(columns=['id'])
df_test = pd.read_csv('data/raw/test.csv').drop(columns=['id'])

display(df_train)

# Column groups: target, categorical features, numerical features.
target = ['y']
cats = ['job','marital','education','default','housing','loan','contact','month','poutcome']
# 'campaign' was misspelled as 'capaign', which is not a column of the frame.
# NOTE(review): the numeric column 'previous' is not listed here — confirm whether that is intentional.
nums = ['age', 'balance','day', 'duration', 'campaign','pdays']


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0


For an initial test of the dataset I use XGB. XGB is very forgiving and can be run on almost any type of dataset. However, I like to impute NaNs, so let's first check for missing values.

In [26]:
# Per-column count of missing values, training frame first, then the test frame.
for frame in (df_train, df_test):
    print(frame.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64


Let's create a dataframe to summarize the experiment.

In [27]:
# Headers of the results table; every header starts with its own empty list
# that later cells append one entry to per encoding run.
cols = ['Coding Algo', 'Number of Features', 'Average Logloss', 'Average ROC-AUC']
summary_dict = dict((header, []) for header in cols)
print(summary_dict)

{'Coding Algo': [], 'Number of Features': [], 'Average Logloss': [], 'Average ROC-AUC': []}


Let's start modeling with my favorite, label encoding. I start by replacing the existing labels with number labels. For XGB we don't have to; it would take care of it.

In [28]:
def ord_trans(df, df1, cats):
    """Label-encode the categorical columns of a train/test pair consistently.

    Each column in ``cats`` is factorized over the stacked train+test values,
    so a given label maps to the same integer code in both returned frames.
    Only the categorical columns are stacked: the remaining columns (including
    the target) are carried over untouched, so their dtypes are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame.
    df1 : pandas.DataFrame
        Test frame.
    cats : list of str
        Names of the columns to encode; must be present in both frames.

    Returns
    -------
    tuple of pandas.DataFrame
        Encoded copies ``(train, test)``. The inputs are not modified. A ``y``
        column is never part of the returned test frame.
    """
    train_len = len(df)
    df_out = df.copy()
    df1_out = df1.copy()

    for name in cats:
        # Factorize train and test together so the codes are shared; the first
        # train_len codes belong to the training rows, the rest to the test rows.
        codes, _ = pd.concat([df[name], df1[name]], axis=0).factorize()
        df_out[name] = codes[:train_len]
        df1_out[name] = codes[train_len:]

    # The test frame is returned without the target; tolerate a frame that has none.
    df1_out = df1_out.drop(columns=['y'], errors='ignore')
    return df_out, df1_out

# Print the raw frame shapes on either side of the encoding call; the encoded
# copies are stored under separate names.
print(df_train.shape, df_test.shape)
df_label_train, df_label_test = ord_trans(df=df_train, df1=df_test, cats=cats)
print(df_train.shape, df_test.shape)

(750000, 17) (250000, 16)
(750000, 17) (250000, 16)


Use cross-validation to build the model.

In [29]:

# Label-encoded features: split the target off and keep the encoded test frame alongside.
df_y = df_label_train[['y']].copy()
df_X = df_label_train.drop(columns=['y']).copy()
Xtest = df_label_test.copy()

KFOLD = 5
kf = KFold(n_splits=KFOLD, shuffle=True, random_state=1337)

fold_loglosses = []
fold_metrics = []
for i, (train_index, valid_index) in enumerate(kf.split(df_X)):
    Xtrain = df_X.iloc[train_index]
    ytrain = df_y.iloc[train_index]
    Xvalid = df_X.iloc[valid_index]
    yvalid = df_y.iloc[valid_index]

    # XGB
    # Early-stopping callback on the validation logloss; save_best=True is used
    # to get the best model back. A new callback object is created for every fold.
    es = xgb.callback.EarlyStopping(
        rounds=50,
        min_delta=1e-3,
        save_best=True,
        maximize=False,
        data_name="validation_0",
        metric_name="logloss",
    )

    # Early stopping is configured through `es` only. The former
    # early_stopping_rounds=100 was dropped: per the XGBoost docs it stops on the
    # last entry of eval_metric ('auc' here), which presumably added a second
    # stopper next to the logloss-based callback.
    # NOTE(review): enable_categorical presumably only affects pandas `category`
    # columns, so the integer codes would be handled as numeric features — confirm.
    model = XGBClassifier(tree_method='hist',
                          n_estimators=2000,
                          objective='binary:logistic',
                          enable_categorical=True,
                          eval_metric=['logloss', 'auc'],
                          n_jobs=4,
                          random_state=1337,
                          callbacks=[es],
                          max_depth=6)

    model = model.fit(Xtrain, ytrain,
                      eval_set=[(Xvalid, yvalid)],
                      verbose=50)

    # Score the held-out fold; only class probabilities are needed for both metrics.
    ypred_proba = model.predict_proba(Xvalid)
    fold_logloss = log_loss(yvalid, ypred_proba)
    fold_metric = roc_auc_score(yvalid, ypred_proba[:, 1])

    # save
    fold_loglosses.append(fold_logloss)
    fold_metrics.append(fold_metric)
    print(f"Fold {i+1} Log Loss: {fold_logloss:.5f}, AUC_ROC: {fold_metric:.5f}")


print(f"\nOverall Score, logloss: {np.mean(fold_loglosses):.5f}, auc: {np.mean(fold_metrics):.5f}")

# Record this run in the summary table, addressing every column by name.
# Note: running this cell again appends another 'Label' row.
summary_dict['Coding Algo'].append('Label')
summary_dict['Number of Features'].append(str(Xtrain.shape[1]))
summary_dict['Average Logloss'].append(str(round(np.mean(fold_loglosses), 5)))
summary_dict['Average ROC-AUC'].append(str(round(np.mean(fold_metrics), 5)))



[0]	validation_0-logloss:0.26203	validation_0-auc:0.93672
[50]	validation_0-logloss:0.15125	validation_0-auc:0.96456
[100]	validation_0-logloss:0.14834	validation_0-auc:0.96605
[150]	validation_0-logloss:0.14728	validation_0-auc:0.96654
[190]	validation_0-logloss:0.14707	validation_0-auc:0.96665
Fold 1 Log Loss: 0.14707, AUC_ROC: 0.96665
[0]	validation_0-logloss:0.26362	validation_0-auc:0.93599
[50]	validation_0-logloss:0.15177	validation_0-auc:0.96481
[100]	validation_0-logloss:0.14893	validation_0-auc:0.96621
[150]	validation_0-logloss:0.14784	validation_0-auc:0.96673
[164]	validation_0-logloss:0.14765	validation_0-auc:0.96680
Fold 2 Log Loss: 0.14765, AUC_ROC: 0.96680
[0]	validation_0-logloss:0.26366	validation_0-auc:0.93438
[50]	validation_0-logloss:0.15347	validation_0-auc:0.96352
[100]	validation_0-logloss:0.15029	validation_0-auc:0.96514
[150]	validation_0-logloss:0.14877	validation_0-auc:0.96591
[200]	validation_0-logloss:0.14822	validation_0-auc:0.96619
[208]	validation_0-logl

Try one-hot encoding. get_dummies returns bool columns, which will be run as categorical in XGBoost models.

In [30]:
# Net number of columns added by one-hot encoding: with drop_first=True a feature
# with k levels is replaced by k-1 indicator columns, i.e. k-2 extra columns.
# (The former np.max(..., 0) call was removed: its second positional argument
# is `axis`, not a lower bound, so it never clamped anything.)
new_cols = 0
for name in cats:
    new_cols += df_train[name].nunique() - 2
print('number of new cols: ', new_cols)

print('number of current cols: ', df_train.shape)

# Encode train and test against the same (training) level set so both frames
# get identical indicator columns and drop the same baseline level, even if a
# level is missing from one of the two files.
ohe_dtypes = {name: pd.CategoricalDtype(sorted(df_train[name].dropna().unique()))
              for name in cats}

df_ohe_train = pd.get_dummies(df_train.astype(ohe_dtypes), prefix=cats, dummy_na=False,
                              columns=cats, drop_first=True)
df_ohe_test = pd.get_dummies(df_test.astype(ohe_dtypes), prefix=cats, columns=cats,
                             drop_first=True)

# The test frame must carry exactly the training features (everything but the target).
assert list(df_ohe_test.columns) == [c for c in df_ohe_train.columns if c != 'y'], \
    "train/test one-hot columns differ"

print('total new columns: ', df_ohe_train.shape)
print('should be 17 + 26 = ', 17+26)
print('new cols in test: ', df_ohe_test.shape)
print('should be: ', df_test.shape[1] + 26)

number of new cols:  26
number of current cols:  (750000, 17)
total new columns:  (750000, 43)
should be 17 + 26 =  43
new cols in test:  (250000, 42)
should be:  42


Let's make the model.

In [31]:

# One-hot encoded features: split the target off and keep the encoded test frame alongside.
df_y = df_ohe_train[['y']].copy()
df_X = df_ohe_train.drop(columns=['y']).copy()
Xtest = df_ohe_test.copy()

KFOLD = 5
kf = KFold(n_splits=KFOLD, shuffle=True, random_state=1337)

fold_loglosses = []
fold_metrics = []
for i, (train_index, valid_index) in enumerate(kf.split(df_X)):
    Xtrain = df_X.iloc[train_index]
    ytrain = df_y.iloc[train_index]
    Xvalid = df_X.iloc[valid_index]
    yvalid = df_y.iloc[valid_index]

    # XGB
    # Early-stopping callback on the validation logloss; save_best=True is used
    # to get the best model back. A new callback object is created for every fold.
    es = xgb.callback.EarlyStopping(
        rounds=50,
        min_delta=1e-3,
        save_best=True,
        maximize=False,
        data_name="validation_0",
        metric_name="logloss",
    )

    # Early stopping is configured through `es` only. The former
    # early_stopping_rounds=100 was dropped: per the XGBoost docs it stops on the
    # last entry of eval_metric ('auc' here), which presumably added a second
    # stopper next to the logloss-based callback.
    model = XGBClassifier(tree_method='hist',
                          n_estimators=2000,
                          objective='binary:logistic',
                          enable_categorical=True,
                          eval_metric=['logloss', 'auc'],
                          n_jobs=4,
                          random_state=1337,
                          callbacks=[es],
                          max_depth=6)

    model = model.fit(Xtrain, ytrain,
                      eval_set=[(Xvalid, yvalid)],
                      verbose=50)

    # Score the held-out fold; only class probabilities are needed for both metrics.
    ypred_proba = model.predict_proba(Xvalid)
    fold_logloss = log_loss(yvalid, ypred_proba)
    fold_metric = roc_auc_score(yvalid, ypred_proba[:, 1])

    # save
    fold_loglosses.append(fold_logloss)
    fold_metrics.append(fold_metric)
    print(f"Fold {i+1} Log Loss: {fold_logloss:.5f}, AUC_ROC: {fold_metric:.5f}")


print(f"\nOverall Score, logloss: {np.mean(fold_loglosses):.5f}, auc: {np.mean(fold_metrics):.5f}")

# Record this run in the summary table, addressing every column by name.
# Note: running this cell again appends another 'One Hot' row.
summary_dict['Coding Algo'].append('One Hot')
summary_dict['Number of Features'].append(str(Xtrain.shape[1]))
summary_dict['Average Logloss'].append(str(round(np.mean(fold_loglosses), 5)))
summary_dict['Average ROC-AUC'].append(str(round(np.mean(fold_metrics), 5)))


[0]	validation_0-logloss:0.26281	validation_0-auc:0.93563
[50]	validation_0-logloss:0.15150	validation_0-auc:0.96445
[100]	validation_0-logloss:0.14893	validation_0-auc:0.96570
[150]	validation_0-logloss:0.14796	validation_0-auc:0.96622
[177]	validation_0-logloss:0.14760	validation_0-auc:0.96639
Fold 1 Log Loss: 0.14759, AUC_ROC: 0.96640
[0]	validation_0-logloss:0.26467	validation_0-auc:0.93516
[50]	validation_0-logloss:0.15322	validation_0-auc:0.96408
[100]	validation_0-logloss:0.15011	validation_0-auc:0.96559
[150]	validation_0-logloss:0.14844	validation_0-auc:0.96646
[175]	validation_0-logloss:0.14831	validation_0-auc:0.96655
Fold 2 Log Loss: 0.14828, AUC_ROC: 0.96656
[0]	validation_0-logloss:0.26427	validation_0-auc:0.93490
[50]	validation_0-logloss:0.15416	validation_0-auc:0.96303
[100]	validation_0-logloss:0.15053	validation_0-auc:0.96495
[150]	validation_0-logloss:0.14922	validation_0-auc:0.96562
[174]	validation_0-logloss:0.14885	validation_0-auc:0.96582
Fold 3 Log Loss: 0.1488

Let's try target encoding. We need to fit the target encoder inside the CV loop, on the training part of each fold only, to avoid leakage.

In [32]:
# Start from the raw frames: the encoder is fitted per fold inside the loop.
df_y = df_train[['y']].copy()
df_X = df_train.drop(columns=['y']).copy()


KFOLD = 5
kf = KFold(n_splits=KFOLD, shuffle=True, random_state=1337)

fold_loglosses = []
fold_metrics = []
for i, (train_index, valid_index) in enumerate(kf.split(df_X)):
    Xtrain = df_X.iloc[train_index]
    ytrain = df_y.iloc[train_index]
    Xvalid = df_X.iloc[valid_index]
    yvalid = df_y.iloc[valid_index]
    Xtest = df_test.copy()

    # Fit the target encoder on this fold's training rows only, then apply it
    # to the training, validation and test features.
    # NOTE(review): the encoded Xtest is not used for any prediction in this cell.
    enc = TargetEncoder(cols=cats,
                        min_samples_leaf=20,
                        smoothing=10).fit(Xtrain, ytrain)
    Xtrain = enc.transform(Xtrain)
    Xvalid = enc.transform(Xvalid)
    Xtest = enc.transform(Xtest)

    # XGB
    # Early-stopping callback on the validation logloss; save_best=True is used
    # to get the best model back. A new callback object is created for every fold.
    es = xgb.callback.EarlyStopping(
        rounds=50,
        min_delta=1e-3,
        save_best=True,
        maximize=False,
        data_name="validation_0",
        metric_name="logloss",
    )

    # Early stopping is configured through `es` only. The former
    # early_stopping_rounds=100 was dropped: per the XGBoost docs it stops on the
    # last entry of eval_metric ('auc' here), which presumably added a second
    # stopper next to the logloss-based callback.
    model = XGBClassifier(tree_method='hist',
                          n_estimators=2000,
                          objective='binary:logistic',
                          enable_categorical=True,
                          eval_metric=['logloss', 'auc'],
                          n_jobs=4,
                          random_state=1337,
                          callbacks=[es],
                          max_depth=6)

    model = model.fit(Xtrain, ytrain,
                      eval_set=[(Xvalid, yvalid)],
                      verbose=50)

    # Score the held-out fold; only class probabilities are needed for both metrics.
    ypred_proba = model.predict_proba(Xvalid)
    fold_logloss = log_loss(yvalid, ypred_proba)
    fold_metric = roc_auc_score(yvalid, ypred_proba[:, 1])

    # save
    fold_loglosses.append(fold_logloss)
    fold_metrics.append(fold_metric)
    print(f"Fold {i+1} Log Loss: {fold_logloss:.5f}, AUC_ROC: {fold_metric:.5f}")


print(f"\nOverall Score, logloss: {np.mean(fold_loglosses):.5f}, auc: {np.mean(fold_metrics):.5f}")

# Record this run in the summary table, addressing every column by name.
# Note: running this cell again appends another 'Target Encoding' row.
summary_dict['Coding Algo'].append('Target Encoding')
summary_dict['Number of Features'].append(str(Xtrain.shape[1]))
summary_dict['Average Logloss'].append(str(round(np.mean(fold_loglosses), 5)))
summary_dict['Average ROC-AUC'].append(str(round(np.mean(fold_metrics), 5)))


[0]	validation_0-logloss:0.25916	validation_0-auc:0.93935
[50]	validation_0-logloss:0.15113	validation_0-auc:0.96468
[100]	validation_0-logloss:0.14841	validation_0-auc:0.96604
[150]	validation_0-logloss:0.14698	validation_0-auc:0.96674
[175]	validation_0-logloss:0.14666	validation_0-auc:0.96691
Fold 1 Log Loss: 0.14666, AUC_ROC: 0.96691
[0]	validation_0-logloss:0.26054	validation_0-auc:0.93907
[50]	validation_0-logloss:0.15089	validation_0-auc:0.96520
[100]	validation_0-logloss:0.14842	validation_0-auc:0.96645
[150]	validation_0-logloss:0.14723	validation_0-auc:0.96705
[158]	validation_0-logloss:0.14716	validation_0-auc:0.96709
Fold 2 Log Loss: 0.14716, AUC_ROC: 0.96709
[0]	validation_0-logloss:0.26036	validation_0-auc:0.93847
[50]	validation_0-logloss:0.15241	validation_0-auc:0.96405
[100]	validation_0-logloss:0.14968	validation_0-auc:0.96540
[150]	validation_0-logloss:0.14844	validation_0-auc:0.96601
[181]	validation_0-logloss:0.14808	validation_0-auc:0.96620
Fold 3 Log Loss: 0.1480

Let's try weight of evidence encoding

In [33]:
# Start from the raw frames: the encoder is fitted per fold inside the loop.
# (The pre-loop copy of df_test was removed; every fold takes its own fresh copy.)
df_y = df_train[['y']].copy()
df_X = df_train.drop(columns=['y']).copy()

KFOLD = 5
kf = KFold(n_splits=KFOLD, shuffle=True, random_state=1337)

fold_loglosses = []
fold_metrics = []
for i, (train_index, valid_index) in enumerate(kf.split(df_X)):
    Xtrain = df_X.iloc[train_index]
    ytrain = df_y.iloc[train_index]
    Xvalid = df_X.iloc[valid_index]
    yvalid = df_y.iloc[valid_index]
    Xtest = df_test.copy()

    # Fit the weight-of-evidence encoder on this fold's training rows only, then
    # apply it to the training, validation and test features.
    # NOTE(review): the encoded Xtest is not used for any prediction in this cell.
    enc = WOEEncoder(cols=cats).fit(Xtrain, ytrain)
    Xtrain = enc.transform(Xtrain)
    Xvalid = enc.transform(Xvalid)
    Xtest = enc.transform(Xtest)

    # XGB
    # Early-stopping callback on the validation logloss; save_best=True is used
    # to get the best model back. A new callback object is created for every fold.
    es = xgb.callback.EarlyStopping(
        rounds=50,
        min_delta=1e-3,
        save_best=True,
        maximize=False,
        data_name="validation_0",
        metric_name="logloss",
    )

    # Early stopping is configured through `es` only. The former
    # early_stopping_rounds=100 was dropped: per the XGBoost docs it stops on the
    # last entry of eval_metric ('auc' here), which presumably added a second
    # stopper next to the logloss-based callback.
    model = XGBClassifier(tree_method='hist',
                          n_estimators=2000,
                          objective='binary:logistic',
                          enable_categorical=True,
                          eval_metric=['logloss', 'auc'],
                          n_jobs=4,
                          random_state=1337,
                          callbacks=[es],
                          max_depth=6)

    model = model.fit(Xtrain, ytrain,
                      eval_set=[(Xvalid, yvalid)],
                      verbose=50)

    # Score the held-out fold; only class probabilities are needed for both metrics.
    ypred_proba = model.predict_proba(Xvalid)
    fold_logloss = log_loss(yvalid, ypred_proba)
    fold_metric = roc_auc_score(yvalid, ypred_proba[:, 1])

    # save
    fold_loglosses.append(fold_logloss)
    fold_metrics.append(fold_metric)
    print(f"Fold {i+1} Log Loss: {fold_logloss:.5f}, AUC_ROC: {fold_metric:.5f}")


print(f"\nOverall Score, logloss: {np.mean(fold_loglosses):.5f}, auc: {np.mean(fold_metrics):.5f}")

# Record this run in the summary table, addressing every column by name.
# Note: running this cell again appends another 'Weight of Evidence' row.
summary_dict['Coding Algo'].append('Weight of Evidence')
summary_dict['Number of Features'].append(str(Xtrain.shape[1]))
summary_dict['Average Logloss'].append(str(round(np.mean(fold_loglosses), 5)))
summary_dict['Average ROC-AUC'].append(str(round(np.mean(fold_metrics), 5)))


[0]	validation_0-logloss:0.25916	validation_0-auc:0.93935
[50]	validation_0-logloss:0.15113	validation_0-auc:0.96468
[100]	validation_0-logloss:0.14841	validation_0-auc:0.96604
[150]	validation_0-logloss:0.14698	validation_0-auc:0.96674
[175]	validation_0-logloss:0.14666	validation_0-auc:0.96691
Fold 1 Log Loss: 0.14666, AUC_ROC: 0.96691
[0]	validation_0-logloss:0.26054	validation_0-auc:0.93907
[50]	validation_0-logloss:0.15089	validation_0-auc:0.96520
[100]	validation_0-logloss:0.14842	validation_0-auc:0.96645
[150]	validation_0-logloss:0.14723	validation_0-auc:0.96705
[158]	validation_0-logloss:0.14716	validation_0-auc:0.96709
Fold 2 Log Loss: 0.14716, AUC_ROC: 0.96709
[0]	validation_0-logloss:0.26036	validation_0-auc:0.93847
[50]	validation_0-logloss:0.15190	validation_0-auc:0.96429
[100]	validation_0-logloss:0.14949	validation_0-auc:0.96552
[150]	validation_0-logloss:0.14832	validation_0-auc:0.96613
[179]	validation_0-logloss:0.14799	validation_0-auc:0.96629
Fold 3 Log Loss: 0.1479

In [34]:
# Turn the collected results into a table: one column per summary_dict key,
# one row per entry appended by the encoding runs.
df_summary = pd.DataFrame(summary_dict)
display(df_summary)

Unnamed: 0,Coding Algo,Number of Features,Average Logloss,Average ROC-AUC
0,Label,16,0.14711,0.96677
1,One Hot,42,0.14774,0.96646
2,Target Encoding,16,0.14696,0.96685
3,Weight of Evidence,16,0.14694,0.96687
