In [1]:
#Shubham Tribedi | 1811100002037

## Comparison of Categorical Variable Encodings

In this lecture, we will compare the performance of the different feature categorical encoding techniques we learned so far.

We will compare:

- One hot encoding
- Replacing labels by the count
- Ordering labels according to target
- Mean Encoding
- WoE

Using the titanic dataset

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import roc_auc_score
from feature_engine.encoding import *

In [4]:
# let's load the titanic dataset

# we will only use these columns in the demo
cols = ['pclass', 'age', 'sibsp', 'parch', 'fare',
        'sex', 'cabin', 'embarked', 'survived']

data = pd.read_csv('titanic.csv', usecols=cols)

data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B5,S
1,1,1,male,0.9167,1,2,151.55,C22,S
2,1,0,female,2.0,1,2,151.55,C22,S
3,1,0,male,30.0,1,2,151.55,C22,S
4,1,0,female,25.0,1,2,151.55,C22,S


In [5]:
# let's check for missing data

data.isnull().sum()

pclass         0
survived       0
sex            0
age          263
sibsp          0
parch          0
fare           1
cabin       1014
embarked       2
dtype: int64

In [6]:
# Drop observations with NA in Fare and embarked
data.dropna(axis=0,how='any',inplace=True,subset=['fare','embarked','age','cabin'])

In [7]:
data.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

In [8]:
# Now we extract the first letter of the cabin
data['cabin'] = data['cabin'].str[0]
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B,S
1,1,1,male,0.9167,1,2,151.55,C,S
2,1,0,female,2.0,1,2,151.55,C,S
3,1,0,male,30.0,1,2,151.55,C,S
4,1,0,female,25.0,1,2,151.55,C,S


In [9]:
# drop observations with cabin = T, they are too few
data=data[data.cabin!='T']

In [10]:
# Let's divide into train and test set
X_train,X_test,Y_train,Y_test=train_test_split(data.loc[:,data.columns!='survived'],data.loc[:,data.columns=='survived'],test_size=0.3)
X_train.shape, X_test.shape

((188, 8), (81, 8))

In [11]:
# Let's replace null values in numerical variables by the mean
for i in X_train:
    if is_numeric_dtype(X_train[i])==True:
        
        X_train[i].fillna(X_train[i].mean(),inplace=True)
        X_test[i].fillna(X_test[i].mean(),inplace=True)

In [12]:
data['cabin'].unique()

array(['B', 'C', 'E', 'D', 'A', 'F', 'G'], dtype=object)

In [13]:
# let's check that we have no missing data after NA imputation
data.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

In [14]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
206,1,male,44.0,2,0,90.0000,C,Q
10,1,male,47.0,1,0,227.5250,C,C
275,1,female,40.0,1,1,134.5000,E,C
65,1,female,33.0,1,0,53.1000,E,S
252,1,male,61.0,1,3,262.3750,B,C
...,...,...,...,...,...,...,...,...
178,1,female,45.0,1,0,52.5542,D,S
26,1,male,25.0,1,0,91.0792,B,C
135,1,male,71.0,0,0,34.6542,A,C
286,1,female,63.0,1,0,221.7792,C,S


### One Hot Encoding

In [15]:
X_train_OHE=OneHotEncoder().fit(X_train,Y_train).transform(X_train)

X_train_OHE.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,sex_female,cabin_C,cabin_E,cabin_B,cabin_D,cabin_F,cabin_A,cabin_G,embarked_Q,embarked_C,embarked_S
206,1,44.0,2,0,90.0,1,0,1,0,0,0,0,0,0,1,0,0
10,1,47.0,1,0,227.525,1,0,1,0,0,0,0,0,0,0,1,0
275,1,40.0,1,1,134.5,0,1,0,1,0,0,0,0,0,0,1,0
65,1,33.0,1,0,53.1,0,1,0,1,0,0,0,0,0,0,0,1
252,1,61.0,1,3,262.375,1,0,0,0,1,0,0,0,0,0,1,0


In [16]:
X_test_OHE=OneHotEncoder().fit(X_train,Y_train).transform(X_test)

X_test_OHE.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,sex_female,cabin_C,cabin_E,cabin_B,cabin_D,cabin_F,cabin_A,cabin_G,embarked_Q,embarked_C,embarked_S
165,1,38.0,1,0,90.0,1,0,1,0,0,0,0,0,0,0,0,1
103,1,38.0,0,0,227.525,0,1,1,0,0,0,0,0,0,0,1,0
270,1,18.0,1,0,60.0,0,1,1,0,0,0,0,0,0,0,0,1
202,1,36.0,0,0,26.2875,1,0,0,1,0,0,0,0,0,0,0,1
24,1,29.0,0,0,221.7792,0,1,1,0,0,0,0,0,0,0,0,1


### Count encoding

In [17]:
X_train_count=CountFrequencyEncoder().fit(X_train,Y_train).transform(X_train)

X_train_count.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
206,1,100,44.0,2,0,90.0,56,2
10,1,100,47.0,1,0,227.525,56,78
275,1,88,40.0,1,1,134.5,28,78
65,1,88,33.0,1,0,53.1,28,108
252,1,100,61.0,1,3,262.375,44,78


In [18]:
X_test_count=CountFrequencyEncoder().fit(X_train,Y_train).transform(X_test)

X_test_count.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
165,1,100,38.0,1,0,90.0,56,108
103,1,88,38.0,0,0,227.525,56,78
270,1,88,18.0,1,0,60.0,56,108
202,1,100,36.0,0,0,26.2875,28,108
24,1,88,29.0,0,0,221.7792,56,108


### Ordered Integer Encoding

In [19]:
X_train_ordered=OrdinalEncoder().fit(X_train,Y_train).transform(X_train)

X_train_ordered.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
206,1,0,44.0,2,0,90.0,1,0
10,1,0,47.0,1,0,227.525,1,2
275,1,1,40.0,1,1,134.5,4,2
65,1,1,33.0,1,0,53.1,4,1
252,1,0,61.0,1,3,262.375,5,2


In [20]:
X_test_ordered=OrdinalEncoder().fit(X_train,Y_train).transform(X_test)

X_test_ordered.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
165,1,0,38.0,1,0,90.0,1,1
103,1,1,38.0,0,0,227.525,1,2
270,1,1,18.0,1,0,60.0,1,1
202,1,0,36.0,0,0,26.2875,4,1
24,1,1,29.0,0,0,221.7792,1,1


### Mean Encoding

In [21]:
X_train_mean=MeanEncoder().fit(X_train,Y_train).transform(X_train)

X_train_mean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
206,1,0.42,44.0,2,0,90.0,0.553571,0.5
10,1,0.42,47.0,1,0,227.525,0.553571,0.730769
275,1,0.920455,40.0,1,1,134.5,0.714286,0.730769
65,1,0.920455,33.0,1,0,53.1,0.714286,0.601852
252,1,0.42,61.0,1,3,262.375,0.727273,0.730769


In [22]:
X_test_mean=MeanEncoder().fit(X_train,Y_train).transform(X_test)

X_test_mean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
165,1,0.42,38.0,1,0,90.0,0.553571,0.601852
103,1,0.920455,38.0,0,0,227.525,0.553571,0.730769
270,1,0.920455,18.0,1,0,60.0,0.553571,0.601852
202,1,0.42,36.0,0,0,26.2875,0.714286,0.601852
24,1,0.920455,29.0,0,0,221.7792,0.553571,0.601852


### Probability Ratio

In [23]:
X_train_ratio=PRatioEncoder(encoding_method='ratio').fit(X_train,Y_train['survived']).transform(X_train)
X_test_ratio=PRatioEncoder(encoding_method='ratio').fit(X_train,Y_train['survived']).transform(X_test)
X_train_ratio.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
206,1,0.724138,44.0,2,0,90.0,1.24,1.0
10,1,0.724138,47.0,1,0,227.525,1.24,2.714286
275,1,11.571429,40.0,1,1,134.5,2.5,2.714286
65,1,11.571429,33.0,1,0,53.1,2.5,1.511628
252,1,0.724138,61.0,1,3,262.375,2.666667,2.714286


### Random Forest Performance

In [24]:
# create a function to build random forests (n_estimators=50, random_state=39, max_depth=3) and compare performance in train and test set
def run_randomForests(X_train,X_test,Y_train,Y_test):
    rfc = RandomForestClassifier(n_estimators=50, random_state=39, max_depth=3)
    rfc.fit(X_train,Y_train['survived'])
    print("Train set")
    print("Random Forests roc-auc:",roc_auc_score(Y_train,rfc.predict(X_train)))
    print("Test set")
    print("Random Forests roc-auc:",roc_auc_score(Y_test,rfc.predict(X_test)))


In [25]:
X_train_OHE

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,sex_female,cabin_C,cabin_E,cabin_B,cabin_D,cabin_F,cabin_A,cabin_G,embarked_Q,embarked_C,embarked_S
206,1,44.0,2,0,90.0000,1,0,1,0,0,0,0,0,0,1,0,0
10,1,47.0,1,0,227.5250,1,0,1,0,0,0,0,0,0,0,1,0
275,1,40.0,1,1,134.5000,0,1,0,1,0,0,0,0,0,0,1,0
65,1,33.0,1,0,53.1000,0,1,0,1,0,0,0,0,0,0,0,1
252,1,61.0,1,3,262.3750,1,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,1,45.0,1,0,52.5542,0,1,0,0,0,1,0,0,0,0,0,1
26,1,25.0,1,0,91.0792,1,0,0,0,1,0,0,0,0,0,1,0
135,1,71.0,0,0,34.6542,1,0,0,0,0,0,0,1,0,0,1,0
286,1,63.0,1,0,221.7792,0,1,1,0,0,0,0,0,0,0,0,1


In [26]:
# OHE
run_randomForests(X_train_OHE, X_test_OHE, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.8259537210756722
Test set
Random Forests roc-auc: 0.7456140350877194


In [27]:
# counts
run_randomForests(X_train_count, X_test_count, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7779237023139463
Test set
Random Forests roc-auc: 0.7302631578947367


In [28]:
# ordered labels
run_randomForests(X_train_ordered, X_test_ordered, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7851782363977485
Test set
Random Forests roc-auc: 0.6798245614035088


In [29]:
# mean encoding
run_randomForests(X_train_mean, X_test_mean, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7851782363977485
Test set
Random Forests roc-auc: 0.6798245614035088


In [30]:
# ratio
run_randomForests(X_train_ratio, X_test_ratio, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7851782363977485
Test set
Random Forests roc-auc: 0.6798245614035088


Comparing the roc_auc values on the test sets, we can see that one hot encoding has the worse performance. This makes sense because trees do not perform well in datasets with big feature spaces.

The remaining encodings returned similar performances. This also makes sense, because trees are non-linear models, so target guided encodings may not necessarily improve the model performance

### Logistic Regression Performance

In [37]:
# create a function for Logistic Regression
def run_logistic(X_train,X_test,Y_train,Y_test):
    rfc = LogisticRegression()
    rfc.fit(X_train,Y_train['survived'])
    print("Train set")
    print("Logistic Regression roc-auc:",roc_auc_score(Y_train,rfc.predict(X_train)))
    print("Test set")
    print("Logistic Regression roc-auc:",roc_auc_score(Y_test,rfc.predict(X_test)))


In [38]:
# OHE
run_logistic(X_train_OHE, X_test_OHE, Y_train, Y_test)

Train set
Logistic Regression roc-auc: 0.7956222639149468
Test set
Logistic Regression roc-auc: 0.7664473684210525
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# counts
run_logistic(X_train_count, X_test_count, Y_train, Y_test)

Train set
Logistic Regression roc-auc: 0.7114446529080675
Test set
Logistic Regression roc-auc: 0.7631578947368421
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
# ordered labels
run_logistic(X_train_ordered, X_test_ordered, Y_train, Y_test)

Train set
Logistic Regression roc-auc: 0.7766103814884302
Test set
Logistic Regression roc-auc: 0.736842105263158
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
# mean encoding
run_logistic(X_train_mean, X_test_mean, Y_train, Y_test)

Train set
Logistic Regression roc-auc: 0.7118824265165729
Test set
Logistic Regression roc-auc: 0.7478070175438595
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
# ratio
run_logistic(X_train_ratio, X_test_ratio, Y_train, Y_test)

Train set
Logistic Regression roc-auc: 0.7874921826141339
Test set
Logistic Regression roc-auc: 0.7576754385964911
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


For Logistic regression, the best performances are obtained with one hot encoding, as it preserves linear relationships with variables and target, and also with weight of evidence, and ordered encoding.

Note however how count encoding, returns the worse performance as it does not create a monotonic relationship between variables and target, and in this case, mean target encoding is probably causing over-fitting.