In [1]:
import os 
import ast
import re

import pandas as pd 
import numpy as np
from tabulate import tabulate
import lightgbm
import category_encoders

In [2]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

# Config 

In [None]:
results_path = "../results/"

# Results

In [3]:
dataset_list = [
    "telecom", "adult", 
    "employee", "credit", "mortgages", 
    "promotion", "kick", "kdd_upselling", "taxi", 
    "poverty_A","poverty_B", "poverty_C",
]

In [4]:
def read_info(path):
    with open(path, "r") as f:
        a = f.read()
    dic = ast.literal_eval(a)["info"]
    print(path, dic["train_shape"][0]+dic["test_shape"][0], dic["train_shape"], dic["test_shape"], dic["num_cat_cols"])
    return dic

In [5]:
for ds_name in dataset_list:
    _ = read_info(os.path.join(results_path, f"exp_1_{ds_name}_r.txt"))

./results/exp_1_telecom_r.txt 7043 (4225, 20) (2818, 20) 16
./results/exp_1_adult_r.txt 48842 (29305, 15) (19537, 15) 8
./results/exp_1_employee_r.txt 32769 (19661, 10) (13108, 10) 9
./results/exp_1_credit_r.txt 307511 (184506, 121) (123005, 121) 18
./results/exp_1_mortgages_r.txt 45642 (27385, 20) (18257, 20) 9
./results/exp_1_promotion_r.txt 54808 (32884, 13) (21924, 13) 5
./results/exp_1_kick_r.txt 72983 (43789, 32) (29194, 32) 19
./results/exp_1_kdd_upselling_r.txt 50000 (30000, 231) (20000, 231) 40
./results/exp_1_taxi_r.txt 892557 (535534, 8) (357023, 8) 5
./results/exp_1_poverty_A_r.txt 37560 (22536, 41) (15024, 41) 38
./results/exp_1_poverty_B_r.txt 20252 (12151, 224) (8101, 224) 191
./results/exp_1_poverty_C_r.txt 29913 (17947, 41) (11966, 41) 35


In [6]:
def read_dict(path, val_type="None"):
    with open(path, "r") as f:
        a = f.read()

    dic = ast.literal_eval(a)
    res = {}
    for k in dic.keys():
        if k != "info":
            res[k] = dic[k]["test_score"]
    dataframe = pd.DataFrame.from_dict(res, orient='index', columns=[val_type])
    return dataframe

In [7]:
def _f(exp="exp_1", dec=3):
    res = []
    cols = []
    for ds_name in dataset_list:
        cols.append(ds_name)
        d1 = read_dict(os.path.join(results_path, f"{exp}_{ds_name}_r.txt"))
    
        res.append(d1)

    res = pd.concat(res, axis=1)
    res.columns = cols
    if dec != -1:
        res = np.round(res, dec)
    res = res.sort_index()
    
    indx = []
    for d in res.index:
        indx.append(re.sub(r'[^\w\s]', '', d)) 
    res.index = indx
    return res

r_None = _f(exp="exp_1", dec=-1)
r_Single = _f(exp="exp_2", dec=-1)
r_Double = _f(exp="exp_3", dec=-1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


In [8]:
max_scores = pd.concat([r_None, r_Single, r_Double], axis=0)
max_scores = np.max(max_scores)
max_scores

telecom          0.840515
adult            0.929890
employee         0.853749
credit           0.760054
mortgages        0.702734
promotion        0.908301
kick             0.790890
kdd_upselling    0.865361
taxi             0.603382
poverty_A        0.742915
poverty_B        0.693057
poverty_C        0.742004
dtype: float64

In [9]:
min_scores = pd.concat([r_None, r_Single, r_Double], axis=0)
min_scores = np.min(min_scores)
min_scores

telecom          0.500000
adult            0.518181
employee         0.498967
credit           0.495732
mortgages        0.499965
promotion        0.540260
kick             0.468199
kdd_upselling    0.500000
taxi             0.499984
poverty_A        0.472994
poverty_B        0.434025
poverty_C        0.452673
dtype: float64

In [10]:
r_None_scaled = (r_None - min_scores) / (max_scores - min_scores)
r_Single_scaled = (r_Single - min_scores) / (max_scores - min_scores)
r_Double_scaled = (r_Double - min_scores) / (max_scores - min_scores)

In [11]:
np.max(r_None)

telecom          0.840471
adult            0.929659
employee         0.830000
credit           0.760054
mortgages        0.700131
promotion        0.907945
kick             0.790748
kdd_upselling    0.864319
taxi             0.565624
poverty_A        0.735490
poverty_B        0.645296
poverty_C        0.737249
dtype: float64

In [12]:
np.max(r_Single)

telecom          0.840427
adult            0.929890
employee         0.849834
credit           0.759940
mortgages        0.702734
promotion        0.908301
kick             0.790249
kdd_upselling    0.865361
taxi             0.603382
poverty_A        0.742915
poverty_B        0.690204
poverty_C        0.742004
dtype: float64

In [13]:
np.max(r_Double)

telecom          0.840515
adult            0.929587
employee         0.853749
credit           0.759915
mortgages        0.698062
promotion        0.906287
kick             0.790890
kdd_upselling    0.864302
taxi             0.603292
poverty_A        0.741768
poverty_B        0.693057
poverty_C        0.736608
dtype: float64

# Best encoder 

In [14]:
a = r_None_scaled.mean(axis=1).sort_values(ascending=False).round(4)
print(tabulate(pd.DataFrame(a), tablefmt="pipe", headers="keys"))

|                           |      0 |
|:--------------------------|-------:|
| HelmertEncoder            | 0.9517 |
| SumEncoder                | 0.9434 |
| FrequencyEncoder          | 0.9176 |
| CatBoostEncoderShuffle    | 0.6815 |
| CatBoostEncoder           | 0.573  |
| TargetEncoder             | 0.5176 |
| JamesSteinEncoder         | 0.5164 |
| OrdinalEncoder            | 0.4966 |
| WOEEncoder                | 0.4907 |
| MEstimateEncoder          | 0.4503 |
| BackwardDifferenceEncoder | 0.413  |
| LeaveOneOutEncoder        | 0.0698 |


In [15]:
a = r_Single_scaled.mean(axis=1).sort_values(ascending=False).round(4)
print(tabulate(pd.DataFrame(a), tablefmt="pipe", headers="keys"))

|                           |      0 |
|:--------------------------|-------:|
| CatBoostEncoder           | 0.9726 |
| OrdinalEncoder            | 0.9694 |
| HelmertEncoder            | 0.9558 |
| SumEncoder                | 0.9434 |
| WOEEncoder                | 0.9326 |
| FrequencyEncoder          | 0.9315 |
| BackwardDifferenceEncoder | 0.9109 |
| TargetEncoder             | 0.8915 |
| JamesSteinEncoder         | 0.8556 |
| MEstimateEncoder          | 0.8189 |
| CatBoostEncoderShuffle    | 0.6961 |
| LeaveOneOutEncoder        | 0.073  |


In [16]:
a = r_Double_scaled.mean(axis=1).sort_values(ascending=False).round(4)
print(tabulate(pd.DataFrame(a), tablefmt="pipe", headers="keys"))

|                        |      0 |
|:-----------------------|-------:|
| JamesSteinEncoder      | 0.9918 |
| CatBoostEncoder        | 0.9917 |
| TargetEncoder          | 0.9916 |
| LeaveOneOutEncoder     | 0.9909 |
| WOEEncoder             | 0.9838 |
| MEstimateEncoder       | 0.9686 |
| FrequencyEncoder       | 0.8019 |
| CatBoostEncoderShuffle | 0.6656 |


# Influence of validation 

In [17]:
r_None.round(4).fillna("")
print(tabulate(r_None.round(4).fillna(""), tablefmt="pipe", headers="keys"))

|                           |   telecom |   adult |   employee |   credit |   mortgages |   promotion | kick   | kdd_upselling   | taxi   |   poverty_A |   poverty_B |   poverty_C |
|:--------------------------|----------:|--------:|-----------:|---------:|------------:|------------:|:-------|:----------------|:-------|------------:|------------:|------------:|
| BackwardDifferenceEncoder |    0.6454 |  0.8555 |     0.5006 |   0.7442 |      0.5997 |      0.6482 |        |                 |        |      0.5149 |      0.5484 |      0.4945 |
| CatBoostEncoder           |    0.7666 |  0.868  |     0.5004 |   0.7478 |      0.6279 |      0.7811 | 0.6583 | 0.8549          | 0.5477 |      0.5179 |      0.5638 |      0.5427 |
| CatBoostEncoderShuffle    |    0.8177 |  0.878  |     0.5004 |   0.7527 |      0.6595 |      0.8387 | 0.6708 | 0.8558          | 0.5524 |      0.5788 |      0.6453 |      0.5916 |
| FrequencyEncoder          |    0.8405 |  0.9291 |     0.807  |   0.7593 |      0.6949 | 

In [18]:
r_Single.round(4).fillna("")
print(tabulate(r_Single.round(4).fillna(""), tablefmt="pipe", headers="keys"))

|                           |   telecom |   adult |   employee |   credit |   mortgages |   promotion | kick   | kdd_upselling   | taxi   |   poverty_A |   poverty_B |   poverty_C |
|:--------------------------|----------:|--------:|-----------:|---------:|------------:|------------:|:-------|:----------------|:-------|------------:|------------:|------------:|
| BackwardDifferenceEncoder |    0.8382 |  0.9293 |     0.7569 |   0.7595 |      0.6894 |      0.9064 |        |                 |        |      0.7323 |      0.6151 |      0.7108 |
| CatBoostEncoder           |    0.8392 |  0.9292 |     0.8498 |   0.7594 |      0.6951 |      0.8918 | 0.7901 | 0.8654          | 0.5844 |      0.7429 |      0.6902 |      0.7333 |
| CatBoostEncoderShuffle    |    0.8203 |  0.8781 |     0.499  |   0.753  |      0.6607 |      0.8394 | 0.6746 | 0.8552          | 0.5553 |      0.6007 |      0.6449 |      0.6046 |
| FrequencyEncoder          |    0.8392 |  0.9293 |     0.8138 |   0.7592 |      0.6937 | 

In [19]:
r_Double.round(4).fillna("")
print(tabulate(r_Double.round(4).fillna(""), tablefmt="pipe", headers="keys"))

|                        |   telecom |   adult |   employee |   credit |   mortgages |   promotion | kick   | kdd_upselling   | taxi   |   poverty_A |   poverty_B |   poverty_C |
|:-----------------------|----------:|--------:|-----------:|---------:|------------:|------------:|:-------|:----------------|:-------|------------:|------------:|------------:|
| CatBoostEncoder        |    0.8394 |  0.9293 |     0.8529 |   0.7592 |      0.6967 |      0.9056 | 0.7899 | 0.8633          | 0.6031 |      0.7418 |      0.6902 |      0.7343 |
| CatBoostEncoderShuffle |    0.8233 |  0.8859 |     0.5321 |   0.7525 |      0.6236 |      0.8398 |        |                 |        |      0.6066 |      0.5815 |      0.6248 |
| FrequencyEncoder       |    0.8371 |  0.9221 |     0.5563 |   0.755  |      0.6582 |      0.8749 | 0.7655 | 0.8551          | 0.5657 |      0.6873 |      0.6037 |      0.6961 |
| JamesSteinEncoder      |    0.8398 |  0.9296 |     0.8489 |   0.7598 |      0.6981 |      0.905  | 0.79

In [20]:
single_none = np.round(((r_Single - r_None) / r_None * 100), 1)
single_none
# print(tabulate(single_none, tablefmt="pipe", headers="keys"))

Unnamed: 0,telecom,adult,employee,credit,mortgages,promotion,kick,kdd_upselling,taxi,poverty_A,poverty_B,poverty_C
BackwardDifferenceEncoder,29.9,8.6,51.2,2.1,15.0,39.8,,,,42.2,12.2,43.7
CatBoostEncoder,9.5,7.0,69.8,1.6,10.7,14.2,20.0,1.2,6.7,43.4,22.4,35.1
CatBoostEncoderShuffle,0.3,0.0,-0.3,0.0,0.2,0.1,0.6,-0.1,0.5,3.8,-0.1,2.2
FrequencyEncoder,-0.2,0.0,0.8,-0.0,-0.2,0.0,-0.1,-0.1,2.9,0.4,-0.6,0.3
HelmertEncoder,0.0,0.0,0.5,-0.0,0.4,0.0,,,,-0.4,0.5,0.4
JamesSteinEncoder,16.6,6.9,56.2,1.5,10.3,13.4,-11.5,-14.8,8.6,48.5,27.5,49.2
LeaveOneOutEncoder,0.0,-0.6,-1.8,0.8,0.0,-1.0,-6.9,0.0,-0.0,1.9,-0.0,9.5
MEstimateEncoder,20.9,7.8,47.1,3.0,14.3,11.0,-10.0,-29.5,16.8,39.0,49.6,56.3
OrdinalEncoder,13.4,7.9,65.2,1.9,15.1,27.4,19.6,0.2,9.7,55.1,41.7,32.2
SumEncoder,0.0,0.0,0.0,-0.0,0.0,0.0,,,,-0.0,0.0,-0.0


In [21]:
a = pd.DataFrame(np.mean(single_none, axis=1))
a.columns = ["None -> Single"]
print(tabulate(a, tablefmt="pipe", headers="keys"))

|                           |   None -> Single |
|:--------------------------|-----------------:|
| BackwardDifferenceEncoder |        27.1889   |
| CatBoostEncoder           |        20.1333   |
| CatBoostEncoderShuffle    |         0.6      |
| FrequencyEncoder          |         0.266667 |
| HelmertEncoder            |         0.155556 |
| JamesSteinEncoder         |        17.7      |
| LeaveOneOutEncoder        |         0.158333 |
| MEstimateEncoder          |        18.8583   |
| OrdinalEncoder            |        24.1167   |
| SumEncoder                |         0        |
| TargetEncoder             |        19.6      |
| WOEEncoder                |        23.375    |


In [22]:
double_single = np.round(((r_Double - r_Single) / r_Single * 100), 1)
double_single
# print(tabulate(double_single, tablefmt="pipe", headers="keys"))

Unnamed: 0,telecom,adult,employee,credit,mortgages,promotion,kick,kdd_upselling,taxi,poverty_A,poverty_B,poverty_C
BackwardDifferenceEncoder,,,,,,,,,,,,
CatBoostEncoder,0.0,0.0,0.4,-0.0,0.2,1.6,-0.0,-0.2,3.2,-0.2,0.0,0.1
CatBoostEncoderShuffle,0.4,0.9,6.6,-0.1,-5.6,0.1,,,,1.0,-9.8,3.3
FrequencyEncoder,-0.2,-0.8,-31.6,-0.5,-5.1,-3.4,-3.1,-1.0,-2.8,-5.9,-1.5,-3.3
HelmertEncoder,,,,,,,,,,,,
JamesSteinEncoder,0.1,0.0,8.6,0.0,4.7,-0.0,35.4,18.9,2.3,1.5,1.9,2.1
LeaveOneOutEncoder,67.9,79.4,38.8,52.0,39.3,67.6,68.8,72.7,20.4,45.3,38.6,48.1
MEstimateEncoder,0.1,0.0,10.5,0.0,-0.3,0.1,34.1,45.0,0.6,1.0,4.7,1.8
OrdinalEncoder,,,,,,,,,,,,
SumEncoder,,,,,,,,,,,,


In [23]:
b = pd.DataFrame(np.mean(double_single, axis=1))
b.columns = ["Single -> Double"]
print(tabulate(b, tablefmt="pipe", headers="keys"))

|                           |   Single -> Double |
|:--------------------------|-------------------:|
| BackwardDifferenceEncoder |         nan        |
| CatBoostEncoder           |           0.425    |
| CatBoostEncoderShuffle    |          -0.355556 |
| FrequencyEncoder          |          -4.93333  |
| HelmertEncoder            |         nan        |
| JamesSteinEncoder         |           6.29167  |
| LeaveOneOutEncoder        |          53.2417   |
| MEstimateEncoder          |           8.13333  |
| OrdinalEncoder            |         nan        |
| SumEncoder                |         nan        |
| TargetEncoder             |           4.20833  |
| WOEEncoder                |           1.875    |


In [24]:
res = pd.concat([a,b],axis=1)
print(tabulate(res.round(1).fillna(""), tablefmt="pipe", headers="keys"))

|                           |   None -> Single | Single -> Double   |
|:--------------------------|-----------------:|:-------------------|
| BackwardDifferenceEncoder |             27.2 |                    |
| CatBoostEncoder           |             20.1 | 0.4                |
| CatBoostEncoderShuffle    |              0.6 | -0.4               |
| FrequencyEncoder          |              0.3 | -4.9               |
| HelmertEncoder            |              0.2 |                    |
| JamesSteinEncoder         |             17.7 | 6.3                |
| LeaveOneOutEncoder        |              0.2 | 53.2               |
| MEstimateEncoder          |             18.9 | 8.1                |
| OrdinalEncoder            |             24.1 |                    |
| SumEncoder                |              0   |                    |
| TargetEncoder             |             19.6 | 4.2                |
| WOEEncoder                |             23.4 | 1.9                |


In [25]:
np.mean(res)

None -> Single      12.679398
Single -> Double     8.610764
dtype: float64

# Top scores improvement 

In [26]:
a = np.max(r_Single) - np.max(r_None)
a = pd.DataFrame(a) * 100
a.columns = ["None -> Single"]
print(tabulate(a, tablefmt="pipe", headers="keys"))

|               |   None -> Single |
|:--------------|-----------------:|
| telecom       |      -0.00440061 |
| adult         |       0.0230934  |
| employee      |       1.98344    |
| credit        |      -0.0113489  |
| mortgages     |       0.260293   |
| promotion     |       0.0355479  |
| kick          |      -0.0498629  |
| kdd_upselling |       0.104221   |
| taxi          |       3.77584    |
| poverty_A     |       0.742493   |
| poverty_B     |       4.49078    |
| poverty_C     |       0.475514   |


In [27]:
b = np.max(r_Double) - np.max(r_Single)
b = pd.DataFrame(b) * 100
b.columns = ["Single -> Double"]
print(tabulate(b, tablefmt="pipe", headers="keys"))

|               |   Single -> Double |
|:--------------|-------------------:|
| telecom       |         0.00880122 |
| adult         |        -0.0303435  |
| employee      |         0.391465   |
| credit        |        -0.00252879 |
| mortgages     |        -0.467158   |
| promotion     |        -0.201342   |
| kick          |         0.064122   |
| kdd_upselling |        -0.105947   |
| taxi          |        -0.0089831  |
| poverty_A     |        -0.114748   |
| poverty_B     |         0.285318   |
| poverty_C     |        -0.539663   |


In [28]:
res = pd.concat([a,b],axis=1)
print(tabulate(res.round(2).fillna(""), tablefmt="pipe", headers="keys"))

|               |   None -> Single |   Single -> Double |
|:--------------|-----------------:|-------------------:|
| telecom       |            -0    |               0.01 |
| adult         |             0.02 |              -0.03 |
| employee      |             1.98 |               0.39 |
| credit        |            -0.01 |              -0    |
| mortgages     |             0.26 |              -0.47 |
| promotion     |             0.04 |              -0.2  |
| kick          |            -0.05 |               0.06 |
| kdd_upselling |             0.1  |              -0.11 |
| taxi          |             3.78 |              -0.01 |
| poverty_A     |             0.74 |              -0.11 |
| poverty_B     |             4.49 |               0.29 |
| poverty_C     |             0.48 |              -0.54 |


In [29]:
np.mean(res)

None -> Single      0.985467
Single -> Double   -0.060084
dtype: float64

# Encoders examples

In [30]:
# http://latex2png.com/ 200

In [31]:
df_train = pd.DataFrame({})
df_train["category"] = ["A", "A", "A", "A", "B", "B", "B", "C", "C", "D"]
df_train["category_representation"] = df_train["category"]
df_train["target"] = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

df_test = pd.DataFrame({})
df_test["category"] = ["A", "B", "C", "D", "NewCategory"]
df_test["category_representation"] = df_test["category"]
df_test["target"] = None

In [32]:
df_train.drop("category_representation", axis=1)

Unnamed: 0,category,target
0,A,0
1,A,1
2,A,0
3,A,1
4,B,0
5,B,1
6,B,0
7,C,1
8,C,0
9,D,1


In [33]:
df_test.drop("category_representation", axis=1)

Unnamed: 0,category,target
0,A,
1,B,
2,C,
3,D,
4,NewCategory,


In [34]:
encoder = OrdinalEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,1,0
1,A,1,1
2,A,1,0
3,A,1,1
4,B,2,0
5,B,2,1
6,B,2,0
7,C,3,1
8,C,3,0
9,D,4,1


Unnamed: 0,category,category_representation,target
0,A,1.0,
1,B,2.0,
2,C,3.0,
3,D,4.0,
4,NewCategory,-1.0,


In [35]:
encoder = OneHotEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation_1,category_representation_2,category_representation_3,category_representation_4,target
0,A,1,0,0,0,0
1,A,1,0,0,0,1
2,A,1,0,0,0,0
3,A,1,0,0,0,1
4,B,0,1,0,0,0
5,B,0,1,0,0,1
6,B,0,1,0,0,0
7,C,0,0,1,0,1
8,C,0,0,1,0,0
9,D,0,0,0,1,1


Unnamed: 0,category,category_representation_1,category_representation_2,category_representation_3,category_representation_4,target
0,A,1,0,0,0,
1,B,0,1,0,0,
2,C,0,0,1,0,
3,D,0,0,0,1,
4,NewCategory,0,0,0,0,


In [36]:
class FrequencyEncoder:
    def __init__(self, cols):
        self.cols = cols
        self.counts_dict = None

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        counts_dict = {}
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict[col] = dict(zip(values, counts))
        self.counts_dict = counts_dict

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        counts_dict_test = {}
        res = []
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict_test[col] = dict(zip(values, counts))

            # if value is in "train" keys - replace "test" counts with "train" counts
            for k in [key for key in counts_dict_test[col].keys() if key in self.counts_dict[col].keys()]:
                counts_dict_test[col][k] = self.counts_dict[col][k]

            res.append(X[col].map(counts_dict_test[col]).values.reshape(-1, 1))
        res = np.hstack(res)

        X[self.cols] = res
        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        self.fit(X, y)
        X = self.transform(X)
        return X

In [37]:
encoder = FrequencyEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,4,0
1,A,4,1
2,A,4,0
3,A,4,1
4,B,3,0
5,B,3,1
6,B,3,0
7,C,2,1
8,C,2,0
9,D,1,1


Unnamed: 0,category,category_representation,target
0,A,4,
1,B,3,
2,C,2,
3,D,1,
4,NewCategory,1,


In [38]:
encoder = TargetEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.5,0
1,A,0.5,1
2,A,0.5,0
3,A,0.5,1
4,B,0.3532,0
5,B,0.3532,1
6,B,0.3532,0
7,C,0.5,1
8,C,0.5,0
9,D,0.5,1


Unnamed: 0,category,category_representation,target
0,A,0.5,
1,B,0.3532,
2,C,0.5,
3,D,0.5,
4,NewCategory,0.5,


In [39]:
encoder = WOEEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.0,0
1,A,0.0,1
2,A,0.0,0
3,A,0.0,1
4,B,-0.405465,0
5,B,-0.405465,1
6,B,-0.405465,0
7,C,0.0,1
8,C,0.0,0
9,D,0.0,1


Unnamed: 0,category,category_representation,target
0,A,0.0,
1,B,-0.405465,
2,C,0.0,
3,D,0.0,
4,NewCategory,0.0,


In [40]:
encoder = JamesSteinEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.5,0
1,A,0.5,1
2,A,0.5,0
3,A,0.5,1
4,B,0.363636,0
5,B,0.363636,1
6,B,0.363636,0
7,C,0.5,1
8,C,0.5,0
9,D,1.0,1


Unnamed: 0,category,category_representation,target
0,A,0.5,
1,B,0.363636,
2,C,0.5,
3,D,1.0,
4,NewCategory,1.0,


In [41]:
# \hat{x}^{k} = \frac{n^{+} + prior * m}{y^{+} + m}

In [42]:
df_train = pd.DataFrame({})
df_train["category"] = ["A", "A", "A", "A", "B", "B", "B", "C", "C"]
df_train["category_representation"] = df_train["category"]
df_train["target"] = [1, 0, 1, 1, 1, 1, 0, 1, 1]

In [43]:
encoder = MEstimateEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.472222,1
1,A,0.472222,0
2,A,0.472222,1
3,A,0.472222,1
4,B,0.347222,1
5,B,0.347222,1
6,B,0.347222,0
7,C,0.347222,1
8,C,0.347222,1


Unnamed: 0,category,category_representation,target
0,A,0.777778,
1,B,0.777778,
2,C,0.777778,
3,D,0.777778,
4,NewCategory,0.777778,


In [44]:
encoder = CatBoostEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.777778,1
1,A,0.888889,0
2,A,0.592593,1
3,A,0.694444,1
4,B,0.777778,1
5,B,0.888889,1
6,B,0.925926,0
7,C,0.777778,1
8,C,0.888889,1


Unnamed: 0,category,category_representation,target
0,A,0.777778,
1,B,0.777778,
2,C,0.777778,
3,D,0.777778,
4,NewCategory,0.777778,


In [45]:
### cb
# train:
# \hat{x}^{k}_{i} = \frac{\sum_{j=0}^{j \leqslant i}(y_{j} * (x_{j}==k)) - y_{i} + prior}{\sum_{j=0}^{j \leqslant i}x_{j}==k}
# test: 
# \hat{x}^{k} = \frac{\sum (y_{j} * (x_{j}==k)) + prior}{\sum x_{j}==k}

### loo
# train: 
# \hat{x}^{k}_{i} = \frac{\sum_{j \neq i}(y_{j} * (x_{j}==k)) - y_{i}}{\sum_{j \neq i}x_{j}==k}
# test:
# \hat{x}^{k} = \frac{\sum (y_{j} * (x_{j}==k))}{\sum x_{j}==k}
# opt th:
# t^{k} = \frac{\sum (y_{j} * (x_{j}==k)) - 0.5}{\sum x_{j}==k}

In [46]:
encoder = HelmertEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp.drop("intercept", axis=1))
temp = encoder.transform(df_test)
display(temp.drop("intercept", axis=1))

Unnamed: 0,category,category_representation_0,category_representation_1,category_representation_2,category_representation_3,target
0,A,-1.0,-1.0,-1.0,-1.0,1
1,A,-1.0,-1.0,-1.0,-1.0,0
2,A,-1.0,-1.0,-1.0,-1.0,1
3,A,-1.0,-1.0,-1.0,-1.0,1
4,B,1.0,-1.0,-1.0,-1.0,1
5,B,1.0,-1.0,-1.0,-1.0,1
6,B,1.0,-1.0,-1.0,-1.0,0
7,C,0.0,2.0,-1.0,-1.0,1
8,C,0.0,2.0,-1.0,-1.0,1


Unnamed: 0,category,category_representation_0,category_representation_1,category_representation_2,category_representation_3,target
0,A,0.0,0.0,0.0,4.0,
1,B,0.0,0.0,0.0,4.0,
2,C,0.0,0.0,0.0,4.0,
3,D,0.0,0.0,0.0,4.0,
4,NewCategory,0.0,0.0,0.0,4.0,


In [47]:
# js
# \hat{x}^{k} = (1-B) * \frac{n^{+}}{n} + B * \frac{y^{+}}{y}

# $$\newcommand{\Var}{\operatorname{Var}}
# B = \frac{\Var[y^{k}]}{\Var[y^{k}] + \Var[y]}

In [48]:
encoder = LeaveOneOutEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.666667,1
1,A,1.0,0
2,A,0.666667,1
3,A,0.666667,1
4,B,0.5,1
5,B,0.5,1
6,B,1.0,0
7,C,1.0,1
8,C,1.0,1


Unnamed: 0,category,category_representation,target
0,A,0.777778,
1,B,0.777778,
2,C,0.777778,
3,D,0.777778,
4,NewCategory,0.777778,


In [49]:
encoder = SumEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp.drop("intercept", axis=1))
temp = encoder.transform(df_test)
display(temp.drop("intercept", axis=1))

Unnamed: 0,category,category_representation_0,category_representation_1,target
0,A,1.0,0.0,1
1,A,1.0,0.0,0
2,A,1.0,0.0,1
3,A,1.0,0.0,1
4,B,0.0,1.0,1
5,B,0.0,1.0,1
6,B,0.0,1.0,0
7,C,-1.0,-1.0,1
8,C,-1.0,-1.0,1


Unnamed: 0,category,category_representation_0,category_representation_1,target
0,A,0.0,0.0,
1,B,0.0,0.0,
2,C,0.0,0.0,
3,D,0.0,0.0,
4,NewCategory,0.0,0.0,
