In [4]:
import pandas as pd 
import numpy as np
import ast
import re
from tabulate import tabulate
import lightgbm
import category_encoders

In [5]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.backward_difference import BackwardDifferenceEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

In [12]:
dataset_list = [
    "telecom", "adult", 
    "employee", "credit", "mortgages", 
    "promotion", "kick", "kdd_upselling", "taxi", 
    "poverty_A","poverty_B", "poverty_C",
]

In [13]:
def read_info(path):
    with open(path, "r") as f:
        a = f.read()
    dic = ast.literal_eval(a)["info"]
    print(path, dic["train_shape"][0]+dic["test_shape"][0], dic["train_shape"], dic["test_shape"], dic["num_cat_cols"])
    return dic

In [14]:
for ds_name in dataset_list:
    _ = read_info(f"./results/exp_1_{ds_name}_r.txt")

./results/exp_1_telecom_r.txt 7043 (4225, 20) (2818, 20) 16
./results/exp_1_adult_r.txt 48842 (29305, 15) (19537, 15) 8


In [18]:
def read_dict(path, val_type="None"):
    with open(path, "r") as f:
        a = f.read()

    dic = ast.literal_eval(a)
    res = {}
    for k in dic.keys():
        if k != "info":
            res[k] = dic[k]["test_score"]
    dataframe = pd.DataFrame.from_dict(res, orient='index', columns=[val_type])
    return dataframe

In [19]:
def _f(exp="exp_1", dec=3):
    res = []
    cols = []
    for ds_name in dataset_list:
        cols.append(ds_name)
        d1 = read_dict(f"./results/{exp}_{ds_name}_r.txt")
    
        res.append(d1)

    res = pd.concat(res, axis=1)
    res.columns = cols
    if dec != -1:
        res = np.round(res, dec)
    res = res.sort_index()
    
    indx = []
    for d in res.index:
        indx.append(re.sub(r'[^\w\s]', '', d)) 
    res.index = indx
    return res

r_None = _f(exp="exp_1", dec=-1)
r_Single = _f(exp="exp_2", dec=-1)
r_Double = _f(exp="exp_3", dec=-1)

In [20]:
max_scores = pd.concat([r_None, r_Single, r_Double], axis=0)
max_scores = np.max(max_scores)
max_scores

telecom    0.840515
adult      0.929397
dtype: float64

In [21]:
min_scores = pd.concat([r_None, r_Single, r_Double], axis=0)
min_scores = np.min(min_scores)
min_scores

telecom    0.838619
adult      0.928759
dtype: float64

In [22]:
r_None_scaled = (r_None - min_scores) / (max_scores - min_scores)
r_Single_scaled = (r_Single - min_scores) / (max_scores - min_scores)
r_Double_scaled = (r_Double - min_scores) / (max_scores - min_scores)

In [23]:
np.max(r_None)

telecom    0.839068
adult      0.929397
dtype: float64

In [24]:
np.max(r_Single)

telecom    0.839416
adult      0.929372
dtype: float64

In [25]:
np.max(r_Double)

telecom    0.840515
adult      0.929370
dtype: float64

# Best encoder 

In [26]:
a = r_None_scaled.mean(axis=1).sort_values(ascending=False).round(4)
print(tabulate(pd.DataFrame(a), tablefmt="pipe", headers="keys"))

|                       |      0 |
|:----------------------|-------:|
| MEstimateEncoderRight | 0.5    |
| MEstimateEncoder      | 0.1184 |


In [27]:
a = r_Single_scaled.mean(axis=1).sort_values(ascending=False).round(4)
print(tabulate(pd.DataFrame(a), tablefmt="pipe", headers="keys"))

|                       |      0 |
|:----------------------|-------:|
| MEstimateEncoderRight | 0.5335 |
| MEstimateEncoder      | 0.3789 |


In [28]:
a = r_Double_scaled.mean(axis=1).sort_values(ascending=False).round(4)
print(tabulate(pd.DataFrame(a), tablefmt="pipe", headers="keys"))

|                       |      0 |
|:----------------------|-------:|
| MEstimateEncoder      | 0.8647 |
| MEstimateEncoderRight | 0.678  |


# Influence of validation 

In [29]:
r_None.round(4).fillna("")
print(tabulate(r_None.round(4).fillna(""), tablefmt="pipe", headers="keys"))

|                       |   telecom |   adult |
|:----------------------|----------:|--------:|
| MEstimateEncoder      |    0.8391 |  0.9288 |
| MEstimateEncoderRight |    0.8386 |  0.9294 |


In [30]:
r_Single.round(4).fillna("")
print(tabulate(r_Single.round(4).fillna(""), tablefmt="pipe", headers="keys"))

|                       |   telecom |   adult |
|:----------------------|----------:|--------:|
| MEstimateEncoder      |    0.8394 |  0.929  |
| MEstimateEncoderRight |    0.8388 |  0.9294 |


In [31]:
r_Double.round(4).fillna("")
print(tabulate(r_Double.round(4).fillna(""), tablefmt="pipe", headers="keys"))

|                       |   telecom |   adult |
|:----------------------|----------:|--------:|
| MEstimateEncoder      |    0.8405 |  0.9292 |
| MEstimateEncoderRight |    0.8394 |  0.9294 |


In [32]:
single_none = np.round(((r_Single - r_None) / r_None * 100), 1)
single_none
# print(tabulate(single_none, tablefmt="pipe", headers="keys"))

Unnamed: 0,telecom,adult
MEstimateEncoder,0.0,0.0
MEstimateEncoderRight,0.0,-0.0


In [33]:
a = pd.DataFrame(np.mean(single_none, axis=1))
a.columns = ["None -> Single"]
print(tabulate(a, tablefmt="pipe", headers="keys"))

|                       |   None -> Single |
|:----------------------|-----------------:|
| MEstimateEncoder      |                0 |
| MEstimateEncoderRight |                0 |


In [34]:
double_single = np.round(((r_Double - r_Single) / r_Single * 100), 1)
double_single
# print(tabulate(double_single, tablefmt="pipe", headers="keys"))

Unnamed: 0,telecom,adult
MEstimateEncoder,0.1,0.0
MEstimateEncoderRight,0.1,-0.0


In [22]:
b = pd.DataFrame(np.mean(double_single, axis=1))
b.columns = ["Single -> Double"]
print(tabulate(b, tablefmt="pipe", headers="keys"))

|                           |   Single -> Double |
|:--------------------------|-------------------:|
| BackwardDifferenceEncoder |          nan       |
| CatBoostEncoder           |            0.425   |
| FrequencyEncoder          |           -4.93333 |
| HelmertEncoder            |          nan       |
| JamesSteinEncoder         |            6.29167 |
| LeaveOneOutEncoder        |           53.2417  |
| MEstimateEncoder          |            8.13333 |
| OrdinalEncoder            |          nan       |
| SumEncoder                |          nan       |
| TargetEncoder             |            4.20833 |
| WOEEncoder                |            1.875   |


In [23]:
res = pd.concat([a,b],axis=1)
print(tabulate(res.round(1).fillna(""), tablefmt="pipe", headers="keys"))

|                           |   None -> Single | Single -> Double   |
|:--------------------------|-----------------:|:-------------------|
| BackwardDifferenceEncoder |             27.2 |                    |
| CatBoostEncoder           |             20.1 | 0.4                |
| FrequencyEncoder          |              0.3 | -4.9               |
| HelmertEncoder            |              0.2 |                    |
| JamesSteinEncoder         |             17.7 | 6.3                |
| LeaveOneOutEncoder        |              0.2 | 53.2               |
| MEstimateEncoder          |             18.9 | 8.1                |
| OrdinalEncoder            |             24.1 |                    |
| SumEncoder                |              0   |                    |
| TargetEncoder             |             19.6 | 4.2                |
| WOEEncoder                |             23.4 | 1.9                |


In [24]:
np.mean(res)

None -> Single      13.777525
Single -> Double     9.891667
dtype: float64

# Top scores improvement 

In [25]:
a = np.max(r_Single) - np.max(r_None)
a = pd.DataFrame(a) * 100
a.columns = ["None -> Single"]
print(tabulate(a, tablefmt="pipe", headers="keys"))

|               |   None -> Single |
|:--------------|-----------------:|
| telecom       |      -0.00440061 |
| adult         |       0.0230934  |
| employee      |       1.98344    |
| credit        |      -0.0113489  |
| mortgages     |       0.260293   |
| promotion     |       0.0355479  |
| kick          |      -0.0498629  |
| kdd_upselling |       0.104221   |
| taxi          |       3.77584    |
| poverty_A     |       0.742493   |
| poverty_B     |       5.59024    |
| poverty_C     |       0.475514   |


In [26]:
b = np.max(r_Double) - np.max(r_Single)
b = pd.DataFrame(b) * 100
b.columns = ["Single -> Double"]
print(tabulate(b, tablefmt="pipe", headers="keys"))

|               |   Single -> Double |
|:--------------|-------------------:|
| telecom       |         0.00880122 |
| adult         |        -0.0303435  |
| employee      |         0.391465   |
| credit        |        -0.00252879 |
| mortgages     |        -0.467158   |
| promotion     |        -0.201342   |
| kick          |         0.064122   |
| kdd_upselling |        -0.105947   |
| taxi          |        -0.0089831  |
| poverty_A     |        -0.114748   |
| poverty_B     |         0.285318   |
| poverty_C     |        -0.539663   |


In [27]:
res = pd.concat([a,b],axis=1)
print(tabulate(res.round(2).fillna(""), tablefmt="pipe", headers="keys"))

|               |   None -> Single |   Single -> Double |
|:--------------|-----------------:|-------------------:|
| telecom       |            -0    |               0.01 |
| adult         |             0.02 |              -0.03 |
| employee      |             1.98 |               0.39 |
| credit        |            -0.01 |              -0    |
| mortgages     |             0.26 |              -0.47 |
| promotion     |             0.04 |              -0.2  |
| kick          |            -0.05 |               0.06 |
| kdd_upselling |             0.1  |              -0.11 |
| taxi          |             3.78 |              -0.01 |
| poverty_A     |             0.74 |              -0.11 |
| poverty_B     |             5.59 |               0.29 |
| poverty_C     |             0.48 |              -0.54 |


In [28]:
np.mean(res)

None -> Single      1.077089
Single -> Double   -0.060084
dtype: float64

# Encoders examples

In [29]:
# http://latex2png.com/ 200

In [31]:
df_train = pd.DataFrame({})
df_train["category"] = ["A", "A", "A", "A", "B", "B", "B", "C", "C", "D"]
df_train["category_representation"] = df_train["category"]
df_train["target"] = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

df_test = pd.DataFrame({})
df_test["category"] = ["A", "B", "C", "D", "NewCategory"]
df_test["category_representation"] = df_test["category"]
df_test["target"] = None

In [32]:
df_train.drop("category_representation", axis=1)

Unnamed: 0,category,target
0,A,0
1,A,1
2,A,0
3,A,1
4,B,0
5,B,1
6,B,0
7,C,1
8,C,0
9,D,1


In [33]:
df_test.drop("category_representation", axis=1)

Unnamed: 0,category,target
0,A,
1,B,
2,C,
3,D,
4,NewCategory,


In [34]:
encoder = OrdinalEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,1,0
1,A,1,1
2,A,1,0
3,A,1,1
4,B,2,0
5,B,2,1
6,B,2,0
7,C,3,1
8,C,3,0
9,D,4,1


Unnamed: 0,category,category_representation,target
0,A,1.0,
1,B,2.0,
2,C,3.0,
3,D,4.0,
4,NewCategory,-1.0,


In [35]:
encoder = OneHotEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation_1,category_representation_2,category_representation_3,category_representation_4,target
0,A,1,0,0,0,0
1,A,1,0,0,0,1
2,A,1,0,0,0,0
3,A,1,0,0,0,1
4,B,0,1,0,0,0
5,B,0,1,0,0,1
6,B,0,1,0,0,0
7,C,0,0,1,0,1
8,C,0,0,1,0,0
9,D,0,0,0,1,1


Unnamed: 0,category,category_representation_1,category_representation_2,category_representation_3,category_representation_4,target
0,A,1,0,0,0,
1,B,0,1,0,0,
2,C,0,0,1,0,
3,D,0,0,0,1,
4,NewCategory,0,0,0,0,


In [36]:
class FrequencyEncoder:
    def __init__(self, cols):
        self.cols = cols
        self.counts_dict = None

    def fit(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        counts_dict = {}
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict[col] = dict(zip(values, counts))
        self.counts_dict = counts_dict

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        counts_dict_test = {}
        res = []
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict_test[col] = dict(zip(values, counts))

            # if value is in "train" keys - replace "test" counts with "train" counts
            for k in [key for key in counts_dict_test[col].keys() if key in self.counts_dict[col].keys()]:
                counts_dict_test[col][k] = self.counts_dict[col][k]

            res.append(X[col].map(counts_dict_test[col]).values.reshape(-1, 1))
        res = np.hstack(res)

        X[self.cols] = res
        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        self.fit(X, y)
        X = self.transform(X)
        return X

In [39]:
encoder = FrequencyEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,4,0
1,A,4,1
2,A,4,0
3,A,4,1
4,B,3,0
5,B,3,1
6,B,3,0
7,C,2,1
8,C,2,0
9,D,1,1


Unnamed: 0,category,category_representation,target
0,A,4,
1,B,3,
2,C,2,
3,D,1,
4,NewCategory,1,


In [40]:
encoder = TargetEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.5,0
1,A,0.5,1
2,A,0.5,0
3,A,0.5,1
4,B,0.3532,0
5,B,0.3532,1
6,B,0.3532,0
7,C,0.5,1
8,C,0.5,0
9,D,0.5,1


Unnamed: 0,category,category_representation,target
0,A,0.5,
1,B,0.3532,
2,C,0.5,
3,D,0.5,
4,NewCategory,0.5,


In [41]:
encoder = WOEEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.0,0
1,A,0.0,1
2,A,0.0,0
3,A,0.0,1
4,B,-0.405465,0
5,B,-0.405465,1
6,B,-0.405465,0
7,C,0.0,1
8,C,0.0,0
9,D,0.0,1


Unnamed: 0,category,category_representation,target
0,A,0.0,
1,B,-0.405465,
2,C,0.0,
3,D,0.0,
4,NewCategory,0.0,


In [42]:
encoder = JamesSteinEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.5,0
1,A,0.5,1
2,A,0.5,0
3,A,0.5,1
4,B,0.363636,0
5,B,0.363636,1
6,B,0.363636,0
7,C,0.5,1
8,C,0.5,0
9,D,1.0,1


Unnamed: 0,category,category_representation,target
0,A,0.5,
1,B,0.363636,
2,C,0.5,
3,D,1.0,
4,NewCategory,1.0,


In [44]:
# \hat{x}^{k} = \frac{n^{+} + prior * m}{y^{+} + m}

In [46]:
df_train = pd.DataFrame({})
df_train["category"] = ["A", "A", "A", "A", "B", "B", "B", "C", "C"]
df_train["category_representation"] = df_train["category"]
df_train["target"] = [1, 0, 1, 1, 1, 1, 0, 1, 1]

In [47]:
encoder = MEstimateEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.472222,1
1,A,0.472222,0
2,A,0.472222,1
3,A,0.472222,1
4,B,0.347222,1
5,B,0.347222,1
6,B,0.347222,0
7,C,0.347222,1
8,C,0.347222,1


Unnamed: 0,category,category_representation,target
0,A,0.777778,
1,B,0.777778,
2,C,0.777778,
3,D,0.777778,
4,NewCategory,0.777778,


In [49]:
encoder = CatBoostEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.777778,1
1,A,0.888889,0
2,A,0.592593,1
3,A,0.694444,1
4,B,0.777778,1
5,B,0.888889,1
6,B,0.925926,0
7,C,0.777778,1
8,C,0.888889,1


Unnamed: 0,category,category_representation,target
0,A,0.777778,
1,B,0.777778,
2,C,0.777778,
3,D,0.777778,
4,NewCategory,0.777778,


In [50]:
### cb
# train:
# \hat{x}^{k}_{i} = \frac{\sum_{j=0}^{j \leqslant i}(y_{j} * (x_{j}==k)) - y_{i} + prior}{\sum_{j=0}^{j \leqslant i}x_{j}==k}
# test: 
# \hat{x}^{k} = \frac{\sum (y_{j} * (x_{j}==k)) + prior}{\sum x_{j}==k}

### loo
# train: 
# \hat{x}^{k}_{i} = \frac{\sum_{j \neq i}(y_{j} * (x_{j}==k)) - y_{i}}{\sum_{j \neq i}x_{j}==k}
# test:
# \hat{x}^{k} = \frac{\sum (y_{j} * (x_{j}==k))}{\sum x_{j}==k}
# opt th:
# t^{k} = \frac{\sum (y_{j} * (x_{j}==k)) - 0.5}{\sum x_{j}==k}

In [52]:
encoder = HelmertEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp.drop("intercept", axis=1))
temp = encoder.transform(df_test)
display(temp.drop("intercept", axis=1))

Unnamed: 0,category,category_representation_0,category_representation_1,category_representation_2,category_representation_3,target
0,A,-1.0,-1.0,-1.0,-1.0,1
1,A,-1.0,-1.0,-1.0,-1.0,0
2,A,-1.0,-1.0,-1.0,-1.0,1
3,A,-1.0,-1.0,-1.0,-1.0,1
4,B,1.0,-1.0,-1.0,-1.0,1
5,B,1.0,-1.0,-1.0,-1.0,1
6,B,1.0,-1.0,-1.0,-1.0,0
7,C,0.0,2.0,-1.0,-1.0,1
8,C,0.0,2.0,-1.0,-1.0,1


Unnamed: 0,category,category_representation_0,category_representation_1,category_representation_2,category_representation_3,target
0,A,0.0,0.0,0.0,4.0,
1,B,0.0,0.0,0.0,4.0,
2,C,0.0,0.0,0.0,4.0,
3,D,0.0,0.0,0.0,4.0,
4,NewCategory,0.0,0.0,0.0,4.0,


In [53]:
# js
# \hat{x}^{k} = (1-B) * \frac{n^{+}}{n} + B * \frac{y^{+}}{y}

# $$\newcommand{\Var}{\operatorname{Var}}
# B = \frac{\Var[y^{k}]}{\Var[y^{k}] + \Var[y]}

In [55]:
encoder = LeaveOneOutEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp)
temp = encoder.transform(df_test)
display(temp)

Unnamed: 0,category,category_representation,target
0,A,0.666667,1
1,A,1.0,0
2,A,0.666667,1
3,A,0.666667,1
4,B,0.5,1
5,B,0.5,1
6,B,1.0,0
7,C,1.0,1
8,C,1.0,1


Unnamed: 0,category,category_representation,target
0,A,0.777778,
1,B,0.777778,
2,C,0.777778,
3,D,0.777778,
4,NewCategory,0.777778,


In [56]:
encoder = SumEncoder(cols=["category_representation"])
temp = encoder.fit_transform(df_train, df_train["target"])
display(temp.drop("intercept", axis=1))
temp = encoder.transform(df_test)
display(temp.drop("intercept", axis=1))

Unnamed: 0,category,category_representation_0,category_representation_1,target
0,A,1.0,0.0,1
1,A,1.0,0.0,0
2,A,1.0,0.0,1
3,A,1.0,0.0,1
4,B,0.0,1.0,1
5,B,0.0,1.0,1
6,B,0.0,1.0,0
7,C,-1.0,-1.0,1
8,C,-1.0,-1.0,1


Unnamed: 0,category,category_representation_0,category_representation_1,target
0,A,0.0,0.0,
1,B,0.0,0.0,
2,C,0.0,0.0,
3,D,0.0,0.0,
4,NewCategory,0.0,0.0,
