# Categorical Encoding package 
- **Classic Encoders**: Ordinal, Label, One-Hot, Binary, and Hashing Encoder. 
- **Bayesian Encoders**:  Target and LeaveOneOut
    - These encoders output only one column, which eliminates the high-dimensionality concerns that sometimes affect other encoders.
- **Contrast Encoders**: Helmert, Sum, Backward Difference, or Polynomial

P.S.: One-Hot and LeaveOneOut are the most popular encoders, and for good reason.

<p align="center"><img src='../../assets/img/categorical-encoding-flowchart.png'></p>

## TODO
- [x] Leave One Out Encoder

## Reference
- [Encode Smarter: How to Easily Integrate Categorical Encoding into Your Machine Learning Pipeline](https://innovation.alteryx.com/encode-smarter/)

In [50]:
import category_encoders as ce
import pandas as pd
import numpy as np

In [51]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries used below; consider filtering specific categories instead.
warnings.filterwarnings('ignore')

In [52]:
# Load cat_train.csv, using its `id` column as the row index.
df = pd.read_csv('../input/dataset/cat_train.csv', index_col='id')

In [54]:
# Partition the columns by dtype: numeric predictors (with the `target` label
# removed) on one side, every non-numeric column on the other.
numeric_features = df.select_dtypes(include=[np.number]).columns.drop('target')
categorical_features = df.select_dtypes(exclude=[np.number]).columns

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [None]:
# List the distinct label values (the recorded output shows 0 and 1).
df['target'].unique()

array([0, 1])

### One Hot Encoder

In [None]:
# One-hot encoder; `use_cat_names=True` names the output columns after the
# category values (e.g. `nom_1_Star`) instead of numbering them.
onehot_enc = ce.OneHotEncoder(use_cat_names=True)

In [None]:
# Count the missing values in `nom_1`.
df['nom_1'].isna().sum()

18156

In [None]:
# Turn missing values into an explicit "NONE" category and force string dtype,
# overwriting the column in place.
df['nom_1']  = df['nom_1'].fillna("NONE").astype(str)

In [None]:
# Fit the one-hot encoder on `nom_1` and preview the resulting indicator columns.
onehot_enc.fit_transform(df['nom_1']).head()

Unnamed: 0_level_0,nom_1_Trapezoid,nom_1_Star,nom_1_NONE,nom_1_Circle,nom_1_Triangle,nom_1_Polygon,nom_1_Square
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0


### Binary Encoder

In [None]:
# Binary-encode `nom_2` and preview the encoded columns.
bin_enc = ce.BinaryEncoder()
nom_2_binary = bin_enc.fit_transform(df['nom_2'])
nom_2_binary.head()


Unnamed: 0_level_0,nom_2_0,nom_2_1,nom_2_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,1
1,0,1,0
2,0,0,1
3,0,0,1
4,0,0,1


## Experiments with different encoders

In [55]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier

In [56]:
# Separate the label column from the predictors.
y = df['target']
X = df.drop(columns='target')

In [57]:
# Map the label values onto consecutive integer class ids.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)

In [58]:
# Hold out 30% of the rows for evaluation.
# The split is seeded (like the classifier) so the encoder comparison is
# reproducible from run to run, and stratified so both partitions keep the
# same class balance as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [59]:
# Classifier used as the final pipeline step in the encoder comparison below;
# seeded so repeated fits on the same data are comparable.
selected_model = XGBClassifier(random_state=0)


In [61]:
# Registry of the encoder classes to benchmark, keyed by class name.
# Classes (not instances) are stored so each experiment can build a fresh encoder.
encoders = {
    encoder_cls.__name__: encoder_cls
    for encoder_cls in (
        ce.backward_difference.BackwardDifferenceEncoder,
        ce.basen.BaseNEncoder,
        ce.binary.BinaryEncoder,
        ce.cat_boost.CatBoostEncoder,
        ce.hashing.HashingEncoder,
        ce.helmert.HelmertEncoder,
        ce.james_stein.JamesSteinEncoder,
        ce.one_hot.OneHotEncoder,
        ce.leave_one_out.LeaveOneOutEncoder,
        ce.m_estimate.MEstimateEncoder,
        ce.ordinal.OrdinalEncoder,
        ce.polynomial.PolynomialEncoder,
        ce.sum_coding.SumEncoder,
        ce.target_encoder.TargetEncoder,
        ce.woe.WOEEncoder,
    )
}

In [62]:
# Benchmark every registered encoder inside an otherwise identical
# preprocessing + classifier pipeline and collect one metrics row per encoder.
#
# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and growing a frame row by
# row copies it on every iteration.
result_rows = []
for key, encoder_cls in encoders.items():
    categorical_transformer = Pipeline(
        steps=[
            # Missing categories become their own explicit 'missing' level.
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            # Fresh encoder instance with default settings for this experiment.
            ('encoder', encoder_cls()),
        ]
    )
    numeric_transformer = Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features),
        ]
    )

    pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', selected_model),
        ]
    )

    model = pipe.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # ROC AUC measures ranking quality, so it must be computed from continuous
    # scores (probability of the positive class) rather than hard 0/1 labels.
    y_score = model.predict_proba(X_test)[:, 1]

    row = {
        'encoder': key,
        'f1': f1_score(y_test, y_pred, average='macro'),
        'accuracy': accuracy_score(y_test, y_pred),
        'roc': roc_auc_score(y_test, y_score),
    }
    result_rows.append(row)

# Passing `columns` keeps the expected schema even if no encoder was evaluated.
df_results = pd.DataFrame(result_rows, columns=['encoder', 'f1', 'accuracy', 'roc'])