In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, Normalizer, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

<a name='catvars'></a>
# Processing Categorical Variables

__Ordinal__ vs __OneHot__ vs __Dummy__ Encoding [[link]](https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/)

<a name="ordinalenc"></a>
## Ordinal Encoding
`OrdinalEncoder` [[documentation]](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html) 

* Each unique category value is assigned an integer value.
* For ordinal variables.
* It expects 2D input (matrix).

In [2]:
# define data as 2D array
data = np.asarray([['red'], ['blue'], ['green']])
print(f"Data:\n{data}\n\nShape:\n{data.shape}\n")

# instantiate encoder
enc = OrdinalEncoder()

# transform data
data_encoded = enc.fit_transform(data)
print(f"Ordinaly encoced data:\n{data_encoded}")

Data:
[['red']
 ['blue']
 ['green']]

Shape:
(3, 1)

Ordinaly encoced data:
[[2.]
 [0.]
 [1.]]


<a name="labelenc"></a>
## Target Encoding
`LabelEncoder` [[documentation]](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) 

* Each unique category value is assigned an integer value.
* It expects 1D input.

In [3]:
# define data as 1D array
data_1D = np.asarray(['red', 'blue', 'green'])
print(f"Data:\n{data_1D}\n\nShape:\n{data_1D.shape}\n")

# instantiate encoder
enc = LabelEncoder()

# fit_transform labels
targets_encoded = enc.fit_transform(data_1D)
print(f"Ordinaly encoced data:\n{data_encoded}")

Data:
['red' 'blue' 'green']

Shape:
(3,)

Ordinaly encoced data:
[[2.]
 [0.]
 [1.]]


<a name="onehotenc"></a>
## One-Hot Encoding
`OneHotEncoder` [[documentation]](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) 

* First the categories are sorted, then binary variables are created for each category in turn. 
* Can manually specify labels via the `categories` argument as a list.
* Can handle labels that was not seen during training via the `handle_unknown` argument.

In [4]:
# one-hot encoding

print(f"Data:\n{data}\n\nShape:\n{data.shape}\n")
# instantiate encoder
enc = OneHotEncoder(sparse=False)

# transform data
data_encoded = enc.fit_transform(data)
print(f"One-Hot encoded data:\n{data_encoded}")

Data:
[['red']
 ['blue']
 ['green']]

Shape:
(3, 1)

One-Hot encoded data:
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]]


<a name="dummyenc"></a>
## Dummy Variable Encoding
`OneHotEncoder` [[documentation]](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) 

* "Solves" the redundancy issue of One-Hot encoding as it represents $C$ categories with $C-1$ binary variables.
* Dummy variable representation is required for some models (linear regression).
* The `drop` argument can be set to indicate which category will be come the one that is assigned all zero values, called the “baseline“. We can set this to `"first"` so that the first category is used.

In [5]:
print(f"Data:\n{data}\n\nShape:\n{data.shape}\n")

# instantiate encoder
enc = OneHotEncoder(drop='first', sparse=False)

# transform data
data_encoded = enc.fit_transform(data)
print(f"Dummy-encoded data:\n{data_encoded}")

Data:
[['red']
 ['blue']
 ['green']]

Shape:
(3, 1)

Dummy-encoded data:
[[0. 1.]
 [0. 0.]
 [1. 0.]]


<a name="getdumenc"></a>
## Get Dummies
`pd.get_dummies` [[documentation]](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html) 

In [6]:
print(f"Data:\n{data_1D}\n\nShape:\n{data_1D.shape}\n")

# transform data
data_encoded = pd.get_dummies(data_1D)
print(f"Dummy-encoded data:\n{data_encoded}")

Data:
['red' 'blue' 'green']

Shape:
(3,)

Dummy-encoded data:
   blue  green  red
0     0      0    1
1     1      0    0
2     0      1    0
