Machine Learning Mastery
    
    AUTHOR: Dr. Jason Brownlee 

### Ordinal and One-Hot Encodings for Categorical Data

In [3]:
# Optional: when running on Google Colab, uncomment the two lines below to
# mount Google Drive at /content/drive.
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
import os

# Folder that holds 'breast-cancer.csv', which later cells read by relative
# path. Set the ENCODING_DATA_DIR environment variable to point somewhere else
# without editing the notebook; the original local path remains the default.
DIR = os.environ.get(
    "ENCODING_DATA_DIR",
    r"C:\Coding\Masters\DataScience\Semana 6\Machine-Learning-Mastery_Encoding",
)
# Change directory only when the folder exists, so this cell does not raise
# FileNotFoundError on machines where the default path is absent (for example
# when the CSV sits next to the notebook).
if os.path.isdir(DIR):
    os.chdir(DIR)
else:
    print(f"Directory not found, staying in {os.getcwd()!r}: {DIR!r}")

In [5]:
import pandas as pd
from numpy import asarray
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

**NOTA**. Sólo se incluye el código de la codificación, que es lo relevante al tema de Ingeniería de características

Se incluye un ejemplo de **codificación binaria**

## Ordinal Encoding

In [6]:
# Toy feature: three colour labels, one per row (a 3x1 column)
data = asarray(['red', 'green', 'blue']).reshape(-1, 1)
print(data)
# Ordinal encoding replaces every category with a single numeric code
encoder = OrdinalEncoder()
# Learn the categories first, then map the same data to its codes
encoder.fit(data)
result = encoder.transform(data)
print(result)

[['red']
 ['green']
 ['blue']]
[[2.]
 [1.]
 [0.]]


## One-Hot Encoding

In [7]:
# Toy feature: three colour labels arranged as a single column
data = asarray(['red', 'green', 'blue']).reshape(-1, 1)
print(data)
# One-hot encoding: one 0/1 indicator column per category.
# sparse_output=False returns a dense ndarray that prints readably.
encoder = OneHotEncoder(sparse_output=False)
# Fit on the data and encode it in a single chained expression
onehot = encoder.fit(data).transform(data)
print(onehot)

[['red']
 ['green']
 ['blue']]
[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


## Dummy Variable Encoding

In [8]:
# Toy feature: one single-element row per colour label
data = asarray([[colour] for colour in ('red', 'green', 'blue')])
print(data)
# Dummy-variable encoding: one-hot encoding with the first category's column
# dropped, so that category is represented by an all-zero row
encoder = OneHotEncoder(sparse_output=False, drop='first')
# Learn the categories, then encode the same data
encoder.fit(data)
onehot = encoder.transform(data)
print(onehot)

[['red']
 ['green']
 ['blue']]
[[0. 1.]
 [1. 0.]
 [0. 0.]]


## Binary Encoding

Binary encoding is a categorical encoding technique that uses binary code – that is, a sequence of zeroes and ones – to represent the different categories of the variable. 

Binary encoding represents the data in fewer dimensions than one-hot encoding. In general, the number of binary features needed to encode a variable is log2(number of distinct categories), rounded up to the next integer. This is particularly useful for high-cardinality variables. For example, if a variable contains 128 unique categories, one-hot encoding (with one redundant column dropped) needs 127 features, whereas binary encoding needs only 7 (log2(128) = 7).

In [9]:
# !pip install category_encoders
from category_encoders.binary import BinaryEncoder

In [10]:
# Toy feature: three colour labels, turned into a column via a new trailing axis
data = asarray(['red', 'green', 'blue'])[:, None]
print(data)
# Binary encoding: each category is written as a pattern of 0/1 digits spread
# over a small number of columns
encoder = BinaryEncoder()
# Fit and encode in one step; the encoder returns a DataFrame
result = encoder.fit_transform(data)
print(result)

[['red']
 ['green']
 ['blue']]
   0_0  0_1
0    0    1
1    1    0
2    1    1


# Breast Cancer Dataset
## OrdinalEncoder Transform

In [11]:
# Read the CSV; header=None because the file has no header row, so the columns
# get integer labels
dataset = pd.read_csv(filepath_or_buffer='breast-cancer.csv', header=None)
# Keep the underlying values as a NumPy array for positional slicing
data = dataset.values

In [12]:
# Split the raw array column-wise: every column except the last is an input
# feature and the last column is the target. Both parts are cast to str.
last_col = data.shape[1] - 1
X = data[:, :last_col].astype(str)
y = data[:, last_col].astype(str)

In [13]:
# Fit an ordinal encoder on the string features, then overwrite X with the
# encoded values (one output column per input column)
ordinal_encoder = OrdinalEncoder().fit(X)
X = ordinal_encoder.transform(X)

In [14]:
# Encode the target labels as integer class codes, overwriting y
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [15]:
# Summarize the transformed data: for the inputs and then the target, print
# the shape followed by the first five entries
for label, values in (('Input', X), ('Output', y)):
    print(label, values.shape)
    print(values[:5])

Input (286, 9)
[[2. 2. 2. 0. 1. 2. 1. 2. 0.]
 [3. 0. 2. 0. 0. 0. 1. 0. 0.]
 [3. 0. 6. 0. 0. 1. 0. 1. 0.]
 [2. 2. 6. 0. 1. 2. 1. 1. 1.]
 [2. 2. 5. 4. 1. 1. 0. 4. 0.]]
Output (286,)
[1 0 1 0 1]


In [16]:
# Same example as above, this time working on the DataFrame directly.
# The columns to encode are every column except the target 'Class'.
categorical_variables = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]
# Name the DataFrame columns: the nine features followed by the target
dataset.columns = categorical_variables + ['Class']
# Fit the encoder on the selected columns, then encode them
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(dataset[categorical_variables])
ordinal_ar = ordinal_encoder.transform(dataset[categorical_variables])
# The encoder returns an ndarray; wrap it in a DataFrame with the same names
ordinal_df = pd.DataFrame(data=ordinal_ar, columns=categorical_variables)
ordinal_df

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,2.0,2.0,2.0,0.0,1.0,2.0,1.0,2.0,0.0
1,3.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3.0,0.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0
3,2.0,2.0,6.0,0.0,1.0,2.0,1.0,1.0,1.0
4,2.0,2.0,5.0,4.0,1.0,1.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...
281,3.0,0.0,5.0,5.0,1.0,1.0,0.0,1.0,0.0
282,3.0,2.0,4.0,4.0,1.0,1.0,0.0,1.0,1.0
283,1.0,2.0,5.0,5.0,1.0,1.0,1.0,4.0,0.0
284,3.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0


# Breast Cancer Dataset
## OneHotEncoder Transform

In [36]:
# Reload the CSV from scratch (header=None: the file has no header row)
dataset = pd.read_csv(filepath_or_buffer='breast-cancer.csv', header=None)
# Give the columns descriptive names: nine features followed by the target
dataset.columns = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat', 'Class',
]
# Keep the underlying values as a NumPy array for positional slicing
data = dataset.values
dataset

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
...,...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no','no-recurrence-events'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes','no-recurrence-events'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no','no-recurrence-events'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no','no-recurrence-events'


In [37]:
# Inputs are all columns but the last; the target is the last column.
# Each part is cast to str as it is unpacked.
X, y = (part.astype(str) for part in (data[:, :-1], data[:, -1]))

In [38]:
# One-hot encode the inputs, dropping the first category of every feature
# (dummy-variable encoding); sparse_output=False gives a dense ndarray
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
onehot_encoder.fit(X)
# Overwrite X with its encoded form and display it
X = onehot_encoder.transform(X)
X

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.]], shape=(286, 34))

In [39]:
# Encode the target labels as integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(y)
# Overwrite y with the encoded labels and display them
y = label_encoder.transform(y)
y

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
# Report the encoded input matrix: its shape, then its first five rows
print('Input', X.shape)
print(X[:5])

Input (286, 34)
[[0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 1. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 1. 1. 1. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
  0. 1. 0. 0. 0. 0. 0. 1. 0. 0.]]


In [33]:
# Display the `dataset` DataFrame as this cell's output.
# NOTE(review): the stored output shows integer column labels 0-9 even though
# an earlier cell assigns names to dataset.columns, and this cell is numbered
# In [33], out of sequence -- the output looks stale; re-run to confirm.
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
...,...,...,...,...,...,...,...,...,...,...
281,'50-59','ge40','30-34','6-8','yes','2','left','left_low','no','no-recurrence-events'
282,'50-59','premeno','25-29','3-5','yes','2','left','left_low','yes','no-recurrence-events'
283,'30-39','premeno','30-34','6-8','yes','2','right','right_up','no','no-recurrence-events'
284,'50-59','premeno','15-19','0-2','no','2','right','left_low','no','no-recurrence-events'


In [40]:
# Same example as above, this time working on the DataFrame directly.
# Columns to encode: every feature column (the target 'Class' is left out).
categorical_variables = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]
# Dummy-variable encoding: one-hot with the first category of each column dropped
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
onehot_encoder.fit(dataset[categorical_variables])
onehot_ar = onehot_encoder.transform(dataset[categorical_variables])
# The encoder returns an ndarray; wrap it in a DataFrame whose column labels
# are the feature names generated by the encoder
onehot_df = pd.DataFrame(onehot_ar, columns=onehot_encoder.get_feature_names_out())
onehot_df

Unnamed: 0,age_'30-39',age_'40-49',age_'50-59',age_'60-69',age_'70-79',menopause_'lt40',menopause_'premeno',tumor-size_'10-14',tumor-size_'15-19',tumor-size_'20-24',...,node-caps_nan,deg-malig_'2',deg-malig_'3',breast_'right',breast-quad_'left_low',breast-quad_'left_up',breast-quad_'right_low',breast-quad_'right_up',breast-quad_nan,irradiat_'yes'
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
282,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
283,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
284,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


# Breast Cancer Dataset
## BinaryEncoder Transform

In [23]:
# Columns to encode: every feature column (the target 'Class' is left out)
categorical_variables = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'deg-malig', 'breast', 'breast-quad', 'irradiat',
]
# Reload the CSV (no header row) and name the columns: features, then target
dataset = pd.read_csv(filepath_or_buffer='breast-cancer.csv', header=None)
dataset.columns = categorical_variables + ['Class']
binary_encoder = BinaryEncoder()
# The binary encoder returns a DataFrame directly, with generated column
# names, so no manual conversion is needed
binary_df = binary_encoder.fit_transform(dataset[categorical_variables])
binary_df

Unnamed: 0,age_0,age_1,age_2,menopause_0,menopause_1,tumor-size_0,tumor-size_1,tumor-size_2,tumor-size_3,inv-nodes_0,...,node-caps_1,deg-malig_0,deg-malig_1,breast_0,breast_1,breast-quad_0,breast-quad_1,breast-quad_2,irradiat_0,irradiat_1
0,0,0,1,0,1,0,0,0,1,0,...,1,0,1,0,1,0,0,1,0,1
1,0,1,0,1,0,0,0,0,1,0,...,0,1,0,0,1,0,1,0,0,1
2,0,1,0,1,0,0,0,1,0,0,...,0,1,1,1,0,0,1,1,0,1
3,0,0,1,0,1,0,0,1,0,0,...,1,0,1,0,1,0,1,1,1,0
4,0,0,1,0,1,0,0,1,1,0,...,1,1,1,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,0,1,0,1,0,0,0,1,1,1,...,1,1,1,1,0,0,1,1,0,1
282,0,1,0,0,1,0,1,0,0,0,...,1,1,1,1,0,0,1,1,1,0
283,1,0,0,0,1,0,0,1,1,1,...,1,1,1,0,1,1,0,0,0,1
284,0,1,0,0,1,0,0,0,1,0,...,0,1,1,0,1,0,1,1,0,1


In [24]:
# Show how many distinct values the 'age' column has, then list them
age_values = dataset['age']
print(age_values.nunique())
print(age_values.unique())

6
["'40-49'" "'50-59'" "'60-69'" "'30-39'" "'70-79'" "'20-29'"]


The 6 categories in `age` were transformed into:

*   (6 - 1) = 5 columns with onehot encoding (`age_'30-39'`, `age_'40-49'`, `age_'50-59'`, `age_'60-69'`, `age_'70-79'`)
*   ⌈log2(6)⌉ = ⌈2.5849...⌉ = 3 columns with binary encoding (`age_0`, `age_1`, `age_2`)

