# Preprocessing: encoding categorical features as numbers

In [1]:
from sklearn import preprocessing

In [48]:
# Toy dataset: each inner list is one sample made of three string-valued
# categorical features.
game = [
    ['first', 'silver', 'bottom'],
    ['second', 'gold', 'top'],
    ['third', 'bronze', 'middle'],
]

## `OrdinalEncoder()`

In [4]:
# OrdinalEncoder with default settings: no categories are specified here, so the
# categories of each feature are learned from the data passed to fit().
ordinal_encoder = preprocessing.OrdinalEncoder()

In [5]:
ordinal_encoder.fit(game)

`categories_` below lists the categories learned for each feature; each value is encoded as its index in that list. <br>
By default the categories are ordered automatically (sorted, i.e. alphabetically for strings). That is why the medal feature does not follow the natural gold, silver, bronze order.

In [6]:
ordinal_encoder.categories_

[array(['first', 'second', 'third'], dtype=object),
 array(['bronze', 'gold', 'silver'], dtype=object),
 array(['bottom', 'middle', 'top'], dtype=object)]

In [11]:
ordinal_encoder.transform([['third', 'gold', 'bottom']])

array([[2., 1., 0.]])

### Ordering the category

In [12]:
# Explicit category order for each feature. When these lists are passed to an
# encoder, a category's position in its list determines its encoding
# (e.g. 'gold' is at index 0 of `medal`, 'bottom' at index 2 of `rank`).
winner = ['first', 'second', 'third']
medal = ['gold', 'silver', 'bronze']
rank = ['top', 'middle', 'bottom']

In [16]:
# Pass one explicit category list per feature, in the same column order as the rows
# of `game`, so the encoded integers follow the order of these lists instead of the
# automatically learned order.
ordinal_encoder_custom_order = preprocessing.OrdinalEncoder(categories=[winner, medal, rank])
ordinal_encoder_custom_order.fit(game)

In [18]:
# Encode the same sample with both encoders to compare the results:
# 'gold' and 'bottom' get different codes under the custom order, while 'third'
# is encoded as 2 by both.
print('OrdinalEncoder with auto:',ordinal_encoder.transform([['third', 'gold', 'bottom']]))
print('OrdinalEncoder with cutom:',ordinal_encoder_custom_order.transform([['third', 'gold', 'bottom']]))

OrdinalEncoder with auto: [[2. 1. 0.]]
OrdinalEncoder with cutom: [[2. 0. 2.]]


## `OneHotEncoder()`

In [20]:
game

[['first', 'silver', 'bottom'],
 ['second', 'gold', 'top'],
 ['third', 'bronze', 'middle']]

In [21]:
# OneHotEncoder with default settings: no categories are specified, so the
# categories of each feature are learned from `game` during fit().
one_hot_encoder = preprocessing.OneHotEncoder()
one_hot_encoder.fit(game)

In [22]:
one_hot_encoder.categories_

[array(['first', 'second', 'third'], dtype=object),
 array(['bronze', 'gold', 'silver'], dtype=object),
 array(['bottom', 'middle', 'top'], dtype=object)]

In [23]:
one_hot_encoder.transform([['third', 'gold', 'bottom']])

<1x9 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

Here a separate column is created for each category of each feature. <br>
If the sample has that category the value is 1, otherwise the value is 0.

In [26]:
one_hot_encoder.transform([['third', 'gold', 'bottom']]).toarray()

array([[0., 0., 1., 0., 1., 0., 1., 0., 0.]])

### Ordering the category

In [27]:
# Explicit per-feature category lists fix the order of the one-hot columns
# within each feature (winner columns first, then medal, then rank).
one_hot_encoder_custom_order = preprocessing.OneHotEncoder(categories=[winner, medal, rank])
one_hot_encoder_custom_order.fit(game)

In [29]:
# Same sample through both encoders: the 1s for 'gold' and 'bottom' land in
# different columns because the category order within those features differs.
print('OneHotEncoder with auto:',one_hot_encoder.transform([['third', 'gold', 'bottom']]).toarray())
print('OneHotEncoder with cutom:',one_hot_encoder_custom_order.transform([['third', 'gold', 'bottom']]).toarray())

OneHotEncoder with auto: [[0. 0. 1. 0. 1. 0. 1. 0. 0.]]
OneHotEncoder with cutom: [[0. 0. 1. 1. 0. 0. 0. 0. 1.]]


### What happens if we pass a category that was not seen during fit?

In [30]:
# `one_hot_encoder` was created with default settings, so transform() raises a
# ValueError for a category that was not seen during fit ('platinum').
# The error is caught and its message printed so the demonstration stays visible
# without leaving a cell that aborts "Restart & Run All".
try:
    one_hot_encoder.transform([['third', 'platinum', 'bottom']]).toarray()
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Found unknown categories ['platinum'] in column 1 during transform

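#### To avoid the error when passing an unknown category, set the parameter `handle_unknown` = `'ignore'` <br>
For `OneHotEncoder`, `handle_unknown` accepts ‘error’ (the default) and ‘ignore’; newer scikit-learn versions also accept ‘infrequent_if_exist’ and ‘warn’. Note that ‘use_encoded_value’ is an option of `OrdinalEncoder`, not of `OneHotEncoder`.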

In [34]:
# handle_unknown='ignore': a category that is not in a feature's list no longer
# raises during transform(); that feature's one-hot columns are all left at 0.
one_hot_encoder_unknown_cat = preprocessing.OneHotEncoder(categories=[winner, medal, rank], handle_unknown='ignore')
one_hot_encoder_unknown_cat.fit(game)

The unknown category of the 2nd feature is simply ignored: all of the 2nd feature's columns are set to 0, while the other features are encoded as usual.

In [37]:
one_hot_encoder_unknown_cat.transform([['third', 'platinum', 'bottom']]).toarray()

array([[0., 0., 1., 0., 0., 0., 0., 0., 1.]])

## What if a feature is binary (has only two categories)? <br>
e.g. True/False or male/female

For that we use the parameter `drop` = `'if_binary'`, which encodes a feature with exactly two categories as a single 0/1 column instead of two columns.

In [49]:
game

[['first', 'silver', 'bottom'],
 ['second', 'gold', 'top'],
 ['third', 'bronze', 'middle']]

In [50]:
# Add a fourth, binary feature to every row, alternating 'False' / 'True' by row index.
# Each row is rebuilt from its first three (original) columns instead of being
# appended to in place, so re-running this cell does not keep adding columns to `game`.
select = ['False', 'True']
game = [row[:3] + [select[i % len(select)]] for i, row in enumerate(game)]
    

In [51]:
game

[['first', 'silver', 'bottom', 'False'],
 ['second', 'gold', 'top', 'True'],
 ['third', 'bronze', 'middle', 'False']]

In [52]:
# `select` supplies the categories of the fourth (binary) feature.
# drop='if_binary' affects only features with exactly two categories, i.e. the
# fourth one here; the three-category features keep all of their columns.
enc_binary_category = preprocessing.OneHotEncoder(categories=[winner, medal, rank, select], drop='if_binary')
enc_binary_category.fit(game)

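Here the total number of categories is 11 (3 + 3 + 3 + 2), but the encoded array has only 10 values: because of `drop='if_binary'`, the binary feature is represented by a single column (1 for 'True', 0 for 'False') instead of two.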

In [54]:
enc_binary_category.transform([['third', 'gold', 'bottom','True']]).toarray()

array([[0., 0., 1., 1., 0., 0., 0., 0., 1., 1.]])

## Dealing with categorical values stored in dictionaries

## `from sklearn.feature_extraction import DictVectorizer`

In [58]:
# One dict per sample: 'fruit' holds a string (categorical) value and
# 'weight' holds a float.
plants = [
    {'fruit': 'pear', 'weight': 178.},
    {'fruit': 'pomegranate', 'weight': 250.},
    {'fruit': 'cherry', 'weight': 5.}
]

In [59]:
from sklearn.feature_extraction import DictVectorizer

In [60]:
# DictVectorizer turns a list of feature dicts into a numeric matrix: string-valued
# entries become one-hot 'name=value' columns and numeric entries are kept as a
# single column (see the transformed array and feature_names_ in the following cells).
dict_enc = DictVectorizer()
dict_enc.fit(plants)

In [62]:
dict_enc.transform(plants).toarray()

array([[  0.,   1.,   0., 178.],
       [  0.,   0.,   1., 250.],
       [  1.,   0.,   0.,   5.]])

In [63]:
dict_enc.feature_names_

['fruit=cherry', 'fruit=pear', 'fruit=pomegranate', 'weight']

In [64]:
# DictVectorizer expects dict-like samples; `game` is a list of lists, so fit()
# fails with AttributeError ('list' object has no attribute 'items').
# The error is caught and its message printed so the demonstration stays visible
# without leaving a cell that aborts "Restart & Run All".
try:
    prec = DictVectorizer().fit(game)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'list' object has no attribute 'items'