In [1]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder


In [2]:
# For reproducibility: seed NumPy's global random generator from a single
# named constant so the value is easy to find and change.
SEED = 1000
np.random.seed(SEED)

In [3]:
# Sample ten labels at random from the two categories
Y = np.random.choice(['Male', 'Female'], size=10)

# Fit the encoder on the labels, then map every label to its integer code
print('Label encoding')
le = LabelEncoder().fit(Y)
yt = le.transform(Y)
print(yt)

Label encoding
[0 0 0 1 0 1 1 0 0 1]


In [4]:
# Decode a dummy output
print('Label decoding')
output = [1, 0, 1, 1, 0, 0]
# Use the encoder's own inverse mapping instead of indexing classes_ by hand.
# tolist() converts the NumPy strings to built-in str, so the printed list
# reads ['Male', 'Female', ...] rather than NumPy scalar reprs.
decoded_output = le.inverse_transform(output).tolist()
print(decoded_output)

Label decoding
['Male', 'Female', 'Male', 'Male', 'Female', 'Female']


In [5]:
# Binarize the labels: fit the binarizer on Y, then transform Y with it
print('Label binarization')
lb = LabelBinarizer().fit(Y)
yb = lb.transform(Y)
print(yb)

Label binarization
[[0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]]


In [6]:
# Decode the binarized labels: map the binarized array yb back to the
# original string labels with the fitted LabelBinarizer.
print('Label decoding')
# Bare last expression: its value is shown as the cell's output
lb.inverse_transform(yb)

Label decoding


array(['Female', 'Female', 'Female', 'Male', 'Female', 'Male', 'Male',
       'Female', 'Female', 'Male'], dtype='<U6')

In [7]:
# Sample records as feature-name -> value mappings; each record only
# carries a subset of the four features.
data = [
    dict(feature_1=10, feature_2=15),
    dict(feature_1=-5, feature_3=22),
    dict(feature_3=-2, feature_4=10),
]

In [8]:
# Vectorize the dictionary data: each distinct key becomes one column
print('Dictionary data vectorization')
dv = DictVectorizer()
Y_dict = dv.fit_transform(data)
# toarray() returns a plain ndarray; todense() would return np.matrix,
# which NumPy no longer recommends. The printed 2-D output is the same.
print(Y_dict.toarray())
# vocabulary_ maps each feature name to its column index
print('Vocabulary:')
print(dv.vocabulary_)

Dictionary data vectorization
[[10. 15.  0.  0.]
 [-5.  0. 22.  0.]
 [ 0.  0. -2. 10.]]
Vocabulary:
{'feature_1': 0, 'feature_2': 1, 'feature_3': 2, 'feature_4': 3}


In [9]:
# Feature hashing: feature names are hashed straight to column indices
print('Feature hashing')
fh = FeatureHasher()
Y_hashed = fh.fit_transform(data)

# Display the hashed matrix. Hashing is one-way, so nothing is really
# "decoded" here; this only densifies the sparse result for printing.
# With the default FeatureHasher settings the matrix is very wide
# (n_features defaults to 2**20 per the scikit-learn docs), hence the
# elided, all-zero-looking printout.
print('Feature decoding')
# toarray() returns a plain ndarray instead of the discouraged np.matrix
print(Y_hashed.toarray())

Feature hashing
Feature decoding
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
# One-hot encoding input: five rows of [value in {0, 1}, integer value]
data1 = [[0, 10], [1, 11], [1, 8], [0, 12], [0, 15]]

In [11]:
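# The lines below no longer work: OneHotEncoder's categorical_features
# parameter was removed in scikit-learn 0.22.
# Encode data
# categorical_features took column indices (or a boolean mask) selecting which columns of data1 to one-hot encode.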
# oh = OneHotEncoder(categorical_features=[0])
# Y_oh = oh.fit_transform(data1)
# print(Y_oh.todense())

In [13]:
#sklearn.__version__

In [14]:
# OneHotEncoder's categorical_features parameter was deprecated in
# scikit-learn 0.20 and removed in 0.22; ColumnTransformer (imported in the
# notebook's top import cell) is the replacement.
# Column 0 is expanded into one-hot columns, while the remaining columns are
# passed through unchanged.
ct = ColumnTransformer([('oh', OneHotEncoder(), [0])], remainder="passthrough")
ct.fit_transform(data1)

array([[1.0, 0.0, 10],
       [0.0, 1.0, 11],
       [0.0, 1.0, 8],
       [1.0, 0.0, 12],
       [1.0, 0.0, 15]], dtype=object)