[Reference](https://medium.com/@chyun55555/data-preprocessing-methods-with-scikit-learn-python-98437e8d93cb)

# 1. Data Encoding
## a) Label Encoding

In [7]:
import pandas as pd

In [1]:
basket = ['apple', 'orange', 'grape', 'strawberry', 'melon', 'plum', 'banana', 'melon', 'plum', 'plum', 'grape', 'watermelon', 'melon', 'orange']
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = encoder.fit_transform(basket)
print(labels)

[0 4 2 6 3 5 1 3 5 5 2 7 3 4]


In [2]:
for i in range(len(encoder.classes_)):
    print(f'{i} : {encoder.classes_[i]}')

0 : apple
1 : banana
2 : grape
3 : melon
4 : orange
5 : plum
6 : strawberry
7 : watermelon


In [4]:
encoder = LabelEncoder()
encoder.fit(basket)
labels = encoder.fit_transform(basket)
encoder.inverse_transform(labels)

array(['apple', 'orange', 'grape', 'strawberry', 'melon', 'plum',
       'banana', 'melon', 'plum', 'plum', 'grape', 'watermelon', 'melon',
       'orange'], dtype='<U10')

## b) One-Hot Encoding


In [5]:
basket = ['apple', 'orange', 'grape', 'strawberry', 'melon', 'plum', 'banana', 'melon', 'plum', 'plum', 'grape', 'watermelon', 'melon', 'orange']
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
basket = ['apple', 'orange', 'grape', 'strawberry', 'melon', 'plum', 'banana', 'melon', 'plum', 'plum', 'grape', 'watermelon', 'melon', 'orange']
encoder = LabelEncoder()
labels = encoder.fit_transform(basket).reshape(-1, 1)
onehot_encoder = OneHotEncoder()
onehot_labels = onehot_encoder.fit_transform(labels)
onehot_labels.toarray()

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.]])

In [8]:
basket_df = pd.DataFrame(basket, columns = ['Fruit'])
pd.get_dummies(basket_df)

Unnamed: 0,Fruit_apple,Fruit_banana,Fruit_grape,Fruit_melon,Fruit_orange,Fruit_plum,Fruit_strawberry,Fruit_watermelon
0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0
6,0,1,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0
8,0,0,0,0,0,1,0,0
9,0,0,0,0,0,1,0,0


# 2. Feature Scaling


In [9]:
from sklearn.datasets import load_iris
import pandas as pd
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns = iris.feature_names)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## a) StandardScaler()


In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standard_iris = scaler.fit_transform(iris_df)
standard_iris = pd.DataFrame(standard_iris, columns = iris.feature_names)
standard_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [11]:
standard_iris.mean()

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

In [12]:
standard_iris.var()

sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64

## b) MinMaxScaler()


In [13]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
minmax_iris = minmax.fit_transform(iris_df)
minmax_iris = pd.DataFrame(minmax_iris, columns=iris.feature_names)
minmax_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [14]:
print(minmax_iris.min(), minmax_iris.max())

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64 sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64
