### Data Encoding <br>
Data encoding is the conversion of categorical variables into numerical representations that machine learning algorithms can work with.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read IRIS.csv (a relative path, resolved against the current working directory) into a DataFrame.
df = pd.read_csv('IRIS.csv')

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Label Encoding

In [5]:
# List the distinct class labels found in the species column.
df['species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [6]:
from sklearn import preprocessing

In [7]:
# Create a LabelEncoder, which assigns each distinct label an integer code.
label_enc = preprocessing.LabelEncoder()

In [8]:
# Learn the label -> integer mapping and return the encoded values in one step.
label_enc.fit_transform(df['species'])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# Store the integer codes next to the original labels as a new column.
df['Species Encoded'] = label_enc.fit_transform(df['species'])

In [11]:
# Preview the first five rows, now including the 'Species Encoded' column.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Species Encoded
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


In [12]:
# Distinct integer codes stored in the encoded column.
df['Species Encoded'].unique()

array([0, 1, 2])

#### Label encoding with pandas

In [13]:
# Hand-written label -> code lookup, numbered in the order the labels are listed.
# NOTE: the name `dict` shadows Python's built-in `dict`; it is left unchanged
# because the following cells look the mapping up by this name.
dict = {
    label: code
    for code, label in enumerate(
        ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
    )
}

In [14]:
# Translate every species label through the lookup table.
df['species'].map(dict)

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int64

In [15]:
# Store the mapped codes as their own column.
df['species_enc using map'] = df['species'].map(dict)

In [16]:
# Preview the first five rows, now including the 'species_enc using map' column.
df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Species Encoded,species_enc using map
0,5.1,3.5,1.4,0.2,Iris-setosa,0,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0,0


### One-hot Encoding

#### One-hot encoding with pandas `get_dummies`

In [20]:
# Build one indicator column per distinct species; dtype=int makes the indicators integers (0/1).
pd.get_dummies(df['species'], dtype=int)

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
145,0,0,1
146,0,0,1
147,0,0,1
148,0,0,1


In [22]:
# Keep the indicator columns in a variable so they can be combined with df.
one_hot_species = pd.get_dummies(df['species'], dtype=int)

In [28]:
# Place the indicator columns beside the existing ones.
# The combined frame is only displayed here; it is not assigned back to df.
pd.concat([df, one_hot_species], axis='columns')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Species Encoded,species_enc using map,Iris-setosa,Iris-versicolor,Iris-virginica
0,5.1,3.5,1.4,0.2,Iris-setosa,0,0,1,0,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0,0,1,0,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0,0,1,0,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0,0,1,0,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2,2,0,0,1
146,6.3,2.5,5.0,1.9,Iris-virginica,2,2,0,0,1
147,6.5,3.0,5.2,2.0,Iris-virginica,2,2,0,0,1
148,6.2,3.4,5.4,2.3,Iris-virginica,2,2,0,0,1


#### One-hot encoding with scikit-learn

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
# Create a OneHotEncoder with default settings (its output is a sparse matrix by default).
one_hot = OneHotEncoder()

In [39]:
# Double brackets pass a one-column DataFrame, since the encoder expects 2-D input;
# .toarray() converts the sparse result into a dense array for display.
one_hot.fit_transform(df[['species']]).toarray()

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0