## Data Encoding - Nominal / One Hot Encoding (OHE)

### Step-01: Import all the libraries required and create a dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Toy dataset: one nominal (unordered) categorical column to be one-hot encoded.
df_01 = pd.Series(
    ["red", "blue", "green", "green", "red", "blue", "blue", "green", "blue", "red"],
    name="color",
).to_frame()

In [3]:
# Preview the first rows of the toy frame (head() shows five rows by default).
df_01.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


### Step-02: Import OneHotEncoder from sklearn.preprocessing 

In [4]:
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Encoder with default settings; it is fitted on the `color` column in a later cell.
encoder = OneHotEncoder()

In [6]:
# Learn the categories of `color` and encode them in one step.
# A one-column DataFrame (not a Series) is passed so the encoder sees 2-D input.
encoded = encoder.fit_transform(df_01.loc[:, ["color"]])

In [7]:
# Convert the encoder output to a dense array so the one-hot rows can be inspected:
# one row per sample, one column per category.
encoded.toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [8]:
# Wrap the dense one-hot matrix in a DataFrame, labelling each column with the
# feature name generated by the fitted encoder (e.g. `color_blue`).
df_02 = pd.DataFrame(
    data=encoded.toarray(),
    columns=encoder.get_feature_names_out(),
)

In [9]:
# Show the first ten rows of the one-hot encoded frame.
df_02.head(10)

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0
7,0.0,1.0,0.0
8,1.0,0.0,0.0
9,0.0,0.0,1.0


In [10]:
# Place the one-hot columns next to the original column; rows are matched on the index.
df = pd.concat(objs=[df_01, df_02], axis="columns")

In [11]:
# Show the first ten rows: the original `color` value beside its one-hot columns.
df.head(10)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0
6,blue,1.0,0.0,0.0
7,green,0.0,1.0,0.0
8,blue,1.0,0.0,0.0
9,red,0.0,0.0,1.0


### Step-03: Load a Seaborn dataset (iris) and apply One Hot Encoding

In [12]:
# Load the iris example dataset bundled with seaborn.
# NOTE: this rebinds `df_01`, replacing the toy color frame created in Step-01.
df_01 = sns.load_dataset(name="iris")

In [13]:
# Preview the first rows of the iris frame (head() shows five rows by default).
df_01.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [14]:
# Count how many rows belong to each species, i.e. the categories to be encoded.
df_01["species"].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# Fresh encoder for the iris `species` column.
# The variable must be named `encoder`: the following cells call
# `encoder.fit_transform(...)` and `encoder.get_feature_names_out()`.
# The previous spelling (`enocoder`) created an object that was never used,
# so those cells silently reused the encoder from Step-02.
encoder = OneHotEncoder()

In [17]:
# Fit on the `species` column and display the raw result: without .toarray()
# the encoder output is shown as a sparse-matrix summary rather than as values.
encoder.fit_transform(df_01[["species"]])

<150x3 sparse matrix of type '<class 'numpy.float64'>'
	with 150 stored elements in Compressed Sparse Row format>

In [18]:
# Fit on `species`, encode it, and densify the sparse result into a NumPy array.
encoded = (
    encoder
    .fit_transform(df_01.loc[:, ["species"]])
    .toarray()
)

In [19]:
# Wrap the dense one-hot array in a DataFrame, labelling each column with the
# feature name generated by the fitted encoder (e.g. `species_setosa`).
df_02 = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(),
)

In [20]:
# Preview the first rows of the one-hot encoded species columns.
df_02.head()

Unnamed: 0,species_setosa,species_versicolor,species_virginica
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


In [21]:
# Append the one-hot species columns to the iris measurements; rows are matched on the index.
df = pd.concat(objs=[df_01, df_02], axis="columns")

In [22]:
# Check the start of the combined frame: original columns followed by the one-hot columns.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,setosa,1.0,0.0,0.0
1,4.9,3.0,1.4,0.2,setosa,1.0,0.0,0.0
2,4.7,3.2,1.3,0.2,setosa,1.0,0.0,0.0
3,4.6,3.1,1.5,0.2,setosa,1.0,0.0,0.0
4,5.0,3.6,1.4,0.2,setosa,1.0,0.0,0.0


In [23]:
# Check the end of the combined frame as well, where a different species appears.
df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_setosa,species_versicolor,species_virginica
145,6.7,3.0,5.2,2.3,virginica,0.0,0.0,1.0
146,6.3,2.5,5.0,1.9,virginica,0.0,0.0,1.0
147,6.5,3.0,5.2,2.0,virginica,0.0,0.0,1.0
148,6.2,3.4,5.4,2.3,virginica,0.0,0.0,1.0
149,5.9,3.0,5.1,1.8,virginica,0.0,0.0,1.0
