# Data encoding [One hot encoding]

One-hot encoding, also called nominal encoding, is a technique for converting categorical data into numerical data that is suitable for machine learning algorithms. Each category is converted into a binary vector in which exactly one element is 1 and all others are 0, so each vector represents a single category value.

For example : color[red,blue,green]

In one-hot encoding we convert this into numerical data as follows:
color[1,0,0] for red,
color[0,1,0] for blue,
color[0,0,1] for green,

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
# Toy sample: 13 observations of a single categorical feature
color = [
    'red', 'green', 'green', 'red', 'blue', 'red', 'green',
    'blue', 'red', 'blue', 'red', 'green', 'blue',
]

# One-column DataFrame whose only column, 'color', holds the sample above
data = pd.DataFrame({'color': color})
data.head()

Unnamed: 0,color
0,red
1,green
2,green
3,red
4,blue


In [41]:
# Import the encoder class
from sklearn.preprocessing import OneHotEncoder

# Create the encoder and let it learn the categories of the 'color' column
# (fit returns the encoder itself, so the fitted instance is what gets bound)
encoder = OneHotEncoder().fit(data[['color']])

# Encode the column, then call .toarray() to get a dense NumPy array
encoded = encoder.transform(data[['color']]).toarray()

# Display the encoded array: one row per sample, one column per category
encoded

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [None]:
# Wrap the dense one-hot array in a DataFrame, labelling each column
# with the names the fitted encoder generates for the 'color' feature
df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(input_features=['color']),
)
df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0
6,0.0,1.0,0.0
7,1.0,0.0,0.0
8,0.0,0.0,1.0
9,1.0,0.0,0.0


In [48]:
# Place the original 'color' column side by side with its one-hot columns
final = pd.concat(objs=[data, df], axis='columns')
final

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,green,0.0,1.0,0.0
2,green,0.0,1.0,0.0
3,red,0.0,0.0,1.0
4,blue,1.0,0.0,0.0
5,red,0.0,0.0,1.0
6,green,0.0,1.0,0.0
7,blue,1.0,0.0,0.0
8,red,0.0,0.0,1.0
9,blue,1.0,0.0,0.0


# Applying OneHotEncoding on Tips dataset

In [52]:
import seaborn as sns

# Load seaborn's 'tips' dataset; of its columns,
# sex, smoker, day and time hold categorical data
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [69]:
# Show the category frequencies of every categorical column:
#   sex    -> [Male, Female]
#   smoker -> [Yes, No]
#   day    -> [Sat, Sun, Thur, Fri]
#   time   -> [Dinner, Lunch]
# A notebook cell only renders its LAST bare expression, so four consecutive
# value_counts() calls would show the 'time' counts alone. Each result is
# therefore passed to display() (provided by the Jupyter/IPython kernel).
for column in ['sex', 'smoker', 'day', 'time']:
    display(df[column].value_counts())


time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [78]:
from sklearn.preprocessing import OneHotEncoder

# Fit a fresh encoder on the four categorical columns of the tips data
# (fit returns the encoder itself, so the fitted instance is what gets bound)
encoder = OneHotEncoder().fit(df[['sex', 'smoker', 'day', 'time']])

# Encode those columns and convert the result to a dense NumPy array
encoded = encoder.transform(df[['sex', 'smoker', 'day', 'time']]).toarray()

# Rows = samples, columns = one indicator per category across all four features
encoded.shape

(244, 10)

In [83]:
# Turn the encoded array from the previous cell into a DataFrame, naming each
# column with the labels the fitted encoder generates for the four features
encoded_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(
        input_features=['sex', 'smoker', 'day', 'time']
    ),
)
encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [102]:
# Assemble the fully numeric frame column-wise
final_df = pd.concat(
    [
        df[['total_bill', 'tip']],  # numeric columns kept as-is
        encoded_df,                 # one-hot encoded categorical columns
        df['size'],                 # 'size' column, placed last
    ],
    axis='columns',
)
final_df.head()

Unnamed: 0,total_bill,tip,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size
0,16.99,1.01,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2
1,10.34,1.66,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3
2,21.01,3.5,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3
3,23.68,3.31,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2
4,24.59,3.61,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4


In [114]:
# Retrieve the rows where a non-smoking female had dinner on Saturday.
#
# The boolean mask must be built from final_df: the one-hot indicator columns
# (sex_Female, smoker_No, day_Sat, time_Dinner) exist only there. df is the raw
# tips frame, so indexing it with those names raises KeyError on a fresh run.
final_df[
    (final_df['sex_Female'] == 1) &
    (final_df['smoker_No'] == 1) &
    (final_df['day_Sat'] == 1) &
    (final_df['time_Dinner'] == 1)
]

Unnamed: 0,total_bill,tip,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,size
21,20.29,2.75,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
22,15.77,2.23,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
29,19.65,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
32,15.06,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
33,20.69,2.45,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4
37,16.93,3.07,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3
57,26.41,1.5,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
66,16.45,2.47,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
71,17.07,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3
74,14.73,2.2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2
