# One Hot Encoding

With one-hot encoding, we convert each distinct value of a categorical feature into its own new column and assign a binary value of 1 or 0 to those columns: 1 if the row belongs to that category, 0 otherwise. Each category is thus represented as a binary vector.

In [1]:
import pandas as pd

In [2]:
# Load only the 'Sex' column of the Titanic data set; nothing else is needed here.
df = pd.read_csv(
    "titanic.csv",
    usecols=["Sex"],
)

In [3]:
# Show the single-column frame that was just loaded.
df

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male
...,...
886,male
887,female
888,female
889,male


In [4]:
# One-hot encode 'Sex'. drop_first=True removes the first category ('female'),
# leaving a single 'Sex_male' column: 1 = male, 0 = female.
# dtype=int pins the 0/1 output: pandas >= 2.0 would otherwise return
# True/False booleans instead of the 1/0 values described above.
pd.get_dummies(df, drop_first=True, dtype=int)

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [5]:
# Load only the 'Embarked' (port of embarkation) column into a second frame.
df2 = pd.read_csv(
    "titanic.csv",
    usecols=["Embarked"],
)

In [6]:
# List the distinct 'Embarked' values (missing values show up as nan).
pd.unique(df2["Embarked"])

array(['S', 'C', 'Q', nan], dtype=object)

In [7]:
# Remove the rows whose 'Embarked' value is missing before encoding.
# The frame holding 'Embarked' is df2 (df only holds 'Sex'), so the drop must
# target df2. Otherwise a missing port is encoded as all zeros, which with
# drop_first=True is indistinguishable from the dropped first category.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df2 = df2.dropna(subset=["Embarked"])

In [8]:
# One-hot encode 'Embarked'. drop_first=True removes the first category ('C'),
# so a 'C' row is the one with 0 in both 'Embarked_Q' and 'Embarked_S'.
# dtype=int pins the 0/1 output: pandas >= 2.0 would otherwise return
# True/False booleans instead of 1/0.
pd.get_dummies(df2, drop_first=True, dtype=int)

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


## One hot encoding with many categories/labels in a feature

In [9]:
# Load the seven categorical columns X0..X6 of the Mercedes data set.
# Note: this rebinds `df`, replacing the Titanic frame used above.
df = pd.read_csv(
    "mercedes.csv",
    usecols=["X0", "X1", "X2", "X3", "X4", "X5", "X6"],
)

In [10]:
# Show the categorical columns X0..X6 that were just loaded.
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,az,v,n,f,d,t,a
1,t,b,ai,a,d,b,g
2,az,v,as,f,d,a,j
3,az,l,n,f,d,z,l
4,w,s,as,c,d,y,i
...,...,...,...,...,...,...,...
4204,aj,h,as,f,d,aa,j
4205,t,aa,ai,d,d,aa,j
4206,y,v,as,f,d,aa,d
4207,ak,v,as,a,d,aa,c


In [11]:
# Print the number of distinct labels/categories in each column, one count per
# line, in column order.
for column_name in df.columns:
    distinct_labels = df[column_name].unique()
    print(len(distinct_labels))

49
27
45
7
4
32
12


In [12]:
# Frequency of every X1 label, most common first.
(
    df["X1"]
    .value_counts()
    .sort_values(ascending=False)
)

aa    826
s     602
l     599
b     596
v     436
r     252
i     189
a     153
c     142
o      81
w      50
u      40
z      31
e      29
m      27
h      27
j      22
y      21
t      18
n      16
f      12
k      12
p      10
g       9
ab      5
q       3
d       1
Name: X1, dtype: int64

##### When a feature has a lot of labels/categories, we one-hot encode only its 10 most frequent labels/categories (rows with any other label get 0 in every new column)

In [13]:
# Rank the X1 labels by frequency and keep the ten most common ones.
top_10_lst = (
    df["X1"]
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)
top_10_lst

Index(['aa', 's', 'l', 'b', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [14]:
# Turn the pandas Index into a plain Python list of labels.
top_10_lst = [*top_10_lst]
top_10_lst

['aa', 's', 'l', 'b', 'v', 'r', 'i', 'a', 'c', 'o']

In [15]:
import numpy as np

# Add one indicator column per label in top_10_lst, named after the label:
# 1 where the row's X1 equals that label, 0 otherwise. A row whose X1 label is
# not in the list ends up with 0 in every new column.
for label in top_10_lst:
    df[label] = np.where(df['X1'].eq(label), 1, 0)

In [16]:
# Show the frame again: the original X0..X6 columns plus the new indicator columns.
df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,l,b,v,r,i,a,c,o
0,az,v,n,f,d,t,a,0,0,0,0,1,0,0,0,0,0
1,t,b,ai,a,d,b,g,0,0,0,1,0,0,0,0,0,0
2,az,v,as,f,d,a,j,0,0,0,0,1,0,0,0,0,0
3,az,l,n,f,d,z,l,0,0,1,0,0,0,0,0,0,0
4,w,s,as,c,d,y,i,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,aj,h,as,f,d,aa,j,0,0,0,0,0,0,0,0,0,0
4205,t,aa,ai,d,d,aa,j,1,0,0,0,0,0,0,0,0,0
4206,y,v,as,f,d,aa,d,0,0,0,0,1,0,0,0,0,0
4207,ak,v,as,a,d,aa,c,0,0,0,0,1,0,0,0,0,0


In [17]:
# Show only the indicator columns created for the labels in top_10_lst.
df.loc[:, top_10_lst]

Unnamed: 0,aa,s,l,b,v,r,i,a,c,o
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,0,0,0
4205,1,0,0,0,0,0,0,0,0,0
4206,0,0,0,0,1,0,0,0,0,0
4207,0,0,0,0,1,0,0,0,0,0


In [18]:
# Add the original 'X1' column to the selection so each row's indicator values
# can be compared with its actual label.
# The membership check keeps the cell idempotent: re-running it no longer
# appends 'X1' a second time, which would duplicate the column in the display.
if 'X1' not in top_10_lst:
    top_10_lst.append('X1')

In [19]:
# Show every column named in top_10_lst side by side.
df.loc[:, top_10_lst]

Unnamed: 0,aa,s,l,b,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,1,0,0,0,0,0,0,b
2,0,0,0,0,1,0,0,0,0,0,v
3,0,0,1,0,0,0,0,0,0,0,l
4,0,1,0,0,0,0,0,0,0,0,s
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,0,0,0,h
4205,1,0,0,0,0,0,0,0,0,0,aa
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,1,0,0,0,0,0,v
