## ONE HOT ENCODING

<b>With one-hot encoding, each distinct categorical value becomes its own new binary column, which holds 1 for rows that have that value and 0 otherwise. Each category (or its integer label) is thus represented as a binary vector.</b>

In [1]:
import pandas as pd

In [3]:
# Read only the 'Sex' column from the Titanic training CSV.
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Sex'],
)

In [4]:
# Preview the first five rows of the single-column frame.
df.head(n=5)

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [5]:
# get_dummies turns each categorical label into its own indicator column.
# drop_first=True drops the first level, so the two 'Sex' labels collapse
# into a single 'Sex_male' column.
# dtype=int keeps the indicators as 1/0 on every pandas version; recent
# pandas releases default to boolean True/False columns instead.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
# Read only the 'Embarked' column from the Titanic training CSV.
df1 = pd.read_csv(
    'titanic_train.csv',
    usecols=['Embarked'],
)

In [8]:
# Preview the first five rows of the 'Embarked' column.
df1.head(n=5)

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [9]:
# List the distinct 'Embarked' values; missing entries appear as nan.
df1.loc[:, 'Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# Discard the rows whose 'Embarked' value is missing. Rebinding the name
# (rather than inplace=True) avoids mutating the frame object in place,
# which pandas discourages; the cell stays safe to re-run.
df1 = df1.dropna()

In [11]:
# One-hot encode 'Embarked'. drop_first=True drops the first level ('C'),
# so a row with both indicators at 0 is a 'C' row.
# dtype=int keeps the indicators as 1/0 on every pandas version; recent
# pandas releases default to boolean True/False columns instead.
pd.get_dummies(df1, drop_first=True, dtype=int).head(10)

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
5,1,0
6,0,1
7,0,1
8,0,1
9,0,0


In [12]:
#### One-hot encoding when a feature has many categories/labels

In [13]:
# Load every column of benz.csv and display the resulting frame.
df2 = pd.read_csv('benz.csv')
df2

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0


In [14]:
# Reload benz.csv keeping only the categorical columns X0 through X6,
# then preview the first five rows.
df2 = pd.read_csv(
    'benz.csv',
    usecols=[
        'X0', 'X1', 'X2', 'X3',
        'X4', 'X5', 'X6',
    ],
)
df2.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,az,v,n,f,d,t,a
1,t,b,ai,a,d,b,g
2,az,v,as,f,d,a,j
3,az,l,n,f,d,z,l
4,w,s,as,c,d,y,i


In [20]:
# Report how many distinct labels each column of df2 contains.
for name, values in df2.items():
    print('Unique Labels of ', name, '=', len(values.unique()))

Unique Labels of  X0 = 49
Unique Labels of  X1 = 27
Unique Labels of  X2 = 45
Unique Labels of  X3 = 7
Unique Labels of  X4 = 4
Unique Labels of  X5 = 32
Unique Labels of  X6 = 12


In [22]:
# When a feature has many labels, keep only the 10 most frequent ones.
# Show those ten X1 labels together with their counts, highest first.
(
    df2['X1']
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:10]
)

aa    826
s     602
l     599
b     596
v     436
r     252
i     189
a     153
c     142
o      81
Name: X1, dtype: int64

In [39]:
# Keep the labels (the index of the counts) of the ten most frequent
# X1 values, ordered from most to least frequent.
lst_10 = (
    df2['X1']
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:10]
    .index
)
lst_10

Index(['aa', 's', 'l', 'b', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [40]:
# Turn the pandas Index into a plain Python list.
lst_10 = [*lst_10]

In [41]:
# Display the ten labels, now held in a plain Python list.
lst_10

['aa', 's', 'l', 'b', 'v', 'r', 'i', 'a', 'c', 'o']

In [42]:
import numpy as np

# Add one indicator column per label in lst_10, named after the label:
# 1 where the row's X1 equals that label, 0 otherwise.
for label in lst_10:
    df2[label] = np.where(df2['X1'].eq(label), 1, 0)

In [44]:
# Add the original 'X1' column name to the list so the encoded columns can
# be shown next to the label they were derived from.
# The membership check keeps this cell idempotent: without it, every re-run
# would append another 'X1' and df2[lst_10] would show duplicate X1 columns.
if 'X1' not in lst_10:
    lst_10.append('X1')

In [45]:
# Display the list again: the ten labels followed by 'X1'.
lst_10

['aa', 's', 'l', 'b', 'v', 'r', 'i', 'a', 'c', 'o', 'X1']

In [47]:
# Display df2, which now holds the original X0-X6 columns plus one
# indicator column per top-10 X1 label.
df2

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,l,b,v,r,i,a,c,o
0,az,v,n,f,d,t,a,0,0,0,0,1,0,0,0,0,0
1,t,b,ai,a,d,b,g,0,0,0,1,0,0,0,0,0,0
2,az,v,as,f,d,a,j,0,0,0,0,1,0,0,0,0,0
3,az,l,n,f,d,z,l,0,0,1,0,0,0,0,0,0,0
4,w,s,as,c,d,y,i,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,aj,h,as,f,d,aa,j,0,0,0,0,0,0,0,0,0,0
4205,t,aa,ai,d,d,aa,j,1,0,0,0,0,0,0,0,0,0
4206,y,v,as,f,d,aa,d,0,0,0,0,1,0,0,0,0,0
4207,ak,v,as,a,d,aa,c,0,0,0,0,1,0,0,0,0,0


In [46]:
# Select the indicator columns together with the original X1 column so each
# row's encoding can be checked against its label.
df2.loc[:, lst_10]

Unnamed: 0,aa,s,l,b,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,1,0,0,0,0,0,0,b
2,0,0,0,0,1,0,0,0,0,0,v
3,0,0,1,0,0,0,0,0,0,0,l
4,0,1,0,0,0,0,0,0,0,0,s
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,0,0,0,h
4205,1,0,0,0,0,0,0,0,0,0,aa
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,1,0,0,0,0,0,v


In [48]:
# Now you can see that each row has a 1 in the column matching its X1 label and 0 in the remaining columns (rows whose label is not in the top 10 are all 0).