# One Hot Encoding - Variables with many categories

In [2]:
#importing the libraries

import pandas as pd
import numpy as np

In [5]:
# reading the dataset

df=pd.read_csv('mercedesbenz.csv')

In [6]:
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# separing the categorical columns

cat_columns=[feature for feature in df.columns if df[feature].dtype=='O']

cat_columns

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

In [19]:
df[cat_columns].head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [22]:
# how many unique categories are there in each column

for col in cat_columns:
    print('{} : {}'.format(col,len(df[col].unique())))

X0 : 47
X1 : 27
X2 : 44
X3 : 7
X4 : 4
X5 : 29
X6 : 12
X8 : 25


In [24]:
df[cat_columns].shape

(4209, 8)

In [30]:
# lets examine how many columns we will obtain after one hot encoding these variables

pd.get_dummies(df[cat_columns],drop_first=True).shape

(4209, 187)

we can see that from 8 intital categorical variable, we end up with 187 new variables,


### KDD Cup Orange Challenge

What can we do instead?

http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf In the winning solution of the KDD 2009 cup: "Winning the KDD Cup Orange Challenge with Ensemble selection the authors limit one hot encoding to the 10 most frequent labels of the variable. This means that they would make one binary variable for each of the 10most frequent labels only. This is equivalent to grouping all the other labels under a new category, that in this case will be dropped. Thus, the 10 new dummy variables indicate if one of the 10 most frequent labels is present(1) or not (0) for a particular observation.

The Team suggested using 10 most frequent labels convert them into dummy variables using onehotencoding

How can we do that in python?

In [34]:
# lets find the top 10 most frequent categories for the variable X0
## this 10 categories gonna be 1 and others gonna be 0

df['X0'].value_counts().sort_values(ascending=False).head(10)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [36]:
# lets make a list with the most frequent categories of the variable

top10=[x for x in df['X0'].value_counts().sort_values(ascending=False).head(10).index]
top10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [45]:
# now we make the 10 binary variables

for label in top10:
   df[label]=np.where(df['X0']==label,1,0)
    
df[['X0']+top10].head(40)

Unnamed: 0,X0,z,ak,y,ay,t,x,o,f,n,w
0,k,0,0,0,0,0,0,0,0,0,0
1,k,0,0,0,0,0,0,0,0,0,0
2,az,0,0,0,0,0,0,0,0,0,0
3,az,0,0,0,0,0,0,0,0,0,0
4,az,0,0,0,0,0,0,0,0,0,0
5,t,0,0,0,0,1,0,0,0,0,0
6,al,0,0,0,0,0,0,0,0,0,0
7,o,0,0,0,0,0,0,1,0,0,0
8,w,0,0,0,0,0,0,0,0,0,1
9,j,0,0,0,0,0,0,0,0,0,0


In [49]:
# lets do this for all set of categorical variable

def one_hot_top_x(df,variable,top_x_labels):
    #function to create the dummy variables for the most frequent labels
    # we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        df[variable+'_'+label]=np.where(df[variable]==label,1,0)

In [50]:
# read the data again

df=pd.read_csv('mercedesbenz.csv', usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'])

In [56]:
#encode X0 into the 10 most frequent categories

one_hot_top_x(df,'X0',top10)
df.head(40)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X0_z,X0_ak,X0_y,X0_ay,X0_t,X0_x,X0_o,X0_f,X0_n,X0_w
0,k,v,at,a,d,u,j,o,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,e,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,0,0,0,0,0,0,0,0
5,t,b,e,c,d,g,h,s,0,0,0,0,1,0,0,0,0,0
6,al,r,e,f,d,f,h,s,0,0,0,0,0,0,0,0,0,0
7,o,l,as,f,d,f,j,a,0,0,0,0,0,0,1,0,0,0
8,w,s,as,e,d,f,i,h,0,0,0,0,0,0,0,0,0,1
9,j,b,aq,c,d,f,a,e,0,0,0,0,0,0,0,0,0,0


## Advantages

1. Straightforward to implement
2. Does not requre hrs of variable exploration
3. Does not expand massively the feature space ( number of columns in the dataset )

## Disadvantage

1. Does not add any information that may make the variable more predictive
2. Does not keep the information of the ignored label