# One-Hot Encoding: variables with many categories

In [1]:
import pandas as pd
import numpy as np
# Load the Mercedes-Benz dataset for the demonstration, keeping only the
# categorical variables X1 through X6.
categorical_columns = [f'X{i}' for i in range(1, 7)]
data = pd.read_csv('mercedesbenz.csv', usecols=categorical_columns)

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [1]:
# Report how many distinct categories (labels) each column contains.
# data.items() yields (column name, column Series) pairs;
# Series.unique() returns an array of the distinct values in that column,
# and len(...) of that array is the number of distinct labels.
for column_name, column_values in data.items():
    distinct_labels = column_values.unique()
    print(column_name, ':', len(distinct_labels), 'labels')

NameError: name 'data' is not defined

In [3]:
# How many columns do we obtain after one-hot encoding every variable?
# pd.get_dummies() one-hot encodes the categorical columns of `data`;
# drop_first=True drops the first level of each variable (k-1 dummies for k labels).
encoded = pd.get_dummies(data, drop_first=True)
encoded.shape






(4209, 117)

We can see that from just 6 initial categorical variables, we end up with 117 new variables.

What can we do instead?


In the winning solution of the KDD 2009 Cup, "Winning the KDD Cup Orange Challenge with Ensemble Selection" (http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf), the authors limit one-hot encoding to the 10 most frequent labels of the variable. This means that they would make one binary variable for each of the 10 most frequent labels only. This is equivalent to grouping all the other labels under a new category, which in this case will be dropped. Thus, the 10 new dummy variables indicate whether one of the 10 most frequent labels is present (1) or not (0) for a particular observation.

In [4]:
# Inspect the 20 most frequent categories of X2 (the top 10 are selected in the next step).
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)


X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
z       19
ag      19
Name: count, dtype: int64

In [5]:
# Collect the 10 most frequent categories of X2 into a plain Python list.
x2_counts_desc = data['X2'].value_counts().sort_values(ascending=False)
top_10 = list(x2_counts_desc.head(10).index)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [6]:
# Build one binary indicator column per top-10 category:
# 1 where X2 equals the category, 0 otherwise.
for category in top_10:
    is_category = data['X2'] == category
    data[category] = np.where(is_category, 1, 0)

# Show X2 next to its new indicator columns.
preview_columns = ['X2', *top_10]
data[preview_columns].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [7]:
#get whole set of dummy variables,for all the categorical variables

def one_hot_top_x(df, variable, top_x_labels):
    """Add one binary indicator column to ``df`` for each label in ``top_x_labels``.

    For every label, a new column named ``<variable>_<label>`` is written to
    ``df``; it holds 1 where ``df[variable]`` equals the label and 0 elsewhere.
    The number of encoded labels is controlled by the caller through the
    length of ``top_x_labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``variable``. It is modified in place.
    variable : str
        Name of the categorical column to encode.
    top_x_labels : iterable
        Labels of ``variable`` that should each get an indicator column.

    Returns
    -------
    None
        The new columns are added directly to ``df``.
    """
    for label in top_x_labels:
        # Compare against the frame that was passed in (df), not a
        # module-level frame, so the function works for any DataFrame.
        # The f-string also accepts non-string labels when building the name.
        df[f'{variable}_{label}'] = np.where(df[variable] == label, 1, 0)

# Reload the six categorical columns so the frame starts again without
# any of the indicator columns added earlier.
data = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode X2 into indicator columns for its 10 most frequent categories.
one_hot_top_x(df=data, variable='X2', top_x_labels=top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [9]:
# Determine the 10 most frequent categories of X1.
x1_counts_desc = data['X1'].value_counts().sort_values(ascending=False)
top_10 = list(x1_counts_desc.head(10).index)

# Add the indicator columns for those 10 categories of X1.
one_hot_top_x(df=data, variable='X1', top_x_labels=top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X1_f,X1_e,X1_aa,X1_b,X1_l,X1_v,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
