# Count / Frequency Encoding

In [5]:
import pandas as pd
import numpy as np

# Demonstration data: the Mercedes-Benz dataset from Kaggle, available at
# https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data
# Only the X1 and X2 columns are read from the file.

df = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2'],
)
df.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [6]:
df.shape

(4209, 2)

## One hot Encoding

In [7]:
pd.get_dummies(df).shape

(4209, 71)

In [8]:
len(df['X1'].unique())

27

In [9]:
len(df['X2'].unique())

44

In [10]:
# report the number of distinct labels found in every column

for col in df.columns:
    n_labels = len(df[col].unique())
    print(col, ': ', n_labels, ' labels')

X1 :  27  labels
X2 :  44  labels


In [11]:
# count how often each label occurs in X2 and show the result as a
# {label: count} dictionary, which can later be used to re-map the labels

df['X2'].value_counts().to_dict()

{'as': 1659,
 'ae': 496,
 'ai': 415,
 'm': 367,
 'ak': 265,
 'r': 153,
 'n': 137,
 's': 94,
 'f': 87,
 'e': 81,
 'aq': 63,
 'ay': 54,
 'a': 47,
 't': 29,
 'i': 25,
 'k': 25,
 'b': 21,
 'ao': 20,
 'z': 19,
 'ag': 19,
 'd': 18,
 'ac': 13,
 'g': 12,
 'ap': 11,
 'y': 11,
 'x': 10,
 'aw': 8,
 'h': 6,
 'at': 6,
 'q': 5,
 'an': 5,
 'al': 5,
 'av': 4,
 'p': 4,
 'ah': 4,
 'au': 3,
 'c': 1,
 'aa': 1,
 'j': 1,
 'l': 1,
 'o': 1,
 'ar': 1,
 'am': 1,
 'af': 1}

In [12]:
# Replacing each X2 label by its count, step 1:
# build the {label: count} lookup dictionary from the label frequencies
df_frequency_map = df['X2'].value_counts().to_dict()

# X2 still holds the original labels at this point
df.head(100)

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n
...,...,...
95,b,m
96,l,as
97,aa,as
98,b,m


In [13]:
# and now we replace the X2 labels by their counts. The result goes into a
# new frame so that df keeps its original labels and this cell can be re-run
# safely: mapping an already-encoded column a second time would find none of
# the counts among the dictionary keys and turn every value into NaN.
df_encoded = df.assign(X2=df['X2'].map(df_frequency_map))

df_encoded.head()

Unnamed: 0,X1,X2
0,v,6
1,t,4
2,w,137
3,t,137
4,v,137


# One-Hot Encoding of Variables with Many Labels

In [14]:
import pandas as pd
import numpy as np



# load the X1 and X2 columns of the Mercedes-Benz data again
df = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2'],
)
df.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [15]:
# print the array of distinct labels of each column, one array per column
for col in df.columns:
    distinct_labels = df[col].unique()
    print(distinct_labels)

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']


In [16]:
df['X1'].unique()

array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
       'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
       'ab'], dtype=object)

In [17]:
len(df['X1'].unique())

27

In [18]:
df['X2'].unique()

array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
       'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
       'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
       'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar'], dtype=object)

In [19]:
len(df['X2'].unique())

44

In [20]:
# print, for each variable, how many distinct labels it contains

for col in df.columns:
    n_labels = len(df[col].unique())
    print(col, ': ', n_labels, ' labels')

X1 :  27  labels
X2 :  44  labels


In [21]:
df.shape

(4209, 2)

In [22]:
# let's examine how many columns we will obtain after one hot encoding these variables
pd.get_dummies(df, drop_first=True).shape

(4209, 69)

In [24]:
# collect the 10 most frequent X2 labels, most frequent first

top_10_labels = list(
    df['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [25]:
# create 0/1 dummy variables for the most frequent labels of one variable

def one_hot_encoding_top_x(df, variable, top_x_labels):
    """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every label a new column named ``<variable>_<label>`` is created that
    holds 1 where ``df[variable]`` equals the label and 0 elsewhere. Rows whose
    label is not listed therefore get 0 in all of the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    variable : str
        Name of the column to encode.
    top_x_labels : iterable
        Labels to create indicator columns for; vary its length to control
        how many of the most frequent labels are encoded.

    Returns
    -------
    None
    """
    for label in top_x_labels:
        # format() instead of '+' so that non-string labels (ints, floats)
        # do not raise TypeError when the column name is built
        df['{}_{}'.format(variable, label)] = np.where(df[variable] == label, 1, 0)

In [26]:
# start again from a fresh copy of the X1 / X2 columns
df = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2'],
)

# add one indicator column for each of the 10 most frequent X2 labels
one_hot_encoding_top_x(df, 'X2', top_10_labels)
df.head()

Unnamed: 0,X1,X2,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,1,0,0,0
3,t,n,0,0,0,0,0,0,1,0,0,0
4,v,n,0,0,0,0,0,0,1,0,0,0


# Ordinal Encoding

In [27]:
# load the customer data and look at five randomly chosen rows
df = pd.read_csv('customer.csv')
df.sample(n=5)

Unnamed: 0,age,gender,review,education,purchased
29,83,Female,Average,UG,Yes
30,73,Male,Average,UG,No
23,96,Female,Good,School,No
47,38,Female,Good,PG,Yes
25,57,Female,Good,School,No


In [28]:
# keep only the columns used in this section. Selecting them by name
# (instead of the positional df.iloc[:, 2:]) makes the cell safe to re-run:
# a second positional slice would leave only 'purchased'.
df = df[['review', 'education', 'purchased']]
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [32]:
from sklearn.model_selection import train_test_split

# features are every column except the target 'purchased'
X = df.drop(columns='purchased')
y = df['purchased']

# hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [33]:
X_train

Unnamed: 0,review,education
24,Average,PG
48,Good,UG
17,Poor,UG
12,Poor,School
27,Poor,PG
33,Good,PG
16,Poor,UG
2,Good,PG
25,Good,School
14,Poor,PG


In [34]:
from sklearn.preprocessing import OrdinalEncoder

# one explicit category list per column of X_train, in column order;
# a label's position inside its list becomes its encoded value
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],   # review
        ['School', 'UG', 'PG'],        # education
    ]
)
oe.fit(X_train)

OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [35]:
# encode both splits with the encoder that was fitted on the training data;
# the test features must go through the same mapping as the training features
# (the original cell left X_test un-encoded)
X_train = oe.transform(X_train)
X_test = oe.transform(X_test)
X_train

array([[1., 2.],
       [2., 1.],
       [0., 1.],
       [0., 0.],
       [0., 2.],
       [2., 2.],
       [0., 1.],
       [2., 2.],
       [2., 0.],
       [0., 2.],
       [1., 1.],
       [1., 0.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [0., 0.],
       [2., 2.],
       [0., 2.],
       [1., 2.],
       [2., 1.],
       [1., 1.],
       [2., 0.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [2., 0.],
       [1., 2.],
       [2., 2.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [2., 1.],
       [2., 1.],
       [2., 0.],
       [0., 2.],
       [0., 2.],
       [1., 1.],
       [0., 2.],
       [0., 1.],
       [2., 0.]])

In [36]:
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [37]:
X_train

array([[1., 2.],
       [2., 1.],
       [0., 1.],
       [0., 0.],
       [0., 2.],
       [2., 2.],
       [0., 1.],
       [2., 2.],
       [2., 0.],
       [0., 2.],
       [1., 1.],
       [1., 0.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [0., 0.],
       [2., 2.],
       [0., 2.],
       [1., 2.],
       [2., 1.],
       [1., 1.],
       [2., 0.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [2., 0.],
       [1., 2.],
       [2., 2.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [2., 1.],
       [2., 1.],
       [2., 0.],
       [0., 2.],
       [0., 2.],
       [1., 1.],
       [0., 2.],
       [0., 1.],
       [2., 0.]])

# Label Encoding

In [38]:
from sklearn.preprocessing import LabelEncoder

# learn the set of target classes from the training labels
le = LabelEncoder().fit(y_train)
le

LabelEncoder()

In [39]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [40]:
# convert the labels of both splits to their integer class codes
y_train, y_test = map(le.transform, (y_train, y_test))
y_train

array([1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0])

# One Hot Encoding

In [41]:
import numpy as np
import pandas as pd
# load the used-cars data and preview the first five rows
df = pd.read_csv('cars.csv')
df.head(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [42]:
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

## 1. OneHotEncoding using Pandas

In [43]:
pd.get_dummies(df,columns=['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## 2. K-1 OneHotEncoding

In [44]:
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 3. OneHotEncoding using Sklearn

In [45]:
from sklearn.model_selection import train_test_split

# features: the first four columns; target: the last column
# 80/20 split with a fixed random_state so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=2,
)
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [46]:
from sklearn.preprocessing import OneHotEncoder

# The keyword that switches off sparse output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed, so
# neither spelling works on every version. Keep the default (sparse) output
# and convert to a dense array explicitly instead.
# drop='first' omits the first category of each feature (K-1 encoding).
ohe = OneHotEncoder(drop='first', dtype=np.int32)

# fit on the training split only, then apply the same encoding to the test split
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']]).toarray()
X_test_new = ohe.transform(X_test[['fuel', 'owner']]).toarray()
X_train_new.shape

(6502, 7)

In [47]:
# put the untouched columns (brand, km_driven) to the left of the
# one-hot encoded columns, joining along the column axis
np.concatenate(
    [X_train[['brand', 'km_driven']].values, X_train_new],
    axis=1,
)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

## 4. OneHotEncoding with Top Categories

In [48]:
# brands that occur at most `threshold` times are collected in `repl`
# so that they can be grouped under a single label
# (the discarded mid-cell `df['brand'].nunique()` expression was removed:
# its result was neither displayed nor stored)
counts = df['brand'].value_counts()
threshold = 100
repl = counts[counts <= threshold].index

In [49]:
# replace every brand listed in `repl` by the single label 'uncommon',
# one-hot encode the result and show five randomly chosen rows
pd.get_dummies(
    df['brand'].replace(to_replace=repl, value='uncommon')
).sample(n=5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5745,0,1,0,0,0,0,0,0,0,0,0,0,0
6632,0,0,0,0,0,0,0,0,1,0,0,0,0
466,0,1,0,0,0,0,0,0,0,0,0,0,0
4030,0,0,0,0,0,0,0,0,0,0,1,0,0
4091,0,0,0,0,0,0,0,0,0,0,0,0,1
