In [118]:
import numpy as np
import pandas as pd

In [119]:
# Load the cars dataset. 'cars.csv' is a relative path, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('cars.csv')

In [120]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [121]:
# Count how many rows fall into each 'owner' category.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

# 1. OneHotEncoding using Pandas

In [123]:
pd.get_dummies(df, columns=['fuel', 'owner']) # 'brand' is left un-encoded here because it has many unique categories

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# 2. K-1 OneHotEncoding

In [125]:
pd.get_dummies(df, columns=['fuel', 'owner'], drop_first=True) # drop_first=True drops the first level of each encoded column, leaving k-1 dummy columns per feature

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 3. OHE using Sklearn

In [127]:
from sklearn.model_selection import train_test_split
# Features are the first four columns and the target is the last column, both
# selected by position. test_size=0.2 holds out 20% of the rows for testing;
# random_state=2 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 0:4], df.iloc[:, -1], test_size = 0.2, random_state = 2)

In [128]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [129]:
from sklearn.preprocessing import OneHotEncoder

In [150]:
# drop='first' omits the first category of each feature (k-1 encoding);
# dtype=np.int32 makes the encoded values integers.
# NOTE(review): handle_unknown is not set, so a category that appears only in
# the test split would presumably raise at transform time -- confirm.
ohe = OneHotEncoder(drop='first', dtype=np.int32)

In [152]:
# Fit the encoder on the training split only, encode 'fuel' and 'owner', and
# convert the result to a dense array with .toarray().
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']]).toarray()

In [154]:
# Encode the test split with the already-fitted encoder (transform only, no
# refit), so test rows use the categories learned from the training split.
X_test_new = ohe.transform(X_test[['fuel', 'owner']]).toarray()

In [156]:
# Shape of the encoded training array: (rows, encoded columns).
X_train_new.shape

(6502, 7)

In [160]:
# Stack the un-encoded 'brand' and 'km_driven' columns next to the encoded
# columns. The result is only displayed; it is not assigned to a variable.
np.hstack((X_train[['brand', 'km_driven']].values, X_train_new))

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

# 4. OHE with Top Categories


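This approach is used when a column has a very large number of distinct categories: only the most frequent categories get their own column, and the remaining rare ones are grouped into a single category.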


In [165]:
# Number of rows for each brand.
counts = df['brand'].value_counts()

In [167]:
# Threshold for grouping rare brands: brands whose row count does not exceed
# this value are later replaced by a single catch-all label.
# NOTE(review): the nunique() result below is not displayed, because it is not
# the last expression in the cell.
df['brand'].nunique()
threshold = 100

In [169]:
# Brands whose count is at or below the threshold (<=, so a count exactly
# equal to the threshold is included).
repl = counts[counts <= threshold].index

In [171]:
# Display the brands selected for grouping.
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [175]:
# Replace every brand listed in `repl` with the label 'uncommon', one-hot
# encode the result, and show 20 randomly sampled rows.
# NOTE(review): sample() is called without random_state, so presumably the
# rows shown differ between runs -- confirm whether that matters.
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(20)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
1695,False,False,False,False,False,True,False,False,False,False,False,False,False
2717,False,False,False,False,False,False,True,False,False,False,False,False,False
4556,False,False,False,False,False,False,False,False,False,True,False,False,False
2849,False,False,False,False,False,False,True,False,False,False,False,False,False
8026,False,False,False,False,False,False,False,False,False,True,False,False,False
2624,False,False,False,False,False,False,True,False,False,False,False,False,False
6840,False,False,False,False,True,False,False,False,False,False,False,False,False
2435,False,False,True,False,False,False,False,False,False,False,False,False,False
7897,False,False,False,False,False,False,True,False,False,False,False,False,False
4155,False,False,False,False,False,False,False,False,False,True,False,False,False
