## One hot encoding


In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the used-car listings (path is relative to the notebook's working directory).
df = pd.read_csv('datasets/cars.csv')

# Peek at five randomly chosen rows rather than the first five.
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
7575,Mahindra,70000,Diesel,Second Owner,750000
1357,Maruti,60000,Diesel,First Owner,455000
5360,Hyundai,25000,Diesel,First Owner,735000
7599,Maruti,29000,Petrol,First Owner,525000
5699,Toyota,150000,Diesel,First Owner,405000


In [3]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [6]:
# (rows, columns) of the full dataset.
df.shape

(8128, 5)

### One Hot Encoding using Pandas

In [8]:
# Expand each listed categorical column into one indicator column per category;
# the result is only displayed, df itself is not modified.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


### K-1 One Hot Encoding (dropping the first category)

In [9]:
# Same encoding, but drop the first level of each column so K categories
# yield K-1 indicator columns.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


### One Hot Encoding using scikit-learn

In [10]:
# Re-display the original frame; the get_dummies calls above returned new frames
# without being assigned, so df still holds the raw categorical columns.
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


In [28]:
from sklearn.model_selection import train_test_split

# Select features and target by name instead of by position: positional slicing
# (iloc[:, :4] / iloc[:, -1]) silently picks the wrong columns if the CSV's
# column order ever changes or a column is added.
# 30% of the rows are held out; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    test_size=0.3,
    random_state=0,
)

In [29]:
# Check the split sizes (test_size=0.3 was requested above).
X_train.shape, X_test.shape

((5689, 4), (2439, 4))

In [30]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a category that appears only in the test split is
# encoded as an all-zero row instead of making transform() raise. Some levels of
# 'fuel' and 'owner' are rare, so whether the training split sees all of them
# depends on the random split.
ohe = OneHotEncoder(handle_unknown='ignore')

# Learn the category set from the training rows only, then reuse the fitted
# encoder on the test rows so both matrices share the same column layout.
# The encoder returns a sparse matrix; toarray() makes it a dense ndarray.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']]).toarray()
X_test_new = ohe.transform(X_test[['fuel', 'owner']]).toarray()

In [31]:
# Show the un-encoded training features next to the encoded test matrix.
# NOTE(review): this pairs the raw X_train with X_test_new; X_train_new may have
# been intended here — confirm.
X_train, X_test_new

(           brand  km_driven    fuel         owner
 5224        Tata      20000  Petrol   First Owner
 520       Maruti      30000  Petrol   First Owner
 36        Maruti      15000  Petrol   First Owner
 5782        Ford      53000  Diesel   First Owner
 6522   Chevrolet     120000  Diesel   First Owner
 ...          ...        ...     ...           ...
 4931        Tata      70000  Diesel   Third Owner
 3264        Ford     100000  Diesel  Second Owner
 1653     Hyundai      90000  Petrol  Second Owner
 2607  Volkswagen      90000  Diesel   First Owner
 2732     Hyundai     110000  Petrol   First Owner
 
 [5689 rows x 4 columns],
 array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 1., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]]))

In [32]:
# Put the two columns that were not encoded back beside the one-hot block
# and check the resulting width; only the shape is displayed.
np.hstack([
    X_train[['brand', 'km_driven']].values,
    X_train_new,
]).shape

(5689, 11)

## One Hot Encoding with top categories

In [33]:
# Frequency of each brand, most common first; reused below to pick the rare brands.
counts = df['brand'].value_counts()
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [37]:
# Number of distinct brands — one-hot encoding this column directly would add
# that many indicator columns.
df['brand'].nunique()

32

In [40]:
# Brands with fewer than 100 listings are treated as rare; keep just their names.
repl = counts.loc[counts < 100].index

In [41]:
# Inspect the brands that will be merged into a single bucket.
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [45]:
# Collapse every rare brand into one 'uncommon' label. The result is a Series
# (the brand column only); df itself is left untouched.
df1 = df['brand'].replace(to_replace=repl, value='uncommon')

# Spot-check ten random rows of the relabelled column.
df1.sample(n=10)

7300      Maruti
60      uncommon
6747    Mahindra
5722    Mahindra
3078    uncommon
7643    uncommon
6918        Ford
4132    Mahindra
1552    uncommon
1447    Mahindra
Name: brand, dtype: object

In [49]:
# df1 is a Series, so get_dummies encodes it directly into one indicator column
# per remaining brand plus 'uncommon'. The `columns` argument only applies to
# DataFrame input (and must be list-like there), so the previously passed
# columns='brand' had no effect and is omitted.
pd.get_dummies(df1).sample(10)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
2480,False,False,False,False,False,False,False,False,False,True,False,False,False
6535,False,False,False,False,False,False,True,False,False,False,False,False,False
945,False,False,False,False,False,False,True,False,False,False,False,False,False
7655,False,False,False,False,False,True,False,False,False,False,False,False,False
6751,False,False,False,False,False,False,True,False,False,False,False,False,False
4253,False,False,False,False,False,False,False,False,False,False,False,False,True
5835,False,False,False,False,True,False,False,False,False,False,False,False,False
6449,False,False,False,False,True,False,False,False,False,False,False,False,False
6518,False,False,False,False,False,False,False,False,False,True,False,False,False
4250,False,False,False,True,False,False,False,False,False,False,False,False,False
