In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("customer.csv")

In [None]:
df.sample(10)

Unnamed: 0,age,gender,review,education,purchased
4,16,Female,Average,UG,No
24,16,Female,Average,PG,Yes
42,30,Female,Good,PG,Yes
47,38,Female,Good,PG,Yes
30,73,Male,Average,UG,No
35,74,Male,Poor,School,Yes
20,57,Female,Average,School,Yes
25,57,Female,Good,School,No
32,92,Male,Average,UG,Yes
26,53,Female,Poor,PG,No


In [None]:
df.shape

(50, 5)

In [None]:
df['review'].value_counts()

Poor       18
Good       18
Average    14
Name: review, dtype: int64

In [None]:
df['purchased'].value_counts()

No     26
Yes    24
Name: purchased, dtype: int64

In [None]:
df['education'].value_counts()

PG        18
School    16
UG        16
Name: education, dtype: int64

In [None]:
#keep only the categorical columns used below (this drops age and gender);
#selecting by name instead of by position keeps this cell safe to re-run
df = df[['review','education','purchased']]

In [None]:
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [None]:
#now creating train,test split
#features and target are selected by name, and random_state is fixed so that the split
#(and every output that depends on it) is reproducible across runs
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[['review','education']],df['purchased'],test_size=0.2,random_state=42)

In [None]:
X_train.head()

Unnamed: 0,review,education
24,Average,PG
37,Average,PG
49,Good,UG
17,Poor,UG
14,Poor,PG


In [None]:
X_train.shape

(40, 2)

## Ordinal Encoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder #making use of ordinal encoder

In [None]:
#explicit low-to-high ranking for each ordinal feature;
#the position of a label inside its list becomes its integer code
review_order = ['Poor','Average','Good']
education_order = ['School','UG','PG']
oe1 = OrdinalEncoder(categories=[review_order,education_order])

In [None]:
oe1.fit(X_train)

In [None]:
#encode both splits with the encoder that was fitted on X_train only
X_train_1 = oe1.transform(X_train)
X_test_1 = oe1.transform(X_test)

In [None]:
X_train_1

array([[1., 2.],
       [1., 2.],
       [2., 1.],
       [0., 1.],
       [0., 2.],
       [0., 1.],
       [2., 1.],
       [1., 0.],
       [2., 2.],
       [2., 2.],
       [2., 2.],
       [0., 2.],
       [2., 0.],
       [0., 0.],
       [2., 0.],
       [1., 0.],
       [1., 1.],
       [2., 1.],
       [1., 2.],
       [2., 2.],
       [0., 2.],
       [2., 0.],
       [1., 0.],
       [0., 2.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [0., 0.],
       [2., 0.],
       [1., 0.],
       [2., 0.],
       [0., 1.],
       [2., 0.],
       [0., 2.],
       [0., 0.],
       [0., 1.],
       [0., 2.],
       [1., 1.]])

In [None]:
oe1.get_params()

{'categories': [['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']],
 'dtype': numpy.float64,
 'encoded_missing_value': nan,
 'handle_unknown': 'error',
 'unknown_value': None}

In [None]:
oe1.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [None]:
#now we won't give any order
oe2 = OrdinalEncoder()

In [None]:
oe2.fit(X_train)

In [None]:
X_train_2 = oe2.transform(X_train)
X_test_2 = oe2.transform(X_test)

In [None]:
#without an explicit `categories` list the codes are not random: each column's categories are ordered alphabetically
#(here Average=0, Good=1, Poor=2 and PG=0, School=1, UG=2), which does not match the real Poor<Average<Good / School<UG<PG ranking
X_train_2 #codes follow alphabetical category order, not the true ranking

array([[0., 0.],
       [0., 0.],
       [1., 2.],
       [2., 2.],
       [2., 0.],
       [2., 2.],
       [1., 2.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [2., 0.],
       [1., 1.],
       [2., 1.],
       [1., 1.],
       [0., 1.],
       [0., 2.],
       [1., 2.],
       [0., 0.],
       [1., 0.],
       [2., 0.],
       [1., 1.],
       [0., 1.],
       [2., 0.],
       [1., 0.],
       [0., 1.],
       [2., 0.],
       [2., 0.],
       [1., 0.],
       [2., 1.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [2., 2.],
       [1., 1.],
       [2., 0.],
       [2., 1.],
       [2., 2.],
       [2., 0.],
       [0., 2.]])

## Label Encoder

In [None]:
#encoding target variable with label encoding
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
le.fit(y_train)

In [None]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [None]:
#encode both target splits with the label encoder that was fitted on y_train
y_train_1 = le.transform(y_train)
y_test_1 = le.transform(y_test)

In [None]:
y_train.head()

24    Yes
37    Yes
49     No
17    Yes
14    Yes
Name: purchased, dtype: object

In [None]:
y_train_1 #here we can observe that Yes is encoded as 1 while No is encoded as 0

array([1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

## One Hot Encoder

In [None]:
#Now we will use a different dataset for performing one hot encoding
cars = pd.read_csv('cars.csv')

In [None]:
cars.sample(10)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
1381,Maruti,65000,Petrol,Second Owner,175000
6178,Maruti,69779,Petrol,First Owner,600000
891,Mahindra,50000,Diesel,Second Owner,1300000
4532,Maruti,65000,Petrol,Second Owner,260000
6013,Volkswagen,7949,Petrol,First Owner,579000
3330,Maruti,70000,Petrol,Third Owner,199000
4406,Skoda,35000,Diesel,First Owner,700000
1284,Mahindra,120000,Diesel,Third Owner,229999
4272,Skoda,11000,Petrol,First Owner,645000
1318,Hyundai,50000,Petrol,Second Owner,465000


In [None]:
cars['brand'].value_counts() #here you can see that the first 10-12 categories are dominant while the other categories have a small number of cars

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [None]:
cars['brand'].nunique()

32

In [None]:
cars['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [None]:
cars['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [None]:
cars.shape

(8128, 5)

We will first one hot encode the fuel and owner columns. The brand column is handled separately afterwards because it has many more categories (32).

### OHE using Pandas

In [None]:
pd.get_dummies(cars,columns=['fuel','owner']) #this does not manipulate the original dataset

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### Encoding K-1 categories

In [None]:
pd.get_dummies(cars,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### OHE using Sklearn

In [None]:
#features are the first four columns, the target is the last column
car_features = cars.iloc[:,:4]
car_target = cars.iloc[:,-1]
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(
    car_features,
    car_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train_ohe.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner


In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
#drop='first' leaves out the first category of each column (K-1 encoding), dtype=np.int32 gives integer 0/1 columns,
#and sparse_output=False returns a dense numpy array instead of a sparse matrix
ohe = OneHotEncoder(drop='first',dtype=np.int32,sparse_output=False)

In [None]:
#Now we will encode fuel and owner for now , then we will encode brand as well
X_train_ohe_new = ohe.fit_transform(X_train_ohe[['fuel','owner']])

In [None]:
X_train_ohe_new

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int32)

In [None]:
#reuse the encoder that was fitted on the training data: calling fit_transform here would re-learn the
#categories from the test split, so the test columns could differ in number or meaning from the training columns
#NOTE(review): if the test split can contain a category never seen in training, transform will raise with the
#default handle_unknown='error' - confirm whether the encoder should be built with handle_unknown='ignore'
X_test_ohe_new = ohe.transform(X_test_ohe[['fuel','owner']])

In [None]:
type(X_train_ohe_new)

numpy.ndarray

In [None]:
X_train_ohe_new.shape

(6502, 7)

In [None]:
#fuel and owner are now one hot encoded, so put the untouched brand and km_driven
#columns back in front of the encoded columns to rebuild the full training matrix
untouched_cols = X_train_ohe[['brand','km_driven']].values
x = np.hstack((untouched_cols,X_train_ohe_new))

In [None]:
x

array([['Tata', 2560, 0, ..., 0, 0, 0],
       ['Honda', 80000, 0, ..., 1, 0, 0],
       ['Hyundai', 150000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], dtype=object)

In [None]:
x.shape

(6502, 9)

### OHE "Brand" category

In [None]:
#The brand column has many categories, so one hot encoding every one of them would drastically increase the number of columns in the encoded dataset.
#To avoid that, we first reduce the number of categories.

In [None]:
cars['brand'].value_counts() #brands having 100 cars or fewer will be merged into a single new category called "Uncommon_Others"

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [None]:
cnt = cars['brand'].value_counts()

In [None]:
others = cnt[cnt <= 100].index

In [None]:
others #this represents the car brands having 100 cars or fewer

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [None]:
#collapse every rare brand into one 'Uncommon_Others' label, then one hot encode the reduced column
brand_grouped = cars['brand'].replace(others,'Uncommon_Others')
pd.get_dummies(brand_grouped).sample(10)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Uncommon_Others,Volkswagen
880,0,0,0,0,0,0,1,0,0,0,0,0,0
1705,0,0,0,0,1,0,0,0,0,0,0,0,0
2677,1,0,0,0,0,0,0,0,0,0,0,0,0
4539,0,0,0,0,0,0,0,0,0,0,0,1,0
6585,0,0,0,0,0,0,1,0,0,0,0,0,0
6161,0,0,0,0,0,0,1,0,0,0,0,0,0
3777,0,0,1,0,0,0,0,0,0,0,0,0,0
6054,0,0,0,0,0,0,0,0,0,1,0,0,0
4879,0,0,0,0,1,0,0,0,0,0,0,0,0
7324,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
cars.iloc[4539,:]

brand                 Nissan
km_driven              55000
fuel                  Diesel
owner            First Owner
selling_price         850000
Name: 4539, dtype: object