In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder

In [49]:
# Load the customer dataset and preview its first rows
df=pd.read_csv('Dataset/customer.csv')
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


## Ordinal Encoding on review and education
## Label encoding on purchased column

In [50]:
# Keep only the categorical columns that will be encoded.
# Selecting by name (rather than the positional slice df.iloc[:,2:]) keeps this cell
# idempotent: a positional slice on the reassigned `df` would drop two more
# columns every time the cell is re-run.
df=df[['review','education','purchased']]
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [51]:
from sklearn.model_selection import train_test_split
# Features are the two ordinal columns; the target is 'purchased'.
# Columns are selected by name so a wrong `df` state fails loudly instead of
# silently splitting the wrong columns, and random_state pins the shuffle so
# re-running the notebook produces the same 80/20 split.
x_train,X_test,y_train,y_test=train_test_split(
    df[['review','education']],df['purchased'],test_size=0.2,random_state=3)

In [52]:
# Preview the training features before they are encoded
x_train.head()

Unnamed: 0,review,education
10,Good,UG
4,Average,UG
26,Poor,PG
9,Good,UG
39,Poor,PG


In [53]:
# One category list per feature column, ordered from lowest to highest rank:
# the position in the list becomes the encoded value (Poor -> 0, Average -> 1, Good -> 2).
oe=OrdinalEncoder(
    categories=[
        ['Poor','Average','Good'],   # review
        ['School','UG','PG'],        # education
    ]
)

In [54]:
# Fit the encoder on the training split only
oe.fit(x_train)

OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [55]:
# Encode both splits with the encoder fitted on the training data.
# NOTE(review): x_train / X_test are overwritten with the encoded arrays, so
# re-running this cell without first re-running the split cell will presumably
# fail, because the inputs no longer hold the raw category strings.
x_train=oe.transform(x_train)
X_test=oe.transform(X_test)

In [56]:
# Encoded training features: column 0 is review, column 1 is education
x_train

array([[2., 1.],
       [1., 1.],
       [0., 2.],
       [2., 1.],
       [0., 2.],
       [0., 2.],
       [2., 0.],
       [1., 1.],
       [1., 1.],
       [2., 1.],
       [0., 2.],
       [2., 2.],
       [1., 0.],
       [0., 1.],
       [2., 2.],
       [2., 0.],
       [0., 0.],
       [1., 0.],
       [1., 1.],
       [2., 0.],
       [0., 1.],
       [0., 2.],
       [1., 2.],
       [1., 0.],
       [0., 2.],
       [2., 1.],
       [0., 1.],
       [2., 2.],
       [0., 0.],
       [1., 0.],
       [2., 2.],
       [0., 2.],
       [0., 2.],
       [2., 0.],
       [0., 0.],
       [1., 2.],
       [2., 0.],
       [1., 0.],
       [1., 2.],
       [2., 2.]])

In [57]:
# LabelEncoder is used for the target column ('purchased')
le=LabelEncoder()

In [58]:
# Learn the target classes from the training labels
le.fit(y_train)

LabelEncoder()

In [59]:
# Classes found during fit; a class's position in this array is its encoded integer
le.classes_

array(['No', 'Yes'], dtype=object)

In [60]:
# Encode both target splits using the classes learned from y_train
y_train=le.transform(y_train)
y_test=le.transform(y_test)

In [61]:
# Encoded training labels
y_train

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1])

## Nominal categorical data - One Hot encoding

In [62]:
# Load the cars dataset and preview it.
# Note: this rebinds `df`, which previously held the customer data.
df=pd.read_csv('Dataset/cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [63]:
# Count the distinct brand values
df['brand'].nunique()

32

## 1. OneHotEncoding using Pandas

In [64]:
# One-hot encode 'fuel' and 'owner'; the remaining columns are passed through unchanged
pd.get_dummies(df,columns=['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## 2. k-1 OneHotEncoding
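- Drop one dummy column per encoded feature to avoid multicollinearity (the dummy-variable trap): with k categories, k-1 columns carry the same information.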

In [65]:
# drop_first=True omits the first level of each encoded column
# (here fuel_CNG and owner_First Owner), leaving k-1 dummy columns per feature
pd.get_dummies(df,columns=['fuel','owner'],drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 3. Using Sklearn

In [66]:
# Hold out 20% of the rows with a fixed seed: the first four columns are the
# features and the last column (selling_price) is the target.
feature_frame=df.iloc[:,0:4]
target=df.iloc[:,-1]
X_train,X_test,y_train,y_test=train_test_split(feature_frame,target,test_size=0.2,random_state=3)

In [67]:
# Preview the training features of the cars split
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
774,Hyundai,110000,Petrol,Second Owner
5975,Toyota,80000,Diesel,Second Owner
7817,Tata,88000,Diesel,First Owner
2889,Maruti,50000,Diesel,First Owner
414,Toyota,68089,Petrol,First Owner


In [68]:
from sklearn.preprocessing import OneHotEncoder

In [69]:
# drop='first' removes the first category of each feature (k-1 encoding)
ohe=OneHotEncoder(drop='first')
# Alternative: ohe=OneHotEncoder(drop='first',dtype=np.int32,sparse=False)
# With sparse=False the encoder returns a dense array directly, so the
# .toarray() conversion is not needed afterwards.
# NOTE(review): in scikit-learn >= 1.2 this argument is named `sparse_output`.

In [70]:
# Learn the categories from the training split only, then reuse the fitted
# encoder for the test split. Calling fit_transform on X_test would re-fit the
# encoder on test data, so the test columns could differ in number and meaning
# from the training columns (and test information would leak into preprocessing).
# .toarray() converts the sparse result into a dense NumPy array.
# NOTE(review): with the default settings, a category that appears only in
# X_test will make transform raise; confirm all levels occur in X_train.
X_train_new=ohe.fit_transform(X_train[['fuel','owner']]).toarray()
X_test_new=ohe.transform(X_test[['fuel','owner']]).toarray()

In [71]:
# One-hot encoded 'fuel' and 'owner' columns of the training split
X_train_new

array([[0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [72]:
# The columns that were not one-hot encoded, as a NumPy array
X_train[['brand','km_driven']].values

array([['Hyundai', 110000],
       ['Toyota', 80000],
       ['Tata', 88000],
       ...,
       ['Audi', 98000],
       ['Nissan', 55000],
       ['Honda', 30000]], dtype=object)

In [73]:
# Horizontally stack the untouched columns (brand, km_driven) with the
# one-hot encoded columns in X_train_new and check the resulting shape
np.hstack((X_train[['brand','km_driven']].values,X_train_new)).shape

(6502, 9)

## 4. OneHotEncoding with Top Categories

In [74]:
# Number of rows per brand
counts=df['brand'].value_counts()


In [75]:
# Brands with `threshold` or fewer rows are pooled into a single 'uncommon' category.
# (The former `df['brand'].nunique` line was removed: without call parentheses it
# only referenced the method and had no effect.)
threshold=100

In [76]:
# Brand names whose row count is at or below the threshold
replace=counts[counts<=threshold].index

In [77]:
# Relabel the rare brands as 'uncommon', one-hot encode the result,
# and show 5 randomly sampled rows (unseeded, so the rows differ between runs)
pd.get_dummies(df['brand'].replace(replace,'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
2489,0,0,0,0,0,0,1,0,0,0,0,0,0
3134,0,0,0,0,1,0,0,0,0,0,0,0,0
7531,0,0,0,0,0,0,0,0,0,1,0,0,0
8094,0,0,0,0,0,0,0,0,0,1,0,0,0
5898,0,0,0,0,0,0,0,0,0,0,1,0,0
