In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the customer dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='customer.csv')

In [4]:
# Preview five random rows; the fixed seed keeps the preview identical on every
# Restart-and-Run-All instead of showing different rows each time.
data.sample(5, random_state=0)

Unnamed: 0,age,gender,review,education,purchased
31,22,Female,Poor,School,Yes
16,59,Male,Poor,UG,Yes
13,57,Female,Average,School,No
42,30,Female,Good,PG,Yes
0,30,Female,Average,School,No


Before encoding, we first check what kind of categorical data each column holds: <b>Nominal</b> or <b>Ordinal</b>.
1. gender: Nominal
2. review: Ordinal
3. education: Ordinal
4. purchased: Nominal

# Important
One thing to keep in mind: before encoding, first split the data into train and test sets; then fit the encoder on the training data only and apply the transform to both the training and the testing data.

In [5]:
from sklearn.model_selection import train_test_split

# 'purchased' is the target; every other column is a feature. 20% of the rows
# are held out for testing, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='purchased'),
    data['purchased'],
    test_size=0.2,
    random_state=0,
)

In [6]:
# Peek at the first five training rows before any encoding is applied.
X_train.head(n=5)

Unnamed: 0,age,gender,review,education
33,89,Female,Good,PG
35,74,Male,Poor,School
26,53,Female,Poor,PG
34,86,Male,Average,School
18,19,Male,Good,School


Now apply the encoding to the ordinal categorical columns.

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Columns to encode. Each category list below is written lowest-to-highest and
# appears in the same order as `cols`, so the encoded numbers follow that rank.
cols = ['review', 'education']
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],  # review
        ['School', 'UG', 'PG'],       # education
    ]
)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train[cols] = oe.fit_transform(X_train[cols])
X_test[cols] = oe.transform(X_test[cols])

In [8]:
X_train.head()

Unnamed: 0,age,gender,review,education
33,89,Female,2.0,2.0
35,74,Male,0.0,0.0
26,53,Female,0.0,2.0
34,86,Male,1.0,0.0
18,19,Male,2.0,0.0


One thing to keep in mind is that <b>Label Encoding</b> is applied only to the <b>output (target) column</b>.

In [9]:
# Peek at the first five raw target labels before encoding.
y_train.head(n=5)

33    Yes
35    Yes
26     No
34     No
18     No
Name: purchased, dtype: object

In [10]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training target only, then apply
# that same mapping to the test target.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [11]:
# Display the training labels after LabelEncoder.transform (now an integer array).
y_train

array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0])

# One Hot Encoding

It is applied on <b>Nominal categorical columns</b>.

In [12]:
# Read the cars dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [18]:
# Preview the first rows of the cars data. Only a cell's last expression is
# rendered, so checks such as df['fuel'].unique() belong in a cell of their own.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [20]:
# One-hot encode 'fuel' and 'owner' with pandas; indicator columns are 0/1 integers.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


To resolve the issue of <b>multicollinearity</b>, we drop the first dummy column of every encoded feature.

In [21]:
# Same pandas one-hot encoding, but drop the first indicator column of each
# encoded feature.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


In [22]:
# get_dummies returned new frames, so df itself still holds the original columns.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


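Pandas one-hot encoding is handy for data analysis, but for machine learning we should use scikit-learn's <b>OneHotEncoder</b>, because it remembers the categories learned from the training data and produces the same columns when transforming the test data.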

In [23]:
from sklearn.model_selection import train_test_split

# 'selling_price' is the target; every other column is a feature. 20% of the
# rows are held out for testing, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='selling_price'),
    df['selling_price'],
    test_size=0.2,
    random_state=0,
)

In [30]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns with scikit-learn. drop='first' removes the
# first indicator of each feature; the encoder is fitted on the training split
# only and then applied to both splits. transform() returns a sparse matrix
# here, hence the .toarray() call.
ohe = OneHotEncoder(dtype=int, drop='first')
ohe.fit(X_train[['fuel', 'owner']])
X_train_new, X_test_new = (
    ohe.transform(split[['fuel', 'owner']]).toarray()
    for split in (X_train, X_test)
)

In [34]:
# Join the untouched columns and the one-hot block side by side (column-wise).
np.concatenate(
    (X_train[['brand', 'km_driven']].to_numpy(), X_train_new),
    axis=1,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Hyundai,60000,0,1,0,0,0,0,0
1,Tata,150000,1,0,0,0,0,0,1
2,Hyundai,110000,1,0,0,0,1,0,0
3,Mahindra,28000,1,0,0,0,1,0,0
4,Maruti,15000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6497,Tata,70000,1,0,0,0,0,0,1
6498,Ford,100000,1,0,0,0,1,0,0
6499,Hyundai,90000,0,0,1,0,1,0,0
6500,Volkswagen,90000,1,0,0,0,0,0,0


In [33]:
# Display the training features; 'fuel' and 'owner' here are still the raw,
# un-encoded columns (the encoded values live in X_train_new).
X_train

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner
...,...,...,...,...
4931,Tata,70000,Diesel,Third Owner
3264,Ford,100000,Diesel,Second Owner
1653,Hyundai,90000,Petrol,Second Owner
2607,Volkswagen,90000,Diesel,First Owner


<b>Important</b>
As we can see, it's a very hectic process to apply different encodings