# Encoding: Transforming the categorical data into numeric data 

## Types of Encoding
1. Ordinal Encoding
2. Label Encoding
3. One Hot Encoding

## Ordinal Encoding
Used to encode independent ordinal features

In [125]:
import numpy as np
import pandas as pd

In [126]:
# load the customer dataset (the path is relative to the notebook's working directory)
df = pd.read_csv("Data/customer.csv")

# preview 5 random rows; random_state is fixed so the same rows are shown on every run
df.sample(5, random_state = 0)

Unnamed: 0,age,gender,review,education,purchased
23,96,Female,Good,School,No
37,94,Male,Average,PG,Yes
44,77,Female,Average,UG,No
15,75,Male,Poor,UG,No
25,57,Female,Good,School,No


In [127]:
# keep only the columns used in this section (drops age and gender)
# selecting by name instead of by position keeps this cell idempotent:
# re-running it returns the same three columns rather than slicing off two more each time
df = df[['review', 'education', 'purchased']]

In [128]:
# preview 5 random rows of the reduced frame; random_state is fixed so the preview is reproducible
df.sample(5, random_state = 0)

Unnamed: 0,review,education,purchased
46,Poor,PG,No
20,Average,School,Yes
27,Poor,PG,No
38,Good,School,No
29,Average,UG,Yes


In [129]:
# separating the independent features (review, education) from the dependent feature (purchased)
X = df[['review', 'education']]
y = df['purchased']

In [130]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as testing data
# random_state fixes the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [131]:
from sklearn.preprocessing import OrdinalEncoder

# creating OrdinalEncoder object
# categories takes one list per column of X, in column order, and each list gives
# the values in the order you want them encoded (first value -> 0, second -> 1, ...)
oe = OrdinalEncoder(
    categories = [
        ['Poor', 'Average', 'Good'],   # review
        ['School', 'UG', 'PG'],        # education
    ]
)

# fit learns the encoding from the training data only
oe.fit(X_train)

# transform applies the learned encoding to both the training and the testing data
X_train_encoded = oe.transform(X_train)
X_test_encoded = oe.transform(X_test)

In [132]:
# list down the categories of the independent features
# one array per encoded column (review, then education), in the order passed to the encoder
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

## Label Encoding
Used to encode the output/dependent feature

In [133]:
from sklearn.preprocessing import LabelEncoder

# creating LabelEncoder object and fitting it on the training labels
# (fit returns the fitted encoder, so both steps can be written as one expression)
le = LabelEncoder().fit(y_train)

# transform applies the mapping learned from the training labels to both splits
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# NOTE: unlike ordinal encoding, you cannot decide the order of the labels here;
# the encoder picks the order itself (inspect le.classes_ to see it)

In [134]:
# list down the labels/classes of the dependent feature
# the position of a label in this array is the integer it is encoded as
le.classes_

array(['No', 'Yes'], dtype=object)

## One Hot Encoding
Used to encode independent nominal features

In [135]:
import numpy as np
import pandas as pd

In [136]:
# load the cars dataset (the path is relative to the notebook's working directory)
# NOTE: this reuses the name df, so the customer data loaded earlier is no longer available
df = pd.read_csv("Data/cars.csv")

# preview 5 random rows; random_state is fixed so the same rows are shown on every run
df.sample(5, random_state = 0)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
1142,Honda,56494,Petrol,First Owner,550000
6072,Mahindra,110000,Diesel,First Owner,250000
6675,Hyundai,10000,Petrol,First Owner,610000
3831,Jaguar,9000,Diesel,First Owner,2711000
5807,Ford,163000,Diesel,First Owner,180000


In [137]:
# separating the independent features (brand, km_driven, fuel, owner) from the dependent feature (selling_price)
X = df[['brand', 'km_driven', 'fuel', 'owner']]
y = df['selling_price']

In [138]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as testing data
# random_state fixes the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [139]:
from sklearn.preprocessing import OneHotEncoder

# nominal columns that get one-hot encoded; brand and km_driven are left as they are
nominal_cols = ['fuel', 'owner']

# creating OneHotEncoder object
# drop = 'first' drops the first category of every feature to avoid multicollinearity
# sparse_output = False makes the encoder return a dense numpy array instead of a sparse matrix
ohe = OneHotEncoder(drop = 'first', sparse_output = False)

# learn the categories from the training data only, then apply the same encoding to the testing data
# NOTE: with the default handle_unknown setting, transform raises an error if the
# testing data contains a category that was not seen during fit
X_train_encoded = ohe.fit_transform(X_train[nominal_cols])
X_test_encoded = ohe.transform(X_test[nominal_cols])

X_train_encoded.shape

(6502, 7)

In [140]:
# take the two columns that were not one-hot encoded (brand, km_driven) as a numpy array
# and join them side by side with the one-hot encoded columns
# NOTE: X_train is overwritten with a numpy array here, so this cell cannot be re-run
# on its own; re-run the train/test split cell first to get the DataFrame back
# NOTE: only the training data is combined in this cell
X_train = np.hstack((X_train.iloc[:, :2].values, X_train_encoded))

X_train.shape

(6502, 9)

In [142]:
# wrap the combined array in a DataFrame so it can be inspected as a table
# no column names are passed, so the columns get the default integer labels
X_train = pd.DataFrame(X_train)

In [144]:
# first five rows: columns 0 and 1 are brand and km_driven, columns 2 to 8 are the one-hot encoded fuel/owner columns
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Hyundai,60000,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Tata,150000,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Hyundai,110000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Mahindra,28000,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Maruti,15000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
