# Encoding 
Converting Categorical Data to Numerical Data

### Ordinal Encoding 
Converting ordinal input features (e.g. good, better, best) to integers that preserve their order (e.g. 0, 1, 2)

### Label Encoding 
Converting the categorical output (target) labels to integers

### One Hot Encoding
Converting nominal data (e.g. red, blue, green), which has no inherent order, to binary indicator columns, one per category

# ORDINAL ENCODING

In [2]:
# imports
import numpy as np
import pandas as pd

In [6]:
# Read the customer CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="customer.csv")
df.head(5)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [7]:
# Keep only the review, education and purchased columns.
# Selecting by name (rather than by position with iloc[:, 2:]) keeps this cell
# idempotent: re-running it yields the same three columns instead of slicing
# two more columns off the already-reduced frame.
df = df[["review", "education", "purchased"]]
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [11]:
# Split the frame into features and target in one step:
# every column except the last is an input feature, the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Ordinal encoding of the two input features
from sklearn.preprocessing import OrdinalEncoder

# One category list per feature column, in the same order as the columns of X_train.
# Each list is written from lowest to highest level, because the position of a
# category in its list is the number it is encoded as (first entry -> 0).
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],
        ['School', 'UG', 'PG'],
    ]
).fit(X_train)  # fit() returns the encoder itself; it is fitted on the training split only

# Apply the fitted encoder to both splits
X_train_encoded, X_test_encoded = oe.transform(X_train), oe.transform(X_test)



In [16]:
# Display the raw (still categorical) training features for comparison with the encoded version
X_train

Unnamed: 0,review,education
12,Poor,School
4,Average,UG
37,Average,PG
8,Average,UG
3,Good,PG
6,Good,School
41,Good,PG
46,Poor,PG
47,Good,PG
15,Poor,UG


In [17]:
# Display the encoded training features as returned by oe.transform
X_train_encoded

array([[0., 0.],
       [1., 1.],
       [1., 2.],
       [1., 1.],
       [2., 2.],
       [2., 0.],
       [2., 2.],
       [0., 2.],
       [2., 2.],
       [0., 1.],
       [2., 1.],
       [0., 1.],
       [1., 2.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [1., 1.],
       [0., 2.],
       [2., 2.],
       [1., 0.],
       [1., 1.],
       [2., 1.],
       [2., 1.],
       [0., 1.],
       [1., 2.],
       [2., 2.],
       [0., 2.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [2., 1.],
       [0., 2.],
       [2., 0.],
       [2., 1.],
       [1., 0.],
       [0., 0.],
       [2., 2.],
       [0., 2.],
       [0., 0.],
       [2., 0.]])

In [21]:
# Wrap the encoded arrays in DataFrames so the feature names are kept.
# Each frame is built from its own encoded split: the test frame must come from
# X_test_encoded, not X_train_encoded, otherwise the "test" features would be a
# copy of the training data.
X_train_encoded = pd.DataFrame(X_train_encoded, columns=X_train.columns)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=X_test.columns)
X_train_encoded

Unnamed: 0,review,education
0,0.0,0.0
1,1.0,1.0
2,1.0,2.0
3,1.0,1.0
4,2.0,2.0
5,2.0,0.0
6,2.0,2.0
7,0.0,2.0
8,2.0,2.0
9,0.0,1.0


# Label Encoder 
`LabelEncoder` is meant for the output (target) variable `y`, not for the input features.

In [22]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets only
# (fit() returns the encoder itself)
le = LabelEncoder().fit(y_train)

# Encode both target splits with the same fitted mapping
y_train, y_test = le.transform(y_train), le.transform(y_test)

y_train

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0])