In [1]:
# import numpy for numerical operations
import numpy as np

# import pandas for data manipulation
import pandas as pd

In [2]:
# load the customer dataset from a csv file
df = pd.read_csv('customer.csv')

In [3]:
# display 5 random rows from the dataframe
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
15,75,Male,Poor,UG,No
39,76,Male,Poor,PG,No
44,77,Female,Average,UG,No
0,30,Female,Average,School,No
2,70,Female,Good,PG,No


Categorical Values:

  | Column | Types | Category | Encoder |
  |---|---|---|---|
  | Gender | Nominal | Male, Female | OneHotEncoder |
  | Review | Ordinal | Good, Average, Poor | OrdinalEncoder |
  | Education | Ordinal | School, UG, PG | OrdinalEncoder |
  | Purchased(OUtput) | Nominal | Yes, No | LabelEncoder |

In [4]:
# drop the first two columns from the dataframe
df = df.iloc[:, 2:]

In [5]:
# display the first 10 rows of the dataframe
df.head(10)

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes


## **Train Test Split**

In [6]:
# import train_test_split function
from sklearn.model_selection import train_test_split

# split input features (columns 0 and 1) and target (last column)
# 80% data for training, 20% for testing
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:2],    # selecting first two columns as features
    df.iloc[:, -1],     # selecting last column as target
    test_size=0.2      # 20% data used for testing
)

In [7]:
# display the training feature set
x_train

Unnamed: 0,review,education
33,Good,PG
31,Poor,School
44,Average,UG
19,Poor,PG
1,Poor,UG
13,Average,School
18,Good,School
25,Good,School
15,Poor,UG
30,Average,UG


In [8]:
# check the shape (rows, columns) of training and testing sets
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((40, 2), (10, 2), (40,), (10,))

## **Ordinal Encoding**

In [9]:
# import ordinalencoder to convert categorical features into numerical values
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# create ordinalencoder with a custom order for two categorical columns
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [11]:
# fit the encoder on the training data
oe.fit(x_train)

In [12]:
# apply ordinal encoding to the training features
x_train = oe.transform(x_train)

# apply the same encoding to the testing features
x_test = oe.transform(x_test)

In [13]:
# display the encoded training feature data
x_train

array([[2., 2.],
       [0., 0.],
       [1., 1.],
       [0., 2.],
       [0., 1.],
       [1., 0.],
       [2., 0.],
       [2., 0.],
       [0., 1.],
       [1., 1.],
       [0., 0.],
       [2., 2.],
       [2., 0.],
       [0., 0.],
       [2., 1.],
       [0., 2.],
       [2., 0.],
       [2., 0.],
       [2., 1.],
       [1., 2.],
       [2., 2.],
       [2., 1.],
       [1., 0.],
       [0., 2.],
       [2., 1.],
       [1., 0.],
       [0., 0.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [2., 2.],
       [0., 0.],
       [2., 1.],
       [2., 0.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [0., 2.],
       [2., 1.],
       [1., 0.]])

In [14]:
# check the categories used in the encoder
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

## **Label Encoding**

In [15]:
# import labelencoder to convert target labels into numeric form
from sklearn.preprocessing import LabelEncoder

In [16]:
# create a labelencoder instance
le = LabelEncoder()

In [17]:
# fit the label encoder on the training labels
le.fit(y_train)

In [18]:
# display the class labels learned by the encoder
le.classes_

array(['No', 'Yes'], dtype=object)

In [19]:
# encode training labels as integers
y_train = le.transform(y_train)

# encode testing labels using the same mapping
y_test = le.transform(y_test)

In [20]:
# display the encoded training labels
y_train

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1])