In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the customer dataset from a CSV file given by a relative path
# (resolved against the notebook's working directory).
df = pd.read_csv('customer.csv')

In [3]:
# Preview the first rows of the raw data to see which columns are available.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


# Note:

***Ordinal Encoding on Categorical Data***

We will use **Ordinal Encoding** on categorical data. There are two types of categorical data:

### a. Nominal Data  
Just labels or names without any order.  
**Example:** Colors (Red, Blue, Green), Gender (Male, Female).

### b. Ordinal Data  
Has a meaningful order or ranking, but the differences between values are not necessarily equal.  
**Example:** Education Level (High School, Bachelor’s, Master’s), Customer Satisfaction (Low, Medium, High).

- Ordinal encoding applies only to ordinal data, so we drop the numerical column (`age`) and the nominal column (`gender`) and keep the ordinal features (`review`, `education`) together with the target (`purchased`).

In [5]:
# Keep only the two ordinal features and the target. `age` (numerical) and
# `gender` (nominal) are dropped because ordinal encoding does not apply to them.
# Selecting by column name instead of `iloc[:, 2:]` makes this cell idempotent:
# re-running it no longer strips two more columns from `df` each time.
df = df[['review', 'education', 'purchased']]

In [45]:
# Preview up to the first 30 rows of the reduced frame.
df.head(30)

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes


👇 এই কোডটি scikit-learn লাইব্রেরির train_test_split ফাংশন ব্যবহার করে ডেটাসেটকে ট্রেনিং এবং টেস্ট সেটে ভাগ করছে।

ব্যাখ্যা:
লাইব্রেরি ইমপোর্ট
from sklearn.model_selection import train_test_split
→ train_test_split ফাংশনটি ইমপোর্ট করা হয়েছে, যা ডেটাসেটকে ট্রেন ও টেস্ট সেটে বিভক্ত করতে ব্যবহৃত হয়।

ফিচার (X) এবং লেবেল (y) আলাদা করা
df.drop(columns=['purchased'])
→ df ডেটাফ্রেম থেকে purchased কলামটি বাদ দিয়ে বাকি কলামগুলো (review এবং education) X (ফিচার) হিসেবে নেয়া হয়েছে।
df['purchased']
→ purchased কলামটি (শেষ কলাম) y (লেবেল) হিসেবে নেয়া হয়েছে।

ডেটাসেট বিভক্ত করা
train_test_split(X, y, test_size=0.2)
→ এখানে ডেটাসেটকে ৮০% ট্রেনিং ও ২০% টেস্ট সেটে ভাগ করা হয়েছে।

ভেরিয়েবলগুলোতে সংরক্ষণ
X_train, X_test, y_train, y_test = train_test_split(...)
→ ফাংশনটি চারটি অংশ রিটার্ন করে:

X_train = ট্রেনিং ডেটার ফিচার
X_test = টেস্ট ডেটার ফিচার
y_train = ট্রেনিং ডেটার লেবেল
y_test = টেস্ট ডেটার লেবেল
সংক্ষেপে:
এই কোড ডেটাসেটকে ট্রেন ও টেস্ট সেটে ভাগ করে, যেখানে ফিচার (X) এবং লেবেল (y) আলাদা করা হয় এবং ৮০% ট্রেনিং, ২০% টেস্ট হিসেবে ভাগ করা হয়।

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `random_state` pins the shuffle so a
# fresh Restart & Run All reproduces the same train/test partition; without it
# every run draws a different split and the downstream outputs change.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['purchased']),  # features: every column except the target
    df['purchased'],                 # target label
    test_size=0.2,
    random_state=42,
)


In [9]:
# Display the training features produced by the split (still string categories).
X_train

Unnamed: 0,review,education
28,Poor,School
35,Poor,School
16,Poor,UG
43,Poor,PG
46,Poor,PG
6,Good,School
15,Poor,UG
19,Poor,PG
37,Average,PG
21,Average,PG


In [10]:
from sklearn.preprocessing import OrdinalEncoder

In [11]:
# Spell out the low -> high order of each ordinal feature:
# 'Poor' is the lowest review level and 'Good' the highest; likewise 'School'
# is the lowest education level and 'PG' the highest.
review_levels = ['Poor', 'Average', 'Good']
education_levels = ['School', 'UG', 'PG']

# One category list per feature column, in the column order review, education.
oe = OrdinalEncoder(categories=[review_levels, education_levels])

In [12]:
# Fit the encoder on the training features only; the test features are
# transformed later with this same fitted encoder.
oe.fit(X_train)

In [13]:
# Encode both feature splits with the already-fitted encoder and rebind each
# name to the encoder's output.
# NOTE(review): because the inputs are overwritten, re-running this cell would
# feed already-encoded arrays back into the encoder; re-run from the split cell.
X_train, X_test = (oe.transform(split) for split in (X_train, X_test))

In [14]:
# Display the training features after ordinal encoding
# (X_train now holds the encoder's output rather than the original frame).
X_train

array([[0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 2.],
       [0., 2.],
       [2., 0.],
       [0., 1.],
       [0., 2.],
       [1., 2.],
       [1., 2.],
       [0., 2.],
       [1., 1.],
       [1., 0.],
       [2., 2.],
       [1., 2.],
       [0., 0.],
       [1., 0.],
       [2., 1.],
       [0., 2.],
       [2., 2.],
       [2., 2.],
       [1., 0.],
       [0., 0.],
       [1., 1.],
       [2., 0.],
       [0., 2.],
       [1., 1.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [1., 1.],
       [1., 1.],
       [0., 1.],
       [2., 0.],
       [0., 2.],
       [2., 1.],
       [2., 2.],
       [2., 1.],
       [1., 1.],
       [0., 1.]])

In [15]:
# Show the category order the encoder uses for each feature column.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

### We need to label encode the output column

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
le = LabelEncoder() # No arguments are passed here.

In [19]:
# Learn the label classes from the training targets only.
le.fit(y_train)

In [20]:
le.classes_ # Check which value each label received: 'No' maps to 0 and 'Yes' maps to 1.

array(['No', 'Yes'], dtype=object)

In [21]:
# Convert the string labels of both splits to integer codes with the fitted
# label encoder, rebinding each name to the encoded array.
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [41]:
# Display the encoded training labels.
y_train

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0])