In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the customer dataset from CSV into a DataFrame.
df = pd.read_csv('/content/customer.csv')

In [3]:
# Preview the first five rows to check the column layout.
df.head()


Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [4]:
# Count missing values per column before encoding anything.
df.isnull().sum()

age          0
gender       0
review       0
education    0
purchased    0
dtype: int64

# Ordinal Encoding

In [5]:
# Inspect the two columns that will be ordinally encoded.
df[['review', 'education']]


Unnamed: 0,review,education
0,Average,School
1,Poor,UG
2,Good,PG
3,Good,PG
4,Average,UG
5,Average,School
6,Good,School
7,Poor,School
8,Average,UG
9,Good,UG


In [6]:
# List the distinct review labels so an explicit order can be passed to the encoder.
df['review'].unique()

array(['Average', 'Poor', 'Good'], dtype=object)

In [7]:
# List the distinct education labels so an explicit order can be passed to the encoder.
df['education'].unique()

array(['School', 'UG', 'PG'], dtype=object)

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Give the encoder an explicit category order per column; a label's position
# in its list is the number it is encoded as.
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],   # order for 'review'
        ['School', 'UG', 'PG'],        # order for 'education'
    ]
)
oe.fit(df[['review', 'education']])
# Overwrite the string columns with their ordinal codes.
df[['review', 'education']] = oe.transform(df[['review', 'education']])

In [9]:
# Show the category order held by the fitted encoder, one array per encoded column.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [10]:
# Check the result: both columns now hold numeric codes instead of strings.
df[['review', 'education']]

Unnamed: 0,review,education
0,1.0,0.0
1,0.0,1.0
2,2.0,2.0
3,2.0,2.0
4,1.0,1.0
5,1.0,0.0
6,2.0,0.0
7,0.0,0.0
8,1.0,1.0
9,2.0,1.0


In [11]:
# Full frame after ordinal encoding; gender and purchased are still strings.
df

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,1.0,0.0,No
1,68,Female,0.0,1.0,No
2,70,Female,2.0,2.0,No
3,72,Female,2.0,2.0,No
4,16,Female,1.0,1.0,No
5,31,Female,1.0,0.0,Yes
6,18,Male,2.0,0.0,No
7,60,Female,0.0,0.0,Yes
8,65,Female,1.0,1.0,No
9,74,Male,2.0,1.0,Yes


# Label Encoding

In [12]:
# Label encoding is used for the output (target) column only.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['purchased'])
# Replace the target strings with their integer class labels.
df['purchased'] = le.transform(df['purchased'])

In [13]:
# Classes learned by the label encoder; a class's position is its encoded integer.
le.classes_

array(['No', 'Yes'], dtype=object)

In [14]:
# Full frame after label-encoding the target column.
df

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,1.0,0.0,0
1,68,Female,0.0,1.0,0
2,70,Female,2.0,2.0,0
3,72,Female,2.0,2.0,0
4,16,Female,1.0,1.0,0
5,31,Female,1.0,0.0,1
6,18,Male,2.0,0.0,0
7,60,Female,0.0,0.0,1
8,65,Female,1.0,1.0,0
9,74,Male,2.0,1.0,1


# One-Hot Encoding

In [15]:
# Frequency of each gender category before one-hot encoding.
df['gender'].value_counts()


Female    29
Male      21
Name: gender, dtype: int64

In [16]:
# One-hot encode with pandas dummy variables: the gender column is replaced by
# one indicator column per category in a new DataFrame.
dummies = pd.get_dummies(data=df, columns=['gender'])
dummies

Unnamed: 0,age,review,education,purchased,gender_Female,gender_Male
0,30,1.0,0.0,0,1,0
1,68,0.0,1.0,0,1,0
2,70,2.0,2.0,0,1,0
3,72,2.0,2.0,0,1,0
4,16,1.0,1.0,0,1,0
5,31,1.0,0.0,1,1,0
6,18,2.0,0.0,0,0,1
7,60,0.0,0.0,1,1,0
8,65,1.0,1.0,0,1,0
9,74,2.0,1.0,1,0,1


In [17]:
# The original frame still has its gender column: get_dummies returned a new DataFrame.
df

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,1.0,0.0,0
1,68,Female,0.0,1.0,0
2,70,Female,2.0,2.0,0
3,72,Female,2.0,2.0,0
4,16,Female,1.0,1.0,0
5,31,Female,1.0,0.0,1
6,18,Male,2.0,0.0,0
7,60,Female,0.0,0.0,1
8,65,Female,1.0,1.0,0
9,74,Male,2.0,1.0,1


In [18]:
# Using scikit-learn's OneHotEncoder

In [19]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes the first category so the remaining column(s) are not
# redundant; for gender this leaves a single indicator column.
# The old `sparse=False` argument was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed `sparse_output`). To run on any version, keep the
# default sparse output and densify it explicitly with .toarray().
ohe = OneHotEncoder(drop='first', dtype=np.int32)
df[['gender']] = ohe.fit_transform(df[['gender']]).toarray()



In [None]:
# Full frame after one-hot encoding: gender is now a single numeric indicator column.
df

Unnamed: 0,age,gender,review,education,purchased
0,30,0,1.0,0.0,0
1,68,0,0.0,1.0,0
2,70,0,2.0,2.0,0
3,72,0,2.0,2.0,0
4,16,0,1.0,1.0,0
5,31,0,1.0,0.0,1
6,18,1,2.0,0.0,0
7,60,0,0.0,0.0,1
8,65,0,1.0,1.0,0
9,74,1,2.0,1.0,1


# Column Transformer

In [21]:
# Reload the raw CSV so this section starts from un-encoded data
# (the cells above overwrote df's columns in place).
df = pd.read_csv('/content/customer.csv')

In [22]:
# Confirm the frame is back to its raw string categories.
df

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No
5,31,Female,Average,School,Yes
6,18,Male,Good,School,No
7,60,Female,Poor,School,Yes
8,65,Female,Average,UG,No
9,74,Male,Good,UG,Yes


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Features are every column except the
# target 'purchased'. random_state pins the shuffle so the same rows land in
# train/test on every Restart & Run All; without it the split changes per run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['purchased']),
    df['purchased'],
    test_size=0.2,
    random_state=42,
)

In [28]:
# Re-display of the raw frame. NOTE(review): this cell's execution count (28) is
# out of sequence with its neighbours, so it was run after the cells below it.
df


Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No
5,31,Female,Average,School,Yes
6,18,Male,Good,School,No
7,60,Female,Poor,School,Yes
8,65,Female,Average,UG,No
9,74,Male,Good,UG,Yes


In [24]:
from sklearn.compose import ColumnTransformer

In [25]:
# Apply each encoder to its own columns in a single step:
#   tf1 - ordinal-encode review/education with an explicit category order
#   tf2 - one-hot encode gender, dropping the first category
# remainder='passthrough' keeps the untouched columns in the output.
# The removed `sparse=False` argument (deprecated in scikit-learn 1.2, gone in
# 1.4) is replaced by sparse_threshold=0, which makes the ColumnTransformer
# always return a dense array regardless of what the encoders emit.
tf = ColumnTransformer(
    [
        (
            'tf1',
            OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']]),
            ['review', 'education'],
        ),
        ('tf2', OneHotEncoder(drop='first'), ['gender']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)
tf

In [26]:
# Fit the transformer on the training split and check the encoded shape (rows, columns).
tf.fit_transform(X_train).shape




(40, 4)

In [27]:
# Apply the already-fitted transformer to the test split (transform only, no refit).
tf.transform(X_test).shape

(10, 4)