In [6]:
# Data Encoding in Machine Learning preprocessing is a crucial step that converts categorical data into numerical format that ML algorithms can understand. Here are the main types:

# 1. Label Encoding:
# - Converts categorical values into numerical labels (0,1,2...)
# - Best for ordinal data (categories with order)
# - Example: Small→0, Medium→1, Large→2

# 2. One-Hot Encoding:
# - Creates binary columns for each category
# - Best for nominal data (no order) example single married seperated there is no order no one is better than each other hence no order in data
# - Example: Color(Red,Blue) → [1,0], [0,1]

# 3. Binary Encoding:
# - Converts categories into binary digits
# - Useful for high cardinality data
# - Uses fewer dimensions than one-hot

# 4. Ordinal Encoding: 
# - Similar to label encoding but preserves order
# - Used when categories have meaningful sequence
# - Example: Bad→0, Good→1, Excellent→2

# The choice of encoding method depends on:
# - Data type (ordinal vs nominal)
# - Number of unique categories
# - Algorithm requirements

In [2]:
# Nominal OHE >> Binary vector for each categories  
# example single married seperated
#  single: [1,0,0]
# married : [0,1,0]
# seperated: [0,0,1]

In [82]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
df = pd.DataFrame ({ "status":["single" , "married" , "single" , "seperated" ,"single" , "married" , "single" , "seperated" ]})
df

Unnamed: 0,status
0,single
1,married
2,single
3,seperated
4,single
5,married
6,single
7,seperated


In [84]:
encoder = OneHotEncoder()

In [86]:
encoded = encoder.fit_transform(df[["status"]]).toarray()

In [88]:
encoded # The above given data was hence encoded for the ML preprocessing using One Hot Encoding Technique

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [30]:
encoder.get_feature_names_out()

array(['status_married', 'status_seperated', 'status_single'],
      dtype=object)

In [36]:
encoded_df = pd.DataFrame(encoded , columns=encoder.get_feature_names_out()) 

In [38]:
encoded_df

Unnamed: 0,status_married,status_seperated,status_single
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,0.0,0.0,1.0
7,0.0,1.0,0.0


In [34]:
# The above is the encoded data for ML use.

In [90]:
 pd.concat([df , encoded_df] , axis =1)

Unnamed: 0,status,status_married,status_seperated,status_single
0,single,0.0,0.0,1.0
1,married,1.0,0.0,0.0
2,single,0.0,0.0,1.0
3,seperated,0.0,1.0,0.0
4,single,0.0,0.0,1.0
5,married,1.0,0.0,0.0
6,single,0.0,0.0,1.0
7,seperated,0.0,1.0,0.0


In [50]:
# The above is the final data frame after encoding the data.

In [66]:
import seaborn as sns 

df1 = sns.load_dataset("tips")
df1.reset_index(drop=True, inplace=True)
df1

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [70]:
encoded_days = encoder.fit_transform(df1[["day"]]).toarray()

In [72]:
encoded_days

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [74]:
encoded_days_df = pd.DataFrame(encoded_days , columns = encoder.get_feature_names_out())

In [76]:
encoded_days_df

Unnamed: 0,day_Fri,day_Sat,day_Sun,day_Thur
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,0.0,0.0
240,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,0.0
242,0.0,1.0,0.0,0.0


In [78]:
# Label encoding >> assign unique label to the  categories.

In [92]:
df

Unnamed: 0,status
0,single
1,married
2,single
3,seperated
4,single
5,married
6,single
7,seperated


In [94]:
from sklearn.preprocessing import LabelEncoder
labeler = LabelEncoder()

In [110]:
labeler.fit_transform(df[['status']])

  y = column_or_1d(y, warn=True)


array([2, 0, 2, 1, 2, 0, 2, 1])

In [114]:
labeler.fit_transform([['single']])

  y = column_or_1d(y, warn=True)


array([0])

In [136]:
# ordinal encoding >>
# high school : 1
# graduation : 2
# phd : 3

from sklearn.preprocessing import OrdinalEncoder

df5 = pd.DataFrame({"qualifications" :['High_School','PHD','High_School','Graduate','PHD','High_School','PHD','High_School','Graduate']})

In [138]:
df5

Unnamed: 0,qualifications
0,High_School
1,PHD
2,High_School
3,Graduate
4,PHD
5,High_School
6,PHD
7,High_School
8,Graduate


In [140]:
ordinal = OrdinalEncoder(categories =[['High_School', 'Graduate' , 'PHD']])

In [146]:
ordinal.fit_transform(df5[['qualifications']])

array([[0.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [1.]])

In [157]:
# Target Guided ordinal encoding
# Used when a lot of categories are there
# Show relationship between target variable
# Replace the category with mean and median of respective group

In [163]:
df = pd.DataFrame({'time':['lunch','breakfast','dinner' , 'dinner'] , 'total_bill':[100 ,200 ,300 ,500]})

In [165]:
df

Unnamed: 0,time,total_bill
0,lunch,100
1,breakfast,200
2,dinner,300
3,dinner,500


In [169]:
mean_price = df.groupby('time')['total_bill'].mean().to_dict()

In [186]:
df['time'].map(mean_price)

0    100.0
1    200.0
2    400.0
3    400.0
Name: time, dtype: float64

In [188]:
df["time_encoded"] = df['time'].map(mean_price)

In [190]:
df

Unnamed: 0,time,total_bill,time_encoded
0,lunch,100,100.0
1,breakfast,200,200.0
2,dinner,300,400.0
3,dinner,500,400.0
