# Label Encoding 
Label encoding is a technique for converting categorical columns into numerical ones so that they can be used by machine-learning models that only accept numerical input. It is an important pre-processing step in a machine-learning project.

In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

## Example 1

In [2]:
# Toy phone dataset: two categorical features (Screen, Model) and a
# Yes/No target (Defect), five rows in total.
data = dict(
    Screen=['Big', 'Big', 'Medium', 'Small', 'Small'],
    Model=['iPhone 12', 'Galaxy S20', 'Pixel 5', 'iPhone 12', 'Galaxy S20'],
    Defect=['Yes', 'No', 'No', 'No', 'Yes'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Screen,Model,Defect
0,Big,iPhone 12,Yes
1,Big,Galaxy S20,No
2,Medium,Pixel 5,No
3,Small,iPhone 12,No
4,Small,Galaxy S20,Yes


In [3]:
# Ordinal-encode Screen by hand: Small < Medium < Big.
# The result is assigned back to the column rather than calling
# `df.Screen.replace(..., inplace=True)`: an in-place method on a column
# selected from the frame is chained assignment, which pandas warns about and
# which no longer updates `df` once Copy-on-Write is active.
# `map` turns any value that is missing from the mapping into NaN, so run this
# cell once, on the raw string labels.
screen_codes = {'Small': 0, 'Medium': 1, 'Big': 2}
df['Screen'] = df['Screen'].map(screen_codes)
df

Unnamed: 0,Screen,Model,Defect
0,2,iPhone 12,Yes
1,2,Galaxy S20,No
2,1,Pixel 5,No
3,0,iPhone 12,No
4,0,Galaxy S20,Yes


In [4]:
# Binary-encode the Defect target: 'Yes' -> 0, 'No' -> 1.
# The mapped column is assigned back to `df` instead of using
# `df.Defect.replace(..., inplace=True)`; an in-place method on a selected
# column is chained assignment and does not update `df` under Copy-on-Write.
# NOTE(review): this makes "no defect" the class labelled 1 - confirm that is
# the intended positive class before training a model on it.
defect_codes = {'Yes': 0, 'No': 1}
df['Defect'] = df['Defect'].map(defect_codes)
df

Unnamed: 0,Screen,Model,Defect
0,2,iPhone 12,0
1,2,Galaxy S20,1
2,1,Pixel 5,1
3,0,iPhone 12,1
4,0,Galaxy S20,0


# One hot encoding

In [5]:
# One-hot encode the frame. Screen and Defect were already turned into
# integers above, so only Model is expanded into one indicator column per
# phone model (Model_Galaxy S20, Model_Pixel 5, Model_iPhone 12).
df = pd.get_dummies(df)
df

Unnamed: 0,Screen,Defect,Model_Galaxy S20,Model_Pixel 5,Model_iPhone 12
0,2,0,False,False,True
1,2,1,True,False,False
2,1,1,False,True,False
3,0,1,False,False,True
4,0,0,True,False,False


# Imbalanced Data
A dataset is imbalanced when one class of the target has far fewer samples than the others, so the model has little data from which to learn to predict that class.

In [6]:
# List the column names after encoding, to pick the target and the features.
df.columns

Index(['Screen', 'Defect', 'Model_Galaxy S20', 'Model_Pixel 5',
       'Model_iPhone 12'],
      dtype='object')

In [7]:
# Split the encoded frame into the feature matrix (x) and the target (y).
feature_columns = ['Screen', 'Model_Galaxy S20', 'Model_Pixel 5', 'Model_iPhone 12']
x = df[feature_columns]
y = df['Defect']

## Oversampling

In [8]:
# Random oversampling: rows of the minority class are duplicated at random
# until both classes have the same number of samples.
from imblearn.over_sampling import RandomOverSampler

# Fixed seed so the same rows are duplicated on every run of the notebook.
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(x, y)

In [9]:
x.shape,x_ros.shape # rows grow from 5 to 6: one minority-class row was duplicated

((5, 4), (6, 4))

In [10]:
y.value_counts(),y_ros.value_counts() # class counts before vs. after oversampling

(Defect
 1    3
 0    2
 Name: count, dtype: int64,
 Defect
 0    3
 1    3
 Name: count, dtype: int64)

# Example 2

In [11]:
# Single-column frame of six first names to label-encode.
data = dict(Name=['Ashish', 'Harsh', 'Hemant', 'Gaurav', 'Aman', 'Kushagra'])
df1 = pd.DataFrame(data)
df1

Unnamed: 0,Name
0,Ashish
1,Harsh
2,Hemant
3,Gaurav
4,Aman
5,Kushagra


In [12]:
# Label-encode the names with scikit-learn: learn the distinct labels first,
# then translate every name into its integer code.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df1['Name'])
df1['en_name'] = le.transform(df1['Name'])

In [13]:
# Show each name next to the integer code stored in en_name.
df1

Unnamed: 0,Name,en_name
0,Ashish,1
1,Harsh,3
2,Hemant,4
3,Gaurav,2
4,Aman,0
5,Kushagra,5


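**Note:** `LabelEncoder` assigns the integer codes in sorted (alphabetical) order of the labels, so the numbers above do not represent any meaningful ranking of the names.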

## Label encoding using Real world dataset

In [14]:
# Load the loan-eligibility CSV directly from its GitHub URL and preview it.
LOAN_DATA_URL = 'https://github.com/YBI-Foundation/Dataset/raw/main/Loan%20Eligibility%20Prediction.csv'
dataset = pd.read_csv(LOAN_DATA_URL)
dataset.head()

Unnamed: 0,Customer_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,569,Female,No,0,Graduate,No,2378,0.0,9,360,1,Urban,N
1,15,Male,Yes,2,Graduate,No,1299,1086.0,17,120,1,Urban,Y
2,95,Male,No,0,Not Graduate,No,3620,0.0,25,120,1,Semiurban,Y
3,134,Male,Yes,0,Graduate,Yes,3459,0.0,25,120,1,Semiurban,Y
4,556,Male,Yes,1,Graduate,No,5468,1032.0,26,360,1,Semiurban,Y


In [15]:
# Label-encode Property_Area into a new column, keeping the original text
# column alongside it for comparison.
le1 = LabelEncoder()
le1.fit(dataset['Property_Area'])
dataset['en_property_area'] = le1.transform(dataset['Property_Area'])
dataset.head()

Unnamed: 0,Customer_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,en_property_area
0,569,Female,No,0,Graduate,No,2378,0.0,9,360,1,Urban,N,2
1,15,Male,Yes,2,Graduate,No,1299,1086.0,17,120,1,Urban,Y,2
2,95,Male,No,0,Not Graduate,No,3620,0.0,25,120,1,Semiurban,Y,1
3,134,Male,Yes,0,Graduate,Yes,3459,0.0,25,120,1,Semiurban,Y,1
4,556,Male,Yes,1,Graduate,No,5468,1032.0,26,360,1,Semiurban,Y,1


In [16]:
# Frequency of each original Property_Area label.
dataset['Property_Area'].value_counts()

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

In [17]:
# Frequency of each encoded value; the counts should match the label counts
# of Property_Area one-to-one.
dataset['en_property_area'].value_counts()

en_property_area
1    233
2    202
0    179
Name: count, dtype: int64

# Ordinal Encoding 
Ordinal encoding is a form of label encoding for categorical variables whose categories have a clear, natural order (for example `s < m < l < xl`); the integer codes are assigned so that they preserve that order.

It can be performed in 2 ways :
1. Using sklearn library
2. Using map functions

In [27]:
# Ten clothing sizes drawn from four ordered categories (s, m, l, xl).
data = dict(Size=['s', 'm', 'l', 'xl', 'm', 's', 'xl', 'l', 's', 'xl'])
df2 = pd.DataFrame(data)
df2

Unnamed: 0,Size
0,s
1,m
2,l
3,xl
4,m
5,s
6,xl
7,l
8,s
9,xl


In [28]:
# Size categories from smallest to largest. This is passed as `categories=`
# to OrdinalEncoder below; the outer list holds one inner list for the single
# column (Size) being encoded.
ord_data = [['s','m','l','xl']]

## Using sklearn library 

In [29]:
from sklearn.preprocessing import OrdinalEncoder
# Pass the explicit category order so the codes follow s < m < l < xl.
oe = OrdinalEncoder(categories=ord_data)

In [31]:
# Fit on a one-column DataFrame (double brackets), not on a 1-D Series.
oe.fit(df2[["Size"]])

In [34]:
# Store the encoded sizes in a new column; the codes come back as floats
# (0.0, 1.0, 2.0, 3.0).
df2['en_Size']=oe.transform(df2[['Size']])

In [36]:
# Show each size next to its ordinal code in en_Size.
df2

Unnamed: 0,Size,en_Size
0,s,0.0
1,m,1.0
2,l,2.0
3,xl,3.0
4,m,1.0
5,s,0.0
6,xl,3.0
7,l,2.0
8,s,0.0
9,xl,3.0


## Using Map function

In [37]:
# Explicit size -> rank lookup, smallest (0) to largest (3).
ord_data1 = {'s':0,'m':1,'l':2,'xl':3}

In [39]:
# Look every size up in the dictionary; unlike the OrdinalEncoder column,
# the resulting codes are integers.
df2['en_Size_map'] = df2['Size'].map(ord_data1)

In [40]:
# Compare the sklearn codes (en_Size) with the map-based codes (en_Size_map).
df2

Unnamed: 0,Size,en_Size,en_Size_map
0,s,0.0,0
1,m,1.0,1
2,l,2.0,2
3,xl,3.0,3
4,m,1.0,1
5,s,0.0,0
6,xl,3.0,3
7,l,2.0,2
8,s,0.0,0
9,xl,3.0,3


## Ordinal encoding using Real world dataset 

In [43]:
# Load a fresh copy of the loan-eligibility CSV from its GitHub URL, so this
# section starts from the raw columns, and preview it.
LOAN_DATA_URL = 'https://github.com/YBI-Foundation/Dataset/raw/main/Loan%20Eligibility%20Prediction.csv'
dataset1 = pd.read_csv(LOAN_DATA_URL)
dataset1.head()

Unnamed: 0,Customer_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,569,Female,No,0,Graduate,No,2378,0.0,9,360,1,Urban,N
1,15,Male,Yes,2,Graduate,No,1299,1086.0,17,120,1,Urban,Y
2,95,Male,No,0,Not Graduate,No,3620,0.0,25,120,1,Semiurban,Y
3,134,Male,Yes,0,Graduate,Yes,3459,0.0,25,120,1,Semiurban,Y
4,556,Male,Yes,1,Graduate,No,5468,1032.0,26,360,1,Semiurban,Y


In [45]:
# Ordinal encoding using the map function.
# First list the distinct Property_Area labels that the mapping must cover.
dataset1['Property_Area'].value_counts()

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

In [46]:
# Hand-written rank for each area: Rural (0) < Semiurban (1) < Urban (2).
# These are the same codes LabelEncoder produced for this column earlier.
sorted_data = {'Rural':0,'Semiurban':1,'Urban':2}

In [47]:
# Translate each Property_Area label into its rank via the dictionary.
dataset1['encoded_property_area'] = dataset1['Property_Area'].map(sorted_data)

In [48]:
# Preview the frame with the new encoded_property_area column at the end.
dataset1.head()

Unnamed: 0,Customer_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,encoded_property_area
0,569,Female,No,0,Graduate,No,2378,0.0,9,360,1,Urban,N,2
1,15,Male,Yes,2,Graduate,No,1299,1086.0,17,120,1,Urban,Y,2
2,95,Male,No,0,Not Graduate,No,3620,0.0,25,120,1,Semiurban,Y,1
3,134,Male,Yes,0,Graduate,Yes,3459,0.0,25,120,1,Semiurban,Y,1
4,556,Male,Yes,1,Graduate,No,5468,1032.0,26,360,1,Semiurban,Y,1
