# Data Encoding 
Used for converting Categorical features into Numerical values.

<b>1. Norminal/OHE Encoding</b>

Each category is represented as binary vector, where each bit corresponds to a unique category.


In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

  from pandas.core import (


In [5]:
df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'green', 'blue', 'red']
})

In [6]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,blue


In [7]:
encoder = OneHotEncoder()

In [9]:
# Performing fit and transform
# Alphabetically sorted
encoded = encoder.fit_transform(df[['color']]).toarray()

In [12]:
encoder_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out())

In [13]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0


In [18]:
pd.concat([df, encoder_df], axis = 1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,blue,1.0,0.0,0.0
5,red,0.0,0.0,1.0


<b>2. Label Encoding</b>

Assigning a unique numerical label to each category in the variable.

In [19]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()

In [20]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 0, 2])

In [21]:
# Red has been assigned 2
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

<b>3. Ordinal Encoding</b>

Used to encode categorical data that have an intrinsic order or ranking, based on its position in the order.

In [22]:
from sklearn.preprocessing import OrdinalEncoder

In [25]:
df = pd.DataFrame({
    'size' : ['small', 'medium', 'large', 'medium', 'small', 'large'] 
})

In [26]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [28]:
ord_encoder = OrdinalEncoder(categories = [['small', 'medium', 'large']])

In [29]:
ord_encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

Small got 0, Medium got 1 and Large got 2

<b>4. Target Guided Ordinal Encoding</b>

Used to encode categorical variables based on their relationship with the target variable.

In [34]:
df = pd.DataFrame({
    'city' : ['New York', 'London', 'Paris', 'Tokyo', 'London', 'Paris'],
    'price' : [200, 150, 300, 250, 180, 350]
})

In [35]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,London,180
5,Paris,350


In [37]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [38]:
mean_price

{'London': 165.0, 'New York': 200.0, 'Paris': 325.0, 'Tokyo': 250.0}

In [39]:
df['city_encoded'] = df['city'].map(mean_price)

In [40]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,200.0
1,London,150,165.0
2,Paris,300,325.0
3,Tokyo,250,250.0
4,London,180,165.0
5,Paris,350,325.0
