## Data Encoding

### 1. Nominal/OHE Encoding
### 2. Label and Ordinal Encoding
### 3. Target Guided Ordinal

# 1. Nominal/OHE Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
## Create a sample dataframe
df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue']
})

In [3]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,red
4,blue


In [4]:
##create an instance of OneHotEncoder
encoder=OneHotEncoder()

In [7]:
## perform fit and transform
encoded=encoder.fit_transform(df[['color']]).toarray()

In [10]:
import pandas as pd
encoder_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())

In [11]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


In [13]:
# for new data
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [14]:
pd.concat([df,encoder_df],axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,red,0.0,0.0,1.0
4,blue,1.0,0.0,0.0


In [15]:
import seaborn as sns
sns.load_dataset('tips')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## Label Encoding

In [16]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,red
4,blue


In [17]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder=LabelEncoder()

In [18]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 2, 0])

In [19]:
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [20]:
lbl_encoder.transform([['blue']])

array([0])

In [21]:
lbl_encoder.transform([['green']])

array([1])

### Ordinal Encoding

In [22]:
from sklearn.preprocessing import OrdinalEncoder

In [23]:
# create a sample dataframe with an ordainal variable
df = pd.DataFrame({
    'size': ['small', 'medium', 'large', 'medium', 'small', 'large']
})

In [24]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [25]:
## create an instance of OrdinalEncoder and then fit_transform
encoder=OrdinalEncoder(categories=[['small', 'medium', 'large']])

In [26]:
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [27]:
encoder.transform([['small']])



array([[0.]])

## Target Guided ordinal Encoding

In [2]:
import pandas as pd

# create a sample dataframe with a categorical variable and a target variable.
df = pd.DataFrame({
    'city': ['New Yark', 'London', 'Paris', 'Tokyo', 'New Yark', 'Paris'],
    'price': [200, 150, 300, 250, 180, 320]
})

In [3]:
df

Unnamed: 0,city,price
0,New Yark,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New Yark,180
5,Paris,320


In [4]:
mean_price=df.groupby('city')['price'].mean().to_dict()

In [5]:
mean_price

{'London': 150.0, 'New Yark': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [6]:
df['city_encoded']=df['city'].map(mean_price)

In [7]:
df

Unnamed: 0,city,price,city_encoded
0,New Yark,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New Yark,180,190.0
5,Paris,320,310.0


In [8]:
df[['price', 'city_encoded']]

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0
