### Data Encoding

#### 1. Nominal / One Hot Encoding

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Create simple dataframe
df = pd.DataFrame({
    'colors': ['red','blue','green','green','red','blue']
})

In [4]:
df

Unnamed: 0,colors
0,red
1,blue
2,green
3,green
4,red
5,blue


In [5]:
# Create instance of OneHotEncoder class
encoder = OneHotEncoder()

In [11]:
encoded = encoder.fit_transform(df[['colors']]).toarray()

In [12]:
# As you can see array is sorted according to alphabet
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [17]:
encoded_df = pd.DataFrame(encoded,columns=encoder.get_feature_names_out())

In [18]:
encoded_df

Unnamed: 0,colors_blue,colors_green,colors_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [20]:
# Whenever you get new data 
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [21]:
pd.concat([df,encoded_df],axis=1)

Unnamed: 0,colors,colors_blue,colors_green,colors_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


In [22]:
import seaborn as sns
df = sns.load_dataset('tips')

In [24]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [28]:
encoded = encoder.fit_transform(df[['sex','smoker','day','time']]).toarray()

In [30]:
encoded_df = pd.DataFrame(encoded,columns=encoder.get_feature_names_out())

In [32]:
encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [33]:
pd.concat([df[['sex','smoker','day','time']],encoded_df],axis=1)

Unnamed: 0,sex,smoker,day,time,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,Female,No,Sun,Dinner,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,Male,No,Sun,Dinner,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,Male,No,Sun,Dinner,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,Male,No,Sun,Dinner,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Female,No,Sun,Dinner,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,Male,No,Sat,Dinner,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,Female,Yes,Sat,Dinner,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,Male,Yes,Sat,Dinner,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,Male,No,Sat,Dinner,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


#### 2. Label Encoding / Ordinal Encoding

In [34]:
df = pd.DataFrame({
    'colors': ['red','blue','green','green','red','blue']
})

In [35]:
df

Unnamed: 0,colors
0,red
1,blue
2,green
3,green
4,red
5,blue


In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
encoder = LabelEncoder()

In [39]:
lbl_encoded = encoder.fit_transform(df[['colors']])

  y = column_or_1d(y, warn=True)


In [40]:
lbl_encoded

array([2, 0, 1, 1, 2, 0])

In [41]:
encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [42]:
encoder.transform([['blue']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([0])

#### Ordinal encoding for ranks

In [44]:
from sklearn.preprocessing import OrdinalEncoder

In [46]:
df = pd.DataFrame({
    'size': ['small','medium','large','medium','small','large']
})

In [47]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [48]:
od_encoder = OrdinalEncoder(categories=[['small','medium','large']])

In [49]:
od_encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [50]:
od_encoder.transform([['small']])



array([[0.]])

#### 4. Target Guided Ordinal Encoding

In [51]:
df = pd.DataFrame({
    'city' : ['New York','London','Paris','Tokyo','New York','Paris'],
    'price' : [200,150,300,250,180,320]
})

In [52]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [55]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [58]:
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [59]:
df['city_encoded'] = df['city'].map(mean_price)

In [61]:
df[['price','city_encoded']]

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0


In [62]:
# Practice

In [63]:
df = sns.load_dataset('tips')

In [64]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [67]:
total_bill_mean = df.groupby(df['time'])['total_bill'].mean().to_dict()

In [69]:
df['time_encoded'] = df['time'].map(total_bill_mean)

In [70]:
df[['time_encoded','total_bill']]

Unnamed: 0,time_encoded,total_bill
0,20.797159,16.99
1,20.797159,10.34
2,20.797159,21.01
3,20.797159,23.68
4,20.797159,24.59
...,...,...
239,20.797159,29.03
240,20.797159,27.18
241,20.797159,22.67
242,20.797159,17.82
