# Data Encoding

1. Nominal/OHE Encoding
2. Label and Ordinal Encoding
3. Target Guided Ordinal Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
#Create a simple dataframe
df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'green', 'red', 'blue' ]
})

In [3]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [4]:
#create an instance of OneHOtEncoder
encoder = OneHotEncoder()

In [10]:
encoded = encoder.fit_transform(df[['color']])

In [11]:
type(encoded)

scipy.sparse._csr.csr_matrix

In [14]:
encoded = encoded.toarray()

In [15]:
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [16]:
import pandas as pd
encoder_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

In [17]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [18]:
encoder_df.T

Unnamed: 0,0,1,2,3,4,5
color_blue,0.0,1.0,0.0,0.0,0.0,1.0
color_green,0.0,0.0,1.0,1.0,0.0,0.0
color_red,1.0,0.0,0.0,0.0,1.0,0.0


In [19]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [20]:
## for new data
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [22]:
# to avoid warnings
import warnings
warnings.filterwarnings('ignore')

In [23]:
encoder.transform([['blue']]).toarray()

array([[1., 0., 0.]])

In [25]:
#concat the original and encoded df
pd.concat([df, encoder_df], axis = 1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


# One Hot Encoding Practice

In [29]:
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
#create an instance of the class
my_encoder = OneHotEncoder()

In [68]:
my_encoded_sex = my_encoder.fit_transform(df[['sex']])
my_encoded_smoker = my_encoder.fit_transform(df[['smoker']])
my_encoded_day = my_encoder.fit_transform(df[['day']])
my_encoded_time = my_encoder.fit_transform(df[['time']])

In [71]:
my_encoded_day_array = my_encoded_day.toarray()

In [89]:
# my_encoded_day_array

In [73]:
my_encoded_sex_array = my_encoded_sex.toarray()
my_encoded_day_array = my_encoded_day.toarray()
my_encoded_time_array = my_encoded_time.toarray()

In [88]:
# my_encoded_day_array

In [86]:
import pandas as pd
# my_encoded_sex_df = pd.DataFrame(my_encoded_sex_array, columns=my_encoder.get_feature_names_out())
my_encoded_day_df = pd.DataFrame(my_encoded_day_array, columns=my_encoder.get_feature_names_out())
# my_encoded_time_df = pd.DataFrame(my_encoded_time_array, columns=my_encoder.get_feature_names_out())

ValueError: Shape of passed values is (244, 4), indices imply (244, 2)

In [83]:
# my_encoded_sex_df

In [85]:
my_encoded_day_df

NameError: name 'my_encoded_day_df' is not defined

In [44]:
type(my_encoded_data)  # This gives a sparse matrix

scipy.sparse._csr.csr_matrix

In [48]:
my_encoded_data.toarray()

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [59]:
#convert to data frame
import pandas as pd
my_encoded_sex_df = pd.DataFrame(my_encoded_sex, columns = encoder.get_feature_names_out())

ValueError: Shape of passed values is (244, 1), indices imply (244, 3)

# Label Encoding

Label encoding and ordinal encoding are two techniques used to encode categorical data as numerical data.

Label encoding involves assigning a unique label to each category in the variable. The labels are usually assigned in alphabetical order or basedf on the frequency of the categories.

In [92]:
df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'green', 'red']
})

In [93]:
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [95]:
from sklearn.preprocessing import LabelEncoder

#create an instance for the LabelEncoder class
lbl_encoder = LabelEncoder()

In [97]:
lbl_encoder.fit_transform(df['color'])

array([2, 0, 1, 1, 2])

In [102]:
## for new data or unseen data
lbl_encoder.transform([['red']])

array([2])

In [103]:
lbl_encoder.transform([['blue']])

array([0])

In [104]:
lbl_encoder.transform([['green']])

array([1])

# Ordinal Encoding

It is used to encode categorical data that have an intrinsic order or ranking. In this technique, each category is assigned a numerical value based on its position in the order.

- Example:
If we have a categorical variable, "education level" with four possible values(high school, college, graduate,  post-graduate), we can represent it using ordinal encoding as follows:

In [114]:
#Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder

In [115]:
#create an instance of the class
ord_encoder = OrdinalEncoder(categories = [['small', 'medium', 'large']])

In [116]:
#create a simple dataframe
df = pd.DataFrame({
    'size': ['small', 'medium', 'large', 'medium', 'small', 'large']
})

In [117]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [119]:
ord_encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [120]:
#for new data or unseen data
ord_encoder.transform([['large']])

array([[2.]])

In [121]:
ord_encoder.transform([['medium']])

array([[1.]])

In [122]:
ord_encoder.transform([['small']])

array([[0.]])

In [3]:
!jt -t chesterish -T -N -kl