In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("mercedes.csv")

In [3]:
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


Taking first six columns in the dataframe for one-hot encoding

In [4]:
new_columns = df.columns[2:8]
df_edit = df[new_columns]

In [5]:
df_edit.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5
0,k,v,at,a,d,u
1,k,t,av,e,d,y
2,az,w,n,c,d,x
3,az,t,n,f,d,x
4,az,v,n,f,d,h


Getting all the necessary statistics for the categorical columns

In [6]:
print("Following is the list of columns and the number of unique values present in them")
for col in df_edit.columns:
    print(f'{col} : {len(df_edit[col].unique())}')

Following is the list of columns and the number of unique values present in them
X0 : 47
X1 : 27
X2 : 44
X3 : 7
X4 : 4
X5 : 29


for One-hot encoding we can use either pd.dummies() method of the pandas library or use sklearn.preprocessing's OneHotEncoder() class

One-hot encoding using pd.get_dummies() method

In [7]:
pd.get_dummies(df_edit, drop_first=True).astype('int64')

Unnamed: 0,X0_aa,X0_ab,X0_ac,X0_ad,X0_af,X0_ai,X0_aj,X0_ak,X0_al,X0_am,...,X5_o,X5_p,X5_q,X5_r,X5_s,X5_u,X5_v,X5_w,X5_x,X5_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4206,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


One-hot encoding using sklearn.preprocessing library

**sparse_output** parameter is by default set to **True** and returns a sparse matrix of type "scipy.sparse.csr_matrix". A sparse matrix is a matrix that stores only non-zero elements and their positions within a dense matrix. It provides computational efficiency and saves memory. Upon performing one-hot encoding a categorical column that has a large number of categories (eg. 100,000), doing text vectorization, or mapping user-item interactions, produces excessive number of columns or matrices with many zeroes, thus using a sparse matrix in large datasets or pipelines that accept sparse input can be an efficient option. Therefore, **sparse_output** parameter can be set to do **False**, if we want to EDA or integrate with a regular pandas dataFrame.

In [13]:
from sklearn.preprocessing import OneHotEncoder
df_ohe = OneHotEncoder(sparse_output=False).fit_transform(df_edit)
df_ohe = pd.DataFrame(df_ohe)

In [14]:
df_ohe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,148,149,150,151,152,153,154,155,156,157
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### One-hot encoding could be a one of the methods to do feature engineering on nominal variables or variables where categories do not have an inherent order
--- 

### Ordinal Encoding

#### Encoding a dataset containing dates and day names of the next 30 days 

In [20]:
import datetime as dt

In [44]:
today = dt.datetime.strptime(
    dt.datetime.strftime(
        dt.datetime.today(), 
        format = "%Y-%m-%d %H%M"), "%Y-%m-%d %H%M")

list_of_next_30_days = [today+dt.timedelta(days = i) for i in range(1,31)]
list_of_day_names = [date.strftime('%A').lower() for date in list_of_next_30_days]

date_df = pd.DataFrame(data = {'date':list_of_next_30_days,
                               'day':list_of_day_names})


In [38]:
date_df.head()

Unnamed: 0,date,day
0,2025-06-06 18:42:00,friday
1,2025-06-07 18:42:00,saturday
2,2025-06-08 18:42:00,sunday
3,2025-06-09 18:42:00,monday
4,2025-06-10 18:42:00,tuesday


In [None]:
date_df['day_ordinal'] = date_df['day'].map({
                                        'monday' : 1,
                                        'tuesday' : 2,
                                        'wednesday' : 3,
                                        'thursday' : 4,
                                        'friday' : 5,
                                        'saturday' : 6,
                                        'sunday' : 7
                                    })

In [46]:
date_df.head()

Unnamed: 0,date,day,day_ordinal
0,2025-06-06 18:45:00,friday,5
1,2025-06-07 18:45:00,saturday,6
2,2025-06-08 18:45:00,sunday,7
3,2025-06-09 18:45:00,monday,1
4,2025-06-10 18:45:00,tuesday,2


### Frequency Encoding

In [50]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' , header = None,index_col=None)

In [51]:
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
