In [22]:
import pandas as pd

# Single place to change the input file; the path is relative to the
# notebook's working directory.
DATA_PATH = "dataset.csv"

# Raw records; later cells work on copies so this frame stays unmodified.
df = pd.read_csv(DATA_PATH)

## Quick Data Check

- Confirm schema, missing values, and initial records before encoding
- Keeps transformations aligned with the current dataset state

In [23]:
# Schema, dtypes and non-null counts (info() writes straight to stdout).
df.info()

# Per-column null counts, printed under a heading separated by a blank line.
missing_per_column = df.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Last expression of the cell: rich preview of the first five records.
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               15 non-null     int64 
 1   City             15 non-null     object
 2   Color            15 non-null     object
 3   Education        15 non-null     object
 4   Neighborhood     15 non-null     object
 5   ProductCategory  15 non-null     object
 6   Channel          15 non-null     object
 7   Purchased        15 non-null     object
dtypes: int64(1), object(7)
memory usage: 1.1+ KB

Missing values per column:
ID                 0
City               0
Color              0
Education          0
Neighborhood       0
ProductCategory    0
Channel            0
Purchased          0
dtype: int64


Unnamed: 0,ID,City,Color,Education,Neighborhood,ProductCategory,Channel,Purchased
0,1,Delhi,Red,High School,ZoneA,Electronics,Online,Yes
1,2,Mumbai,Blue,Bachelors,ZoneC,Groceries,Retail,No
2,3,Bangalore,Green,Masters,ZoneB,Clothing,Online,Yes
3,4,Delhi,Blue,PhD,ZoneA,Furniture,Agent,No
4,5,Chennai,Red,Bachelors,ZoneD,Electronics,Online,Yes


# Label Encoding

- Maps each class label to a unique integer
- The integer codes imply an ordering that the labels do not have, so restrict this to encoding target labels (not input features)
- Simple baseline when target is categorical

In [24]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target column; classes are sorted, so "No" -> 0
# and "Yes" -> 1 (see le.classes_ for the fitted order).
le = LabelEncoder()

# assign() returns a new frame with the encoded column appended, leaving
# `df` untouched.
df_label = df.assign(Purchased_encoded=le.fit_transform(df["Purchased"]))

# Original label next to its integer code.
df_label.loc[:, ["Purchased", "Purchased_encoded"]].head()

Unnamed: 0,Purchased,Purchased_encoded
0,Yes,1
1,No,0
2,Yes,1
3,No,0
4,Yes,1


# One-Hot Encoding

- Expands nominal features into binary indicator columns
- Avoids implicit ordering and suits tree-based or linear models
- Drop one column per feature to prevent the dummy variable trap
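- Handle unknown categories at inference to avoid errors; note that with a dropped column, an ignored unknown category is encoded as all zeros, the same as the dropped baseline category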

In [25]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the nominal columns on a copy so `df` stays untouched.
df_ohe = df.copy()

# drop="first" removes one indicator per feature (the baseline category).
# Combined with handle_unknown="ignore", a category unseen during fit is
# encoded as all zeros at transform time, i.e. the same pattern as the
# baseline category.
ohe = OneHotEncoder(sparse_output=False, drop="first", handle_unknown="ignore")
encoded_array = ohe.fit_transform(df_ohe[["Color", "City"]])

# Reuse the source index: pd.concat(axis=1) aligns on index labels, so a
# fresh RangeIndex would misalign rows (and introduce NaN rows) if `df`
# were ever filtered, sorted or otherwise re-indexed before this cell.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=ohe.get_feature_names_out(["Color", "City"]),
    index=df_ohe.index,
)
df_ohe = pd.concat([df_ohe, encoded_df], axis=1)

df_ohe.head()

Unnamed: 0,ID,City,Color,Education,Neighborhood,ProductCategory,Channel,Purchased,Color_Green,Color_Red,Color_Yellow,City_Bangalore,City_Chennai,City_Delhi,City_Hyderabad,City_Jaipur,City_Kolkata,City_Mumbai,City_Pune,City_Surat
0,1,Delhi,Red,High School,ZoneA,Electronics,Online,Yes,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Mumbai,Blue,Bachelors,ZoneC,Groceries,Retail,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,Bangalore,Green,Masters,ZoneB,Clothing,Online,Yes,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Delhi,Blue,PhD,ZoneA,Furniture,Agent,No,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Chennai,Red,Bachelors,ZoneD,Electronics,Online,Yes,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Ordinal Encoding

- Applies when categories have a meaningful order
- Requires defining the explicit progression of categories
- Preserves rank information for linear and tree-based models

In [26]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit low-to-high progression; a value's position in this list becomes
# its code. One inner list per encoded column.
education_order = [["High School", "Bachelors", "Masters", "PhD"]]
oe = OrdinalEncoder(categories=education_order)

# Encode on a copy so `df` stays untouched; the encoder expects 2-D input,
# hence the double brackets.
df_ordinal = df.copy()
education_codes = oe.fit_transform(df_ordinal[["Education"]])
df_ordinal["Education_ordinal"] = education_codes

# Original level next to its rank.
df_ordinal.loc[:, ["Education", "Education_ordinal"]].head()

Unnamed: 0,Education,Education_ordinal
0,High School,0.0
1,Bachelors,1.0
2,Masters,2.0
3,PhD,3.0
4,Bachelors,1.0


# Target Encoding

- Replaces categories with the average target value within each group
- Compresses high-cardinality features into a single column
- Powerful but prone to overfitting when data is sparse
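- Mitigate leakage with cross-validation or smoothing; the cell below uses the plain in-sample mean for illustration only, and with 15 rows spread over 10 zones many encodings simply reproduce a single row's own label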

In [27]:
# Binary view of the target; labels missing from the map become NaN.
target_map = {"Yes": 1, "No": 0}
df_target = df.assign(Purchased_binary=df["Purchased"].map(target_map))

# Mean of the binary target per neighborhood, computed on the full frame
# (naive in-sample target encoding).
purchases_by_neighborhood = df_target.groupby("Neighborhood")["Purchased_binary"]
neighborhood_target_mean = purchases_by_neighborhood.mean()

# Replace each neighborhood with its group mean.
df_target["Neighborhood_target_encoded"] = df_target["Neighborhood"].map(
    neighborhood_target_mean
)

# One row per neighborhood, sorted by name, as a lookup table.
encoding_table = df_target[["Neighborhood", "Neighborhood_target_encoded"]].drop_duplicates()
encoding_table.sort_values("Neighborhood")

Unnamed: 0,Neighborhood,Neighborhood_target_encoded
0,ZoneA,0.666667
2,ZoneB,1.0
1,ZoneC,0.0
4,ZoneD,0.5
8,ZoneE,0.5
9,ZoneF,0.0
11,ZoneG,1.0
12,ZoneH,0.0
13,ZoneI,1.0
14,ZoneJ,0.0
