# **One Hot Encoding**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the cars dataset from the project's files directory.
data = pd.read_csv(filepath_or_buffer="../files/cars.csv")

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# Frame dimensions alongside the number of distinct brands.
(data.shape, data["brand"].nunique())

((8128, 5), 32)

**OHE using Pandas**

In [5]:
# One-hot encode the two categorical columns; every level gets its own indicator column.
df = pd.get_dummies(data, columns=["fuel", "owner"])
df.head(n=5)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False


**K-1 OHE**

In [6]:
# Same encoding, but the first level of each encoded column is dropped (k-1 indicators).
df = pd.get_dummies(data, columns=["fuel", "owner"], drop_first=True)
df.head(n=5)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False


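*Pandas `get_dummies()` is a quick way to one-hot encode, but it is not robust for ML: it keeps no record of the categories it has seen, so it cannot handle unseen categories and can produce different columns for the training and test data. In contrast, Scikit-learn’s `OneHotEncoder` learns the categories once with `fit` and reuses them for every later `transform`, can handle unknown values (via `handle_unknown`), and can return a sparse matrix to save memory, making it the better choice for ML pipelines.*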

**OHE USING SCIKIT LEARN**

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Features are the first four columns and the target is the last column;
# 20% of the rows are held out, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :4], data.iloc[:, -1], test_size=0.2, random_state=5
)

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Dense output with the first level of every feature dropped (k-1 encoding).
encoder = OneHotEncoder(
    drop="first",
    sparse_output=False,
)

In [11]:
# Learn the category levels from the training split only, then reuse the
# fitted encoder on the test split. Re-fitting on the test data would leak
# test information into preprocessing and could produce a different number or
# order of columns whenever the two splits do not contain the same categories.
# NOTE: with the encoder's default settings, `transform` raises if the test
# split contains a category that was not seen during `fit`.
X_train_en = encoder.fit_transform(X_train[["fuel", "owner"]])
X_test_en = encoder.transform(X_test[["fuel", "owner"]])

In [12]:
# Place the untouched columns next to the encoded ones in a single array.
np.hstack([X_train[["brand", "km_driven"]].to_numpy(), X_train_en])

array([['Tata', 100000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Mahindra', 200000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Skoda', 11000, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Maruti', 35000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 120000, 0.0, ..., 1.0, 0.0, 0.0],
       ['Maruti', 52000, 1.0, ..., 0.0, 0.0, 0.0]],
      shape=(6502, 9), dtype=object)