# One-Hot-Encoding

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the car listings from cars.csv (in the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="cars.csv")

In [4]:
# Preview the first five rows.
# Nominal (unordered categorical) columns in this data: fuel and brand.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [5]:
# Number of rows in each 'owner' category, most frequent first
df.loc[:, "owner"].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

# 1. One Hot Encoding using Pandas

In [6]:
# Expand 'fuel' and 'owner' into one indicator column per category;
# the remaining columns are passed through unchanged.
pd.get_dummies(
    data=df,
    columns=["fuel", "owner"],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# 2. K-1 One Hot Encoding

In [7]:
# Same encoding, but drop the first category of each feature so that each
# feature with k categories yields k-1 indicator columns.
pd.get_dummies(
    data=df,
    columns=["fuel", "owner"],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 3. One Hot Encoding using sklearn

In [11]:
# Features are the first four columns; the target is the last column
X = df.iloc[:, :4]
y = df.iloc[:, -1]


In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

In [15]:
# Peek at the first five training rows (the index keeps the original row labels)
X_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [16]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
# Encoder that returns a dense int32 array and drops the first category of
# every encoded feature.
ohe = OneHotEncoder(
    dtype=np.int32,
    sparse_output=False,
    drop="first",
)

In [28]:
# Learn the categories of 'fuel' and 'owner' from the training rows and encode them
X_train_new = ohe.fit_transform(X_train.loc[:, ["fuel", "owner"]])

In [29]:
# Display the encoded training array produced by the previous cell
X_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], shape=(6502, 7), dtype=int32)

In [30]:
# Place the untouched 'brand' and 'km_driven' columns side by side with the
# encoded columns; mixing strings and numbers yields an object-dtype array.
np.concatenate(
    (X_train[["brand", "km_driven"]].values, X_train_new),
    axis=1,
)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], shape=(6502, 9), dtype=object)

# 4. One Hot Encoding with Top Categories

In [33]:
# Number of rows per brand, most frequent first. Computed once and kept in
# `counts` for the cells below (the earlier unassigned duplicate call was
# discarded without being displayed, so it has been removed).
counts = df["brand"].value_counts()

In [34]:
# Count of distinct brand values
df.loc[:, "brand"].nunique()

32

In [35]:
# Brand-frequency cut-off; presumably brands with at most this many rows are
# the ones treated as rare in the cells below.
threshold = 100

In [36]:
# Boolean mask over brands: True where the brand has at most `threshold` rows.
# Uses the `threshold` variable instead of repeating the literal 100.
counts <= threshold

brand
Maruti           False
Hyundai          False
Mahindra         False
Tata             False
Toyota           False
Honda            False
Ford             False
Chevrolet        False
Renault          False
Volkswagen       False
BMW              False
Skoda            False
Nissan            True
Jaguar            True
Volvo             True
Datsun            True
Mercedes-Benz     True
Fiat              True
Audi              True
Lexus             True
Jeep              True
Mitsubishi        True
Force             True
Land              True
Isuzu             True
Kia               True
Ambassador        True
Daewoo            True
MG                True
Ashok             True
Opel              True
Peugeot           True
Name: count, dtype: bool

In [38]:
# Names of the brands with at most `threshold` rows (uses the variable rather
# than a repeated literal so the cut-off is changed in one place).
counts[counts <= threshold].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [39]:
# Brands to be replaced by a single label: those with at most `threshold` rows.
# Uses the `threshold` variable instead of a hard-coded 100.
repl = counts[counts <= threshold].index

In [40]:
# Collapse every brand listed in `repl` into the single label 'uncommon',
# one-hot encode the result, and show five randomly chosen rows.
pd.get_dummies(
    df["brand"].replace(to_replace=repl, value="uncommon")
).sample(n=5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
7145,False,False,False,False,False,False,False,False,False,True,False,False,False
6189,False,False,False,False,False,False,False,False,False,True,False,False,False
4112,False,False,False,False,False,False,False,False,False,True,False,False,False
723,False,False,False,False,False,False,False,False,False,True,False,False,False
6233,False,True,False,False,False,False,False,False,False,False,False,False,False


# One Hot Encoding - Simple Example

In [47]:
import pandas as pd

# Small hand-written dataset: two categorical columns and one numeric column
data = {
    "Fuel": ["Petrol", "Diesel", "CNG", "Petrol", "Diesel"],
    "Owner": ["First", "Second", "First", "Third", "Second"],
    "Mileage": [15, 18, 20, 14, 19],
}

# Note: this rebinds `df`, replacing the DataFrame loaded earlier in the notebook
df = pd.DataFrame(data=data)
print(df)

     Fuel   Owner  Mileage
0  Petrol   First       15
1  Diesel  Second       18
2     CNG   First       20
3  Petrol   Third       14
4  Diesel  Second       19


In [48]:
from sklearn.preprocessing import OneHotEncoder

# Build the encoder: dense output, with the first category of each feature
# dropped (pass drop=None instead to keep a column for every category).
ohe = OneHotEncoder(sparse_output=False, drop="first")

# Learn the categories of Fuel and Owner and encode them in a single step
encoded_array = ohe.fit_transform(df.loc[:, ["Fuel", "Owner"]])

In [49]:
# Wrap the encoded array in a DataFrame, labelling each column with the
# "<feature>_<category>" name reported by the fitted encoder.
encoded_df = pd.DataFrame(
    data=encoded_array,
    columns=ohe.get_feature_names_out(input_features=["Fuel", "Owner"]),
)

print(encoded_df)

   Fuel_Diesel  Fuel_Petrol  Owner_Second  Owner_Third
0          0.0          1.0           0.0          0.0
1          1.0          0.0           1.0          0.0
2          0.0          0.0           0.0          0.0
3          0.0          1.0           0.0          1.0
4          1.0          0.0           1.0          0.0


In [46]:
# Put the numeric Mileage column next to the encoded columns (aligned on the row index)
final_df = pd.concat(
    objs=[df.loc[:, ["Mileage"]], encoded_df],
    axis=1,
)
print(final_df)

   Mileage  Fuel_Diesel  Fuel_Petrol  Owner_Second  Owner_Third
0       15          0.0          1.0           0.0          0.0
1       18          1.0          0.0           1.0          0.0
2       20          0.0          0.0           0.0          0.0
3       14          0.0          1.0           0.0          1.0
4       19          1.0          0.0           1.0          0.0
