In [1]:
# import NumPy for numerical operations
import numpy as np

# import pandas for data manipulation
import pandas as pd

In [2]:
# read cars.csv (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [3]:
# peek at the top 5 rows of the loaded data
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# how many distinct brands appear in the data?
df.loc[:, 'brand'].nunique()

32

In [5]:
# frequency of every fuel type, most common first
df['fuel'].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


In [6]:
# frequency of every ownership category, most common first
df.loc[:, 'owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


## **1. OneHotEncoding using Pandas**

In [7]:
# expand 'fuel' and 'owner' into one indicator column per category;
# the remaining columns are carried over as they are
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## **2. (k-1) OneHotEncoding**

In [8]:
# same encoding as before, but with drop_first=True the first category of
# each encoded column is omitted (k-1 indicators for k categories), which
# avoids perfectly collinear dummy columns
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


## **3. OneHotEncoding using Scikit-learn**

It's better to use OneHotEncoder from scikit-learn because, unlike pandas' `get_dummies`, it remembers which binary vector it assigned to each category, so exactly the same encoding can be applied to the test data later.

In [9]:
# helper that splits data into train and test partitions
from sklearn.model_selection import train_test_split

# features: the first four columns; target: the final column.
# 20% of the rows are held out for testing, and the fixed seed makes the
# split repeatable from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :4], df.iloc[:, -1], test_size=0.2, random_state=2
)

In [10]:
# peek at the top 5 rows of the training features
x_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [11]:
# import OneHotEncoder to convert categorical variables into binary (one-hot) format
from sklearn.preprocessing import OneHotEncoder

In [12]:
# encoder settings:
#   drop='first'         -> omit the first category of each feature (avoids the dummy variable trap)
#   sparse_output=False  -> return a dense array rather than a sparse matrix,
#                           which is easier to join to other arrays with hstack
#   dtype=np.int32       -> integer indicators instead of the default float
ohe = OneHotEncoder(
    dtype=np.int32,
    sparse_output=False,
    drop='first',
)

In [13]:
# learn the categories of 'fuel' and 'owner' from the training rows,
# then encode those same rows
x_train_new = (
    ohe
    .fit(x_train[['fuel', 'owner']])
    .transform(x_train[['fuel', 'owner']])
)

# encode the test rows with the already-fitted encoder (no refitting)
x_test_new = ohe.transform(x_test[['fuel', 'owner']])

In [14]:
# display the one-hot encoded training array
# (NumPy abbreviates the middle rows and columns with '...')
x_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [15]:
# (rows, encoded columns) of the one-hot encoded training data
np.shape(x_train_new)

(6502, 7)

In [16]:
# place the untouched 'brand' (text) and 'km_driven' (numeric) columns side
# by side with the one-hot encoded columns; mixing text and numbers makes
# the combined array object-dtype
np.concatenate(
    (x_train[['brand', 'km_driven']].to_numpy(), x_train_new),
    axis=1,
)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

## **4. OneHotEncoding with Top Categories**

In [17]:
# frequency of every brand, most common first
df.loc[:, 'brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [18]:
# keep the per-brand frequencies for the rare-brand filtering below
counts = df['brand'].value_counts(sort=True, ascending=False)

In [19]:
# frequency cutoff for rare brands: a brand whose count is at or below this
# value is treated as rare
threshold = 100

In [20]:
# brand names whose frequency is at or below the threshold
repl = counts.loc[counts.le(threshold)].index

In [21]:
# collapse the rare brands into a single 'uncommon' label, one-hot encode
# the result, and preview 5 randomly chosen rows; random_state pins the
# sample so rerunning the notebook shows the same rows
pd.get_dummies(df['brand'].replace(repl, 'uncommon')).sample(5, random_state=2)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
1598,False,False,False,False,True,False,False,False,False,False,False,False,False
3716,False,False,False,False,False,False,False,False,False,True,False,False,False
3033,False,False,False,False,True,False,False,False,False,False,False,False,False
239,False,False,False,False,False,True,False,False,False,False,False,False,False
6130,False,False,False,False,False,False,False,False,False,True,False,False,False
