In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the cars dataset and preview three randomly chosen rows.
csv_path = 'cars.csv'
df = pd.read_csv(csv_path)
df.sample(n=3)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
3550,Fiat,100000,Diesel,First Owner,360000
7642,Chevrolet,70000,LPG,First Owner,300000
1942,Maruti,110000,Petrol,Second Owner,200000


In [6]:
# Report how many distinct brands exist, then show the four most frequent ones.
brand_col = df['brand']
print('Total Brands:', brand_col.nunique())
brand_col.value_counts().head(n=4)

Total Brands: 32


brand
Maruti      2448
Hyundai     1415
Mahindra     772
Tata         734
Name: count, dtype: int64

## **OneHotEncoding using Pandas**

In [8]:
# One-hot encode the two categorical columns; every category gets its own
# indicator column, and the remaining columns are passed through unchanged.
pd.get_dummies(
    data=df,
    columns=[
        'fuel',
        'owner',
    ],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## **N-1 OneHotEncoding**

In [9]:
# N-1 encoding: same as the full one-hot encoding, but the first category of
# each encoded column is dropped, so k categories yield k-1 indicator columns.
pd.get_dummies(
    data=df,
    columns=[
        'fuel',
        'owner',
    ],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


## **OneHotEncoding using scikit-learn**

In [10]:
from sklearn.model_selection import train_test_split

# Features are the first four columns (brand, km_driven, fuel, owner);
# the target is the last column (selling_price). 30% of rows go to the test set.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same downstream outputs).
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.3,
    random_state=42,
)

In [11]:
# Peek at the first two rows of the training features.
X_train.head(n=2)

Unnamed: 0,brand,km_driven,fuel,owner
2686,Mahindra,100000,Diesel,Second Owner
78,Tata,70000,Diesel,First Owner


In [13]:
from sklearn.preprocessing import OneHotEncoder

# drop='first'        -> drop the first category of each feature (N-1 encoding)
# sparse_output=False -> return a dense ndarray directly, so no explicit
#                        .toarray() call is needed (this parameter was named
#                        `sparse` before scikit-learn 1.2; the old name was
#                        removed in 1.4)
# dtype=np.int32      -> emit int32 indicator values instead of the default floats
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

In [14]:
# Learn the categories from the training split only and encode it in one step.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])

# Encode the test split with the already-fitted encoder. Calling `fit` here
# would re-learn the categories from the test data and return the encoder
# object itself rather than an encoded array.
X_test_new = ohe.transform(X_test[['fuel', 'owner']])



In [17]:
# Print the (rows, columns) shape of the encoded training array, then display it.
print(np.shape(X_train_new))
X_train_new

(5689, 7)


array([[1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]], dtype=int32)

In [20]:
# Raw NumPy view of the columns that were not one-hot encoded.
X_train[['brand', 'km_driven']].to_numpy()

array([['Mahindra', 100000],
       ['Tata', 70000],
       ['Maruti', 28000],
       ...,
       ['Honda', 25000],
       ['Hyundai', 52000],
       ['Chevrolet', 80000]], dtype=object)

In [19]:
# Place the untouched columns (brand, km_driven) side by side with the
# one-hot encoded columns by joining the two 2-D arrays along the column axis.
np.concatenate(
    (X_train[['brand', 'km_driven']].values, X_train_new),
    axis=1,
)

array([['Mahindra', 100000, 1, ..., 1, 0, 0],
       ['Tata', 70000, 1, ..., 0, 0, 0],
       ['Maruti', 28000, 0, ..., 0, 0, 0],
       ...,
       ['Honda', 25000, 0, ..., 1, 0, 0],
       ['Hyundai', 52000, 0, ..., 0, 0, 0],
       ['Chevrolet', 80000, 1, ..., 1, 0, 0]], dtype=object)

## **OneHotEncoding with Top Categories**

In [22]:
# Cut-off used to decide which brands are too rare to keep as their own category.
threshold = 100
# Number of rows per brand, most frequent first.
brand_counts = df['brand'].value_counts()

In [23]:
# Names of the brands that occur at most `threshold` times in the dataset;
# these are the ones that will be grouped together.
repl = brand_counts.loc[brand_counts <= threshold].index

In [25]:
# Collapse every rare brand into a single 'Uncommon' label, one-hot encode the
# result, and show four randomly chosen rows.
(
    df['brand']
    .replace(repl, 'Uncommon')
    .pipe(pd.get_dummies)
    .sample(n=4)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Uncommon,Volkswagen
6794,False,False,False,False,False,False,False,False,False,True,False,False,False
2755,False,False,False,False,False,False,True,False,False,False,False,False,False
3844,False,False,False,False,True,False,False,False,False,False,False,False,False
4091,False,False,False,False,False,False,False,False,False,False,False,True,False
