#### Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')

#### Importing the Dataset

In [2]:
# Load cars.csv (resolved relative to the working directory) and preview its first five rows.
cars = pd.read_csv(filepath_or_buffer='cars.csv')
cars.head(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


#### ONEHOT ENCODING USING PANDAS

In [3]:
# One-hot encode both categorical columns, keeping every level (k dummies per column).
# dtype=int pins the indicator columns to 0/1 integers; pandas >= 2.0 would
# otherwise return booleans (True/False) by default.
pd.get_dummies(cars, columns=['fuel', 'owner'], dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


#### ONEHOT ENCODING USING PANDAS (K - 1 DUMMIES)

In [4]:
# One-hot encode both categorical columns but drop the first level of each
# (k - 1 dummies), so the dropped level is represented by all-zero indicators.
# dtype=int pins the indicator columns to 0/1 integers; pandas >= 2.0 would
# otherwise return booleans (True/False) by default.
pd.get_dummies(cars, columns=['fuel', 'owner'], drop_first=True, dtype=int)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


#### ONEHOT ENCODING USING SCIKIT LEARN

#### Splitting the data into train and test data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    cars.iloc[:, :4],   # predictors: the first four columns
    cars.iloc[:, -1],   # target: the last column
    test_size=0.2,
    random_state=2,
)

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Encode the two categorical predictors, dropping the first level of each (k - 1 dummies).
# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing it raises TypeError on current
# releases, while this form works on every version.
encoder = OneHotEncoder(drop='first', dtype=np.int32)

# Fit on the training split only; the test split reuses the categories learned there.
x_train_new = encoder.fit_transform(x_train[['fuel', 'owner']]).toarray()
x_test_new = encoder.transform(x_test[['fuel', 'owner']]).toarray()

In [7]:
# Place the untouched columns next to the encoded indicators, column-wise.
# Mixing strings with integers yields an object-dtype array.
np.concatenate(
    [x_train[['brand', 'km_driven']].to_numpy(), x_train_new],
    axis=1,
)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

#### ONEHOT ENCODING WITH TOP CATEGORIES

In [13]:
# Number of rows per brand, most frequent brand first.
counts = cars['brand'].value_counts(sort=True, ascending=False)

In [12]:
# Cut-off used to decide which brands are frequent enough to keep their own category.
threshold = 120

# Show how many distinct brands exist. This must be the cell's last expression:
# a bare expression followed by another statement is evaluated but never displayed.
cars['brand'].nunique()

In [15]:
# Labels of the brands whose row count does not exceed the threshold.
change = counts.loc[counts.le(threshold)].index

In [19]:
# Collapse the infrequent brands into a single 'Rares' level, one-hot encode the
# result, and preview five random rows.
# dtype=int pins the indicator columns to 0/1 integers; pandas >= 2.0 would
# otherwise return booleans (True/False) by default.
pd.get_dummies(cars['brand'].replace(change, 'Rares'), dtype=int).sample(5)

Unnamed: 0,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Rares,Renault,Tata,Toyota,Volkswagen
3983,1,0,0,0,0,0,0,0,0,0,0
8123,0,0,0,1,0,0,0,0,0,0,0
6939,0,0,0,0,0,1,0,0,0,0,0
1875,0,0,0,0,0,0,0,0,1,0,0
8077,0,0,0,0,0,0,0,0,0,1,0
