<a href="https://colab.research.google.com/github/Fliptoss/ML_stuff/blob/main/OneHotEncoding_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the cars dataset from the CSV file on the Colab runtime.
CSV_PATH = '/content/cars.csv'
df = pd.read_csv(CSV_PATH)

In [3]:
# Preview the first five rows to check what was loaded.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


One-hot encoding using pandas

In [6]:
# One-hot encode the two categorical columns; one 0/1 integer column is created
# per category, and the remaining columns are left as they are.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


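Next, we perform K-1 one-hot encoding: for a feature with K categories we keep only K-1 dummy columns, because the dropped category is implied when all of the remaining columns are 0.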

In [9]:
# drop_first=True removes the first dummy column of each encoded feature
# (K-1 encoding) so that the dummy columns do not introduce multicollinearity.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


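The main drawback of `pd.get_dummies` is that pandas does not remember which categories (and therefore which columns) it produced, so the same encoding cannot be reliably re-applied to new data such as a test set. For that reason we switch to scikit-learn's `OneHotEncoder`, which learns the categories during `fit` and reuses them in `transform`.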

In [11]:
# Features are the first four columns by position; the target is selling_price.
features = df.iloc[:, 0:4]
target = df['selling_price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [14]:
# drop='first' omits the first category of each feature (K-1 encoding);
# dtype=np.int32 makes the indicator values 32-bit integers.
ohe = OneHotEncoder(
    drop='first',
    dtype=np.int32,
)

In [15]:
# Fit the encoder on the training split's categorical columns and encode those
# same rows in one step.
categorical_train = X_train[['fuel', 'owner']]
X_train_new = ohe.fit_transform(categorical_train)

In [16]:
# Inspect the encoded training features returned by the encoder.
X_train_new

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 8735 stored elements and shape (6502, 7)>

In [17]:
# Reuse the already-fitted encoder on the test split: transform only, no re-fit.
categorical_test = X_test[['fuel', 'owner']]
X_test_new = ohe.transform(categorical_test)

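Now we need to combine the encoded columns with the remaining features. The encoder returns a sparse matrix rather than a DataFrame, so an ordinary pandas merge is awkward here. Instead, we convert it to a dense array and join the two column-wise using `np.hstack`.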

In [20]:
# Place the non-encoded columns side by side with the densified one-hot columns.
passthrough_train = X_train[['brand', 'km_driven']].values
encoded_train = X_train_new.toarray()
np.hstack((passthrough_train, encoded_train))

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

In [23]:
# Brands whose row count is below a threshold should be grouped into a single
# "uncommon" category; start by counting the rows for each brand.
brand_column = df['brand']
counts = brand_column.value_counts()

In [24]:
# Show the per-brand row counts computed above.
counts

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [27]:
# Row-count cutoff: brands with fewer rows than this are treated as uncommon.
threshold = 100

# Number of distinct brands. Kept as the cell's last expression so the notebook
# displays it; on a non-final line the value is computed and then discarded.
df['brand'].nunique()

In [34]:
# Brands whose row count falls below the threshold.
replace = counts[counts < threshold].index

# Collapse those brands into a single 'uncommon' label, then one-hot encode.
brand_grouped = df['brand'].replace(replace, 'uncommon')
brand_dummies = pd.get_dummies(brand_grouped, dtype=int)

# Preview five random rows; the fixed random_state makes the preview reproducible
# across runs instead of showing different rows each time.
brand_dummies.sample(5, random_state=0)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
3528,0,0,0,0,0,0,1,0,0,0,0,0,0
5385,0,0,0,0,0,1,0,0,0,0,0,0,0
1810,0,0,0,0,0,1,0,0,0,0,0,0,0
6577,0,0,0,0,0,0,0,0,0,0,0,0,1
7215,0,0,0,0,0,0,0,0,0,1,0,0,0
