In [4]:
import numpy as np
import pandas as pd

In [5]:
# Load the cars dataset from a path relative to the notebook's working directory.
df=pd.read_csv("./datasets/cars.csv")

In [6]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(8128, 5)

In [7]:
# Peek at the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [8]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [9]:
# Missing-value count per column (isna is the canonical spelling of isnull).
df.isna().sum()

brand            0
km_driven        0
fuel             0
owner            0
selling_price    0
dtype: int64

In [10]:
# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

1678

In [11]:
# Keep only the first occurrence of each fully duplicated row.
# The name is rebound instead of using inplace=True: the result is the same,
# but the step stays chainable and does not mutate an object other cells may
# still reference. Row labels are not reset, so the index has gaps afterwards.
df = df.drop_duplicates()

In [12]:
# Confirm the frame's size after de-duplication.
df.shape

(6450, 5)

In [13]:
# Summary statistics of the numeric columns, rounded to one decimal place.
df.describe().round(1)

Unnamed: 0,km_driven,selling_price
count,6450.0,6450.0
mean,74469.0,524229.7
std,59633.4,534126.7
min,1.0,29999.0
25%,39000.0,250000.0
50%,70000.0,409999.0
75%,100000.0,640000.0
max,2360457.0,10000000.0


In [14]:
# Frequency of each brand, most common first; note the long tail of rare brands.
df["brand"].value_counts()

Maruti           1954
Hyundai          1176
Mahindra          648
Tata              599
Honda             354
Ford              352
Toyota            349
Chevrolet         212
Renault           192
Volkswagen        170
Nissan             73
Skoda              69
Datsun             55
Mercedes-Benz      46
BMW                46
Fiat               44
Audi               33
Jeep               22
Mitsubishi         11
Volvo               9
Jaguar              8
Isuzu               4
Ambassador          4
Force               4
Kia                 3
Land                3
Daewoo              3
MG                  3
Ashok               1
Lexus               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [15]:
# Frequency of each fuel type (four distinct categories).
df["fuel"].value_counts()

Diesel    3519
Petrol    2838
CNG         55
LPG         38
Name: fuel, dtype: int64

In [16]:
# Frequency of each ownership category (five distinct categories).
df["owner"].value_counts()

First Owner             3916
Second Owner            1851
Third Owner              511
Fourth & Above Owner     167
Test Drive Car             5
Name: owner, dtype: int64

# Split the data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out a test set using scikit-learn's default test fraction (25%).
# Features and target are selected by name rather than by position, so a
# reordered CSV cannot silently change what is being predicted.
# random_state pins the shuffle: without it every kernel restart yields a
# different split, and every downstream output changes with it.
feature_cols = ["brand", "km_driven", "fuel", "owner"]
target_col = "selling_price"
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[[target_col]],  # double brackets keep y as a one-column DataFrame
    random_state=42,
)

In [19]:
# Sanity-check the row/column counts of the feature and target splits.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((4837, 4), (1613, 4), (4837, 1), (1613, 1))

# One Hot Encoding using pandas

In [20]:
# Dummy-encode the two low-cardinality categoricals: every category of
# `fuel` and `owner` gets its own indicator column; other columns pass through.
pd.get_dummies(data=df, columns=["fuel", "owner"])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8121,Maruti,50000,260000,0,0,0,1,0,0,1,0,0
8122,Hyundai,80000,475000,0,1,0,0,0,0,1,0,0
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0


# (k-1) hot encoding

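If a column has k categories, this variant of one-hot encoding creates only (k-1) new columns: the first category is dropped and is represented implicitly by all of the remaining indicator columns being 0. This avoids perfectly collinear dummy columns (the "dummy variable trap").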

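We usually avoid one-hot encoding through pandas (`pd.get_dummies`) in machine-learning projects because it doesn't remember which categories it has seen, nor the position of the resulting columns in the DataFrame. Every call derives the columns from whatever data it is given, so if we perform the encoding several times (for example on the training set, the test set, and later on new data), the set and position of the columns can be different each time.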

In [21]:
# Same encoding, but drop_first=True removes the first category of each
# column, leaving k-1 indicator columns per k-category feature.
pd.get_dummies(data=df, columns=["fuel", "owner"], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8121,Maruti,50000,260000,0,0,1,0,1,0,0
8122,Hyundai,80000,475000,1,0,0,0,1,0,0
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0


# one-hot-encoding through sklearn

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Encoder that drops the first category of every feature (k-1 encoding).
# NOTE(review): handle_unknown is left at its default, so a category that
# appears only at transform time would raise -- confirm that is intended.
ohe = OneHotEncoder(drop="first")

In [43]:
# Inspect the training features before any encoding is applied.
X_train

Unnamed: 0,brand,km_driven,fuel,owner
532,Maruti,40000,Petrol,Second Owner
179,Hyundai,70000,Petrol,First Owner
4150,Tata,90000,Diesel,Third Owner
2222,Hyundai,42000,Petrol,First Owner
3180,Tata,40000,Petrol,First Owner
...,...,...,...,...
5549,Mahindra,60000,Diesel,Second Owner
7742,Hyundai,20000,Petrol,First Owner
4628,Maruti,60000,Petrol,Third Owner
2706,Ford,60000,Petrol,Second Owner


In [46]:
# The two categorical columns that will be fed to the encoder.
# (The bare `X_train` line that used to precede this was a no-op: only a
# cell's last expression is displayed.)
X_train[["fuel", "owner"]]

Unnamed: 0,fuel,owner
532,Petrol,Second Owner
179,Petrol,First Owner
4150,Diesel,Third Owner
2222,Petrol,First Owner
3180,Petrol,First Owner
...,...,...
5549,Diesel,Second Owner
7742,Petrol,First Owner
4628,Petrol,Third Owner
2706,Petrol,Second Owner


In [47]:
# Learn the category vocabulary from the training split only and encode it
# in a single call; the encoder output is then densified with .toarray().
categorical_train = X_train.iloc[:, 2:4]
X_train_new = ohe.fit_transform(categorical_train).toarray()

In [48]:
# Encode the test split with the already-fitted encoder (transform only --
# the encoder is never re-fitted on test data).
categorical_test = X_test.iloc[:, 2:4]
X_test_new = ohe.transform(categorical_test).toarray()

In [49]:
# Shape of the first two (non-encoded) columns once converted to a NumPy array.
X_train.iloc[:,:2].values.shape

(4837, 2)

In [52]:
# Join the two untouched leading columns with the one-hot block, column-wise.
# This rebinds X_train from a DataFrame to an ndarray, so an unguarded re-run
# of the cell would fail on `.iloc`; the isinstance check makes the cell safe
# to execute more than once.
if isinstance(X_train, pd.DataFrame):
    X_train = np.hstack((X_train.iloc[:, :2].values, X_train_new))

In [53]:
# Shape of the assembled training matrix: passthrough columns plus one-hot columns.
X_train.shape

(4837, 9)

In [57]:
# Join the two untouched leading columns with the one-hot block, column-wise.
# This rebinds X_test from a DataFrame to an ndarray, so an unguarded re-run
# of the cell would fail on `.iloc`; the isinstance check makes the cell safe
# to execute more than once.
if isinstance(X_test, pd.DataFrame):
    X_test = np.hstack((X_test.iloc[:, :2].values, X_test_new))

In [58]:
# Shape of the assembled test matrix; the column count should match the training matrix.
X_test.shape

(1613, 9)

# One Hot Encoding With Top Categories

In [69]:
# Frequency of every brand, plus the cut-off used to decide which brands are
# "rare". (A discarded `df['brand'].nunique()` expression was removed: its
# value was never displayed or stored.)
counts = df["brand"].value_counts()
threshold = 100  # brands with at most this many rows are treated as rare
counts

Maruti           1954
Hyundai          1176
Mahindra          648
Tata              599
Honda             354
Ford              352
Toyota            349
Chevrolet         212
Renault           192
Volkswagen        170
Nissan             73
Skoda              69
Datsun             55
Mercedes-Benz      46
BMW                46
Fiat               44
Audi               33
Jeep               22
Mitsubishi         11
Volvo               9
Jaguar              8
Isuzu               4
Ambassador          4
Force               4
Kia                 3
Land                3
Daewoo              3
MG                  3
Ashok               1
Lexus               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [75]:
# Labels of the brands whose frequency does not exceed the threshold.
is_rare = counts.le(threshold)
repl = counts.loc[is_rare].index


In [76]:
# Pool every rare brand into a single 'uncommon' label, then one-hot encode
# the result so only the frequent brands get their own column.
brand_grouped = df["brand"].replace(to_replace=repl, value="uncommon")
pd.get_dummies(brand_grouped)

Unnamed: 0,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
8121,0,0,0,0,0,1,0,0,0,0,0
8122,0,0,0,1,0,0,0,0,0,0,0
8123,0,0,0,1,0,0,0,0,0,0,0
8124,0,0,0,1,0,0,0,0,0,0,0
