In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Keep rendered output compact: at most 8 rows and 10 columns per DataFrame.
pd.options.display.max_rows = 8
pd.options.display.max_columns = 10

# Summarise NumPy arrays longer than 15 elements and print floats with
# 3 decimals, without scientific notation.
np.set_printoptions(suppress=True, precision=3, threshold=15)

In [28]:
# Load the car listings; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="cars.csv")

In [29]:
# Display the loaded frame (the display options set earlier truncate it to 8 rows).
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
...,...,...,...,...,...
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000
8127,Tata,25000,Diesel,First Owner,290000


In [30]:
# How often each brand occurs, most frequent first.
df.loc[:, 'brand'].value_counts()

brand
Maruti      2448
Hyundai     1415
Mahindra     772
Tata         734
            ... 
Daewoo         3
Ashok          1
Opel           1
Peugeot        1
Name: count, Length: 32, dtype: int64

In [31]:
# Number of distinct brands (missing values are not counted).
df['brand'].nunique(dropna=True)

32

In [32]:
# List the column labels of the loaded frame.
df.columns

Index(['brand', 'km_driven', 'fuel', 'owner', 'selling_price'], dtype='object')

In [33]:
# How often each fuel type occurs, most frequent first.
df.loc[:, 'fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

# **ONE HOT ENCODING** using pandas

In [34]:


# Expand `fuel` and `owner` into one indicator column per category; the
# remaining columns are passed through unchanged. The result is only displayed,
# so `df` itself is not modified.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,...,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,...,True,False,False,False,False
1,Skoda,120000,370000,False,True,...,False,False,True,False,False
2,Honda,140000,158000,False,False,...,False,False,False,False,True
3,Hyundai,127000,225000,False,True,...,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
8124,Hyundai,119000,135000,False,True,...,False,True,False,False,False
8125,Maruti,120000,382000,False,True,...,True,False,False,False,False
8126,Tata,25000,290000,False,True,...,True,False,False,False,False
8127,Tata,25000,290000,False,True,...,True,False,False,False,False


# **k-1 ONE HOT ENCODING**

In [35]:
# Same encoding, but drop the first category of each encoded column so that
# k categories yield k-1 indicator columns.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)


Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False
8127,Tata,25000,290000,True,False,False,False,False,False,False


## **ONE HOT ENCODING** using scikit-learn

In [36]:
# Re-display the frame: the get_dummies calls above were not assigned, so `df`
# still holds the raw categorical columns.
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
...,...,...,...,...,...
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000
8127,Tata,25000,Diesel,First Owner,290000


In [55]:
from sklearn.model_selection import train_test_split

# Features are the four descriptive columns and the target is the selling price.
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    test_size=0.2,
    random_state=0,
)

In [56]:
# Inspect the training features; rows keep their original index labels after the shuffle.
X_train

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
...,...,...,...,...
3264,Ford,100000,Diesel,Second Owner
1653,Hyundai,90000,Petrol,Second Owner
2607,Volkswagen,90000,Diesel,First Owner
2732,Hyundai,110000,Petrol,First Owner


In [57]:
from sklearn.preprocessing import OneHotEncoder

In [68]:
# For reference, the defaults are OneHotEncoder(drop=None, sparse_output=True):
# every category is kept and the result is a SciPy sparse matrix.
# (The `sparse` parameter was renamed to `sparse_output` in scikit-learn 1.2.)

# drop='first' omits the first category of each feature (k-1 encoding),
# sparse_output=False returns a dense NumPy array directly, and
# dtype=np.int32 produces integer 0/1 columns.
ohe=OneHotEncoder(drop='first', sparse_output=False,dtype=np.int32)

In [69]:
# Fit the encoder on the training split and encode it in one step.
# (If the encoder returned a sparse matrix, chaining .toarray() would densify it.)
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [70]:
# Encode the test split with the categories already learned from X_train.
# Use transform(), not fit_transform(): re-fitting here would rebuild the
# category list from the test rows alone, so a category missing from the test
# split would shift or remove columns and the train/test matrices would no
# longer line up.
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [71]:
# Encoded training matrix: 7 indicator columns (3 for fuel + 4 for owner once
# the first category of each is dropped).
X_train_new

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], shape=(6502, 7), dtype=int32)

In [72]:
# Encoded test matrix; its column count should match X_train_new.
X_test_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]], shape=(1626, 7), dtype=int32)

In [73]:
# Put the untouched columns back in front of the encoded ones. Mixing strings
# with integers makes the stacked array dtype=object.
np.hstack([X_train.loc[:, ['brand', 'km_driven']].values, X_train_new])

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Tata', 150000, 1, ..., 0, 0, 1],
       ['Hyundai', 110000, 1, ..., 1, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]],
      shape=(6502, 9), dtype=object)

In [74]:
# Show the full frame again before working on the `brand` column.
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
...,...,...,...,...,...
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000
8127,Tata,25000,Diesel,First Owner,290000


In [78]:
# Per-brand frequencies, sorted from most to least common.
counts = df['brand'].value_counts(sort=True, ascending=False)

In [82]:
# Inspect the per-brand frequencies.
counts

brand
Maruti      2448
Hyundai     1415
Mahindra     772
Tata         734
            ... 
Daewoo         3
Ashok          1
Opel           1
Peugeot        1
Name: count, Length: 32, dtype: int64

In [84]:
# Look up one brand's frequency by its label.
print(counts.loc['Maruti'])

2448


In [81]:
# Brands that occur at most this many times are treated as rare below.
threshold = 100

# Number of distinct brands before any grouping.
print(df['brand'].nunique())

32


In [85]:
# Labels of the brands whose frequency is at or below the threshold.
counts.loc[counts <= threshold].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [86]:
# Rare brands (frequency <= threshold) that will be merged into one label.
repl = counts.loc[counts <= threshold].index

In [87]:
# Collapse every rare brand into a single 'Uncommon' label, then one-hot encode
# the reduced set of brands.
pd.get_dummies(
    df['brand'].replace(to_replace=repl, value='Uncommon')
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,...,Skoda,Tata,Toyota,Uncommon,Volkswagen
0,False,False,False,False,False,...,False,False,False,False,False
1,False,False,False,False,False,...,True,False,False,False,False
2,False,False,False,True,False,...,False,False,False,False,False
3,False,False,False,False,True,...,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
8124,False,False,False,False,True,...,False,False,False,False,False
8125,False,False,False,False,False,...,False,False,False,False,False
8126,False,False,False,False,False,...,False,True,False,False,False
8127,False,False,False,False,False,...,False,True,False,False,False


In [93]:
# Spot-check five random rows of the encoded brands. The fixed random_state
# makes the same rows appear on every run; without it the sample changes on
# each execution.
pd.get_dummies(df['brand'].replace(repl, 'Uncommon')).sample(5, random_state=0)


Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,...,Skoda,Tata,Toyota,Uncommon,Volkswagen
1399,False,False,False,False,False,...,False,False,True,False,False
6262,False,False,False,False,False,...,False,False,False,False,False
2283,False,False,False,False,False,...,False,True,False,False,False
5915,False,False,False,False,False,...,False,False,False,False,False
7454,False,False,True,False,False,...,False,False,False,False,False
