In [52]:
import pandas as pd
import numpy as np

In [53]:
# Load the used-car listings and peek at one randomly chosen row.
df = pd.read_csv('cars.csv')
df.sample(n=1)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
3413,Toyota,68089,Petrol,First Owner,2000000


# One-hot encoding using pandas

In [54]:
# One-hot encode the two categorical columns with pandas. drop_first=True
# omits the first level of each column so the dummy columns are not redundant.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# One-hot encoding using scikit-learn

In [55]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['selling_price']),  # features: every column except the target
    df['selling_price'],                 # target
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((5689, 4), (2439, 4))

# Encoding the `fuel` and `owner` columns first

Separating the categorical columns into their own data frames

In [56]:
# Keep only the categorical columns that will be one-hot encoded.
# Both splits are selected by column name (rather than by position for one and
# by name for the other) so they always carry the same columns in the same
# order, even if the column layout of the frame changes.
X_train_new = X_train[['fuel', 'owner']]
X_test_new = X_test[['fuel', 'owner']]

# Summary of the categorical columns in the training split.
X_train_new.describe()

Unnamed: 0,fuel,owner
count,5689,5689
unique,4,5
top,Diesel,First Owner
freq,3075,3695


# Using `OneHotEncoder`

In [57]:
# One-hot encode the categorical columns with scikit-learn's OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder hand back a dense NumPy array directly.
# Alternative: leave sparse_output at its default and call .toarray() on the
# results of fit_transform / transform instead.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train_new_encoded = ohe.fit_transform(X_train_new)
X_test_new_encoded = ohe.transform(X_test_new)

In [58]:
# Display the dense one-hot encoded training array.
X_train_new_encoded

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int32)

In [59]:
# Stack the untouched columns (brand, km_driven) side by side with the one-hot
# encoded columns. Because strings and integers are mixed, the result is an
# object-dtype array.
X_train_encoded = np.hstack((X_train[['brand', 'km_driven']].to_numpy(), X_train_new_encoded))
# Show the array itself; wrapping it in .view() only created a throwaway view.
X_train_encoded


array([['Tata', 20000, 0, ..., 0, 0, 0],
       ['Maruti', 30000, 0, ..., 0, 0, 0],
       ['Maruti', 15000, 0, ..., 0, 0, 0],
       ...,
       ['Hyundai', 90000, 0, ..., 1, 0, 0],
       ['Volkswagen', 90000, 1, ..., 0, 0, 0],
       ['Hyundai', 110000, 0, ..., 0, 0, 0]], dtype=object)

# One-hot encoding only the most frequent categories

In [87]:
# Count how many rows each brand has.
counts = df['brand'].value_counts()
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [69]:
# Number of distinct brands before any grouping of rare brands.
df['brand'].nunique()

32

In [75]:
# Brands with at most `threshold` rows are treated as rare.
threshold = 100
mask = counts.le(threshold)
# Labels of the rare brands, to be replaced in the next step.
repl = counts.loc[mask].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [83]:
# Collapse every rare brand into a single 'uncommon' bucket; other brands are kept as-is.
df_brand = df['brand'].mask(df['brand'].isin(repl), 'uncommon')
df_brand.value_counts()

brand
Maruti        2448
Hyundai       1415
Mahindra       772
Tata           734
uncommon       538
Toyota         488
Honda          467
Ford           397
Chevrolet      230
Renault        228
Volkswagen     186
BMW            120
Skoda          105
Name: count, dtype: int64

In [89]:
# One-hot encode the grouped brand column; drop_first=True omits the first
# brand level so the remaining dummy columns are not redundant.
pd.get_dummies(data=df_brand, drop_first=True)

Unnamed: 0,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,True,False,False,False


In [92]:
# Number of distinct brand labels after rare brands were merged into 'uncommon'.
df_brand.nunique()

13