In [63]:
from math import remainder

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [64]:
# Read the cars table from a CSV path relative to the notebook's working directory.
cars_csv = '../data/cars.csv'
df = pd.read_csv(cars_csv)

In [65]:
# Preview the first five rows (head's default row count, spelled out here).
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [66]:
# How many rows fall under each fuel type, most frequent first.
fuel_counts = df['fuel'].value_counts()
fuel_counts

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [67]:
# Count distinct brands; missing values are excluded (nunique's default, made explicit).
df['brand'].nunique(dropna=True)

32

In [68]:
# Number of distinct owner categories, ignoring missing values (the default behaviour).
df['owner'].nunique(dropna=True)

5

In [69]:
# Feature matrix: the brand, fuel and owner string columns plus the numeric km_driven column.
x = df[['brand', 'km_driven', 'fuel', 'owner']]

# Target column.
y = df['selling_price']

# Hold out 20% of the rows for testing; random_state pins the shuffle so
# re-running the cell reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [70]:
# Per-column preprocessing:
#   * fuel          -> a single integer code, following the category order listed below
#   * brand, owner  -> one indicator column per category
#   * any other column (here km_driven) -> passed through unchanged
#
# NOTE(review): the fuel encoder keeps the default handle_unknown, so a fuel value
# outside the four listed categories raises, whereas unseen brand/owner values
# are tolerated. Confirm that asymmetry is intended.
fuel_encoder = OrdinalEncoder(categories=[['Diesel', 'Petrol', 'CNG', 'LPG']])

# handle_unknown='ignore' encodes a category that was not seen during fit as all
# zeros instead of raising at transform time; sparse_output=False yields a dense array.
brand_owner_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

transformer = ColumnTransformer(
    transformers=[
        ('tnf1', fuel_encoder, ['fuel']),
        ('tnf2', brand_owner_encoder, ['brand', 'owner']),
    ],
    remainder='passthrough',
)

In [74]:
# Fit the column transformer on the training split and encode that same split in one call.
X_train_transformed = transformer.fit_transform(X_train)

In [72]:
# Encode the held-out split with the fitted transformer and report (rows, columns).
X_test_transformed = transformer.transform(X_test)
X_test_transformed.shape

(1626, 38)

In [76]:
# Label the encoded training array with the transformer's output feature names
# and preview the first rows.
feature_names = transformer.get_feature_names_out()
X_train_preview = pd.DataFrame(data=X_train_transformed, columns=feature_names)
X_train_preview.head()

Unnamed: 0,tnf1__fuel,tnf2__brand_Ambassador,tnf2__brand_Ashok,tnf2__brand_Audi,tnf2__brand_BMW,tnf2__brand_Chevrolet,tnf2__brand_Daewoo,tnf2__brand_Datsun,tnf2__brand_Fiat,tnf2__brand_Force,...,tnf2__brand_Tata,tnf2__brand_Toyota,tnf2__brand_Volkswagen,tnf2__brand_Volvo,tnf2__owner_First Owner,tnf2__owner_Fourth & Above Owner,tnf2__owner_Second Owner,tnf2__owner_Test Drive Car,tnf2__owner_Third Owner,remainder__km_driven
0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,60000.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,150000.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,110000.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,28000.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,15000.0
