In [48]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [8]:
# Load the raw car listings; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer=r"../Dataset/car-details.csv")

In [9]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti Swift Dzire VDI,Maruti,Swift,Dzire VDI,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda Rapid 1.5 TDI Ambition,Skoda,Rapid,1.5 TDI Ambition,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda City 2017-2020 EXi,Honda,City,2017-2020 EXi,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai i20 Sportz Diesel,Hyundai,i20,Sportz Diesel,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti Swift VXI BSIII,Maruti,Swift,VXI BSIII,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


In [10]:
# Dimensions of the frame as (rows, columns).
df.shape

(6926, 16)

In [11]:
# Print each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6926 non-null   object 
 1   company        6926 non-null   object 
 2   model          6926 non-null   object 
 3   edition        6926 non-null   object 
 4   year           6926 non-null   int64  
 5   owner          6926 non-null   object 
 6   fuel           6926 non-null   object 
 7   seller_type    6926 non-null   object 
 8   transmission   6926 non-null   object 
 9   km_driven      6926 non-null   int64  
 10  mileage_mpg    6718 non-null   float64
 11  engine_cc      6718 non-null   float64
 12  max_power_bhp  6717 non-null   float64
 13  torque_nm      6717 non-null   float64
 14  seats          6718 non-null   float64
 15  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 865.9+ KB


In [12]:
# Number of missing values in each column.
df.isna().sum()

name               0
company            0
model              0
edition            0
year               0
owner              0
fuel               0
seller_type        0
transmission       0
km_driven          0
mileage_mpg      208
engine_cc        208
max_power_bhp    209
torque_nm        209
seats            208
selling_price      0
dtype: int64

In [22]:
# Report the cardinality of every object-dtype (string) column.
for col, values in df.select_dtypes(include='O').items():
    print("Column", col)
    print("Count of Unique Values:", values.nunique())

Column name
Count of Unique Values: 2058
Column company
Count of Unique Values: 32
Column model
Count of Unique Values: 207
Column edition
Count of Unique Values: 1916
Column owner
Count of Unique Values: 5
Column fuel
Count of Unique Values: 4
Column seller_type
Count of Unique Values: 3
Column transmission
Count of Unique Values: 2


In [24]:
# Drop the free-text / very high-cardinality identifier columns.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['name', 'edition', 'model'], errors='ignore')

In [36]:
# Quartiles and Tukey fences (1.5 * IQR beyond the quartiles) for selling_price.
price = df['selling_price']
Q1, Q3 = (price.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Show the rows whose price lies outside the fences; nothing is removed here.
df[price.gt(upper_bound) | price.lt(lower_bound)]

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
41,Jeep,2019,First,Petrol,Individual,Automatic,5000,37.60,1368.0,160.77,250.0,5.0,2100000
47,Toyota,2014,First,Diesel,Dealer,Manual,77000,29.50,2982.0,168.50,343.0,7.0,1500000
49,Mercedes-Benz,2014,Second,Diesel,Dealer,Automatic,27800,34.78,2143.0,120.70,200.0,5.0,1450000
55,Toyota,2016,Second,Diesel,Dealer,Automatic,127700,26.70,2755.0,171.50,360.0,7.0,1650000
57,Audi,2013,Second,Diesel,Dealer,Automatic,33900,41.56,1968.0,174.33,380.0,5.0,1750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6772,Hyundai,2018,First,Petrol,Individual,Manual,11000,37.12,1591.0,121.30,151.0,5.0,1220000
6784,Renault,2018,First,Diesel,Individual,Manual,12000,47.88,1461.0,108.45,240.0,5.0,1265000
6788,BMW,2011,First,Diesel,Individual,Automatic,84925,37.75,1995.0,181.00,380.0,5.0,1500000
6859,Audi,2017,First,Petrol,Dealer,Automatic,8000,39.00,1798.0,177.50,250.0,5.0,1689999


In [38]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [40]:
# Display the current state of the frame.
df

Unnamed: 0,company,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,2014,First,Diesel,Individual,Manual,145500,55.00,1248.0,74.00,190.000000,5.0,450000
1,Skoda,2014,Second,Diesel,Individual,Manual,120000,49.70,1498.0,103.52,250.000000,5.0,370000
2,Honda,2006,Third,Petrol,Individual,Manual,140000,41.60,1497.0,78.00,124.544455,5.0,158000
3,Hyundai,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.00,219.668960,5.0,225000
4,Maruti,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.20,112.776475,5.0,130000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6921,Maruti,2013,Second,Petrol,Individual,Manual,50000,44.40,998.0,67.10,90.000000,5.0,260000
6922,Hyundai,2014,Second,Diesel,Individual,Manual,80000,52.97,1396.0,88.73,219.700000,5.0,475000
6923,Hyundai,2013,First,Petrol,Individual,Manual,110000,43.47,1197.0,82.85,113.700000,5.0,320000
6924,Hyundai,2007,Fourth & Above,Diesel,Individual,Manual,119000,39.47,1493.0,110.00,235.359600,5.0,135000


In [42]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

0

# Creating Pipelines Using ColumnTransformer

In [46]:
# Separate the regression target (selling_price) from the feature columns.
y = df['selling_price'].copy()
X = df.drop(columns=['selling_price'])

print(X.shape, y.shape)

(6907, 12) (6907,)


In [49]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the training split, then of the test split.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

(5525, 12) (5525,)
(1382, 12) (1382,)


In [60]:
# Names of the numeric feature columns, in frame order.
num_cols = list(X_train.select_dtypes(include='number').columns)

In [62]:
# Every remaining (non-numeric) feature column is treated as categorical.
cat_cols = [name for name in X_train.columns if name not in num_cols]

In [74]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipe = Pipeline([
    ('num_preprocess', SimpleImputer(strategy='median')),
    ('num_transform', StandardScaler()),
])

# Categorical branch: fill missing values with the mode, then one-hot encode.
# Categories unseen during fit are ignored rather than raising at predict time.
cat_pipe = Pipeline([
    ('cat_preprocess', SimpleImputer(strategy='most_frequent')),
    ('cat_transform', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each group of columns through its own branch.
preprocess = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])

# Shallow random forest with a fixed seed.
regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

# Preprocessing and model chained into a single estimator.
rf_model = Pipeline([
    ('preprocess', preprocess),
    ('model_', regressor),
])

# Fit on the training split; the fitted pipeline is the cell's output.
rf_model.fit(X_train, y_train)

In [75]:
# Evaluate the fitted pipeline with RMSE on both splits.
# The square root is taken explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases.
y_train_pred = rf_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f'Train RMSE: {train_rmse:,.3f}')

y_test_pred = rf_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'Test RMSE: {test_rmse:,.3f}')

Train RMSE: 165,369.548
Test RMSE: 163,659.571
