# Second Hand Car Price Prediction

In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import joblib
import warnings

In [79]:
# Silence every warning category for the rest of the session. This keeps cell
# output tidy, but it also hides deprecation notices from the imported libraries.
warnings.filterwarnings("ignore")

In [80]:
# Load the car listings; the path is relative to the notebook's working directory.
df = pd.read_csv("Cars_24.csv")

In [81]:
df.head(2)

Unnamed: 0,selling_price,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,1.2,2012.0,120000,19.7,796.0,46.3,11.0,MARUTI,ALTO STD,1,0,0,0,0,1,1,1,0
1,5.5,2016.0,20000,18.9,1197.0,82.0,7.0,HYUNDAI,GRAND I10 ASTA,1,0,0,0,0,1,1,1,0


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19820 entries, 0 to 19819
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   selling_price     19820 non-null  float64
 1   year              19820 non-null  float64
 2   km_driven         19820 non-null  int64  
 3   mileage           19820 non-null  float64
 4   engine            19820 non-null  float64
 5   max_power         19820 non-null  float64
 6   age               19820 non-null  float64
 7   make              19820 non-null  object 
 8   model             19820 non-null  object 
 9   Individual        19820 non-null  int64  
 10  Trustmark Dealer  19820 non-null  int64  
 11  Diesel            19820 non-null  int64  
 12  Electric          19820 non-null  int64  
 13  LPG               19820 non-null  int64  
 14  Petrol            19820 non-null  int64  
 15  Manual            19820 non-null  int64  
 16  5                 19820 non-null  int64 

In [83]:
# Replace the space in the column name so it can be used as a plain identifier.
# Rebinding the result (instead of inplace=True) keeps the cell idempotent and
# avoids mutating a frame that earlier cells have already displayed.
df = df.rename(columns={"Trustmark Dealer": "Trustmark_Dealer"})

In [84]:
# Feature matrix: every column except the target. Dropping the target by name
# (rather than slicing by position) keeps it out of the features even if the
# column order of the CSV changes, and returns an independent copy of the data.
x = df.drop(columns="selling_price")
x.head()

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark_Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,2012.0,120000,19.7,796.0,46.3,11.0,MARUTI,ALTO STD,1,0,0,0,0,1,1,1,0
1,2016.0,20000,18.9,1197.0,82.0,7.0,HYUNDAI,GRAND I10 ASTA,1,0,0,0,0,1,1,1,0
2,2010.0,60000,17.0,1197.0,80.0,13.0,HYUNDAI,I20 ASTA,1,0,0,0,0,1,1,1,0
3,2012.0,37000,20.92,998.0,67.1,11.0,MARUTI,ALTO K10 2010-2014 VXI,1,0,0,0,0,1,1,1,0
4,2015.0,30000,22.77,1498.0,98.59,8.0,FORD,ECOSPORT 2015-2021 1.5 TDCI TITANIUM BSIV,0,0,1,0,0,0,1,1,0


In [85]:
x.shape

(19820, 17)

In [86]:
# Regression target: the selling_price column, exactly as stored in the CSV.
y = df["selling_price"]
y.head()

0    1.20
1    5.50
2    2.15
3    2.26
4    5.70
Name: selling_price, dtype: float64

In [87]:
y.shape

(19820,)

In [88]:
from sklearn.model_selection import train_test_split

In [89]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [90]:
x_train.head()

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark_Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
11519,2012.0,120000,12.99,2494.0,100.6,11.0,TOYOTA,INNOVA 2.5 G (DIESEL) 8 SEATER,1,0,1,0,0,0,1,0,1
10241,2009.0,50000,17.3,1061.0,57.5,14.0,MARUTI,WAGON R LXI DUO BSIII,1,0,0,0,1,0,1,1,0
15309,2015.0,55000,18.19,1197.0,103.2,8.0,VOLKSWAGEN,VENTO 1.2 TSI HIGHLINE AT,0,0,0,0,0,1,0,1,0
12025,2013.0,57001,20.77,1248.0,88.76,10.0,MARUTI,ERTIGA VDI,0,0,1,0,0,0,1,0,1
2468,2017.0,66000,31.79,998.0,58.2,6.0,MARUTI,CELERIO GREEN VXI OPTIONAL,1,0,0,0,0,0,1,1,0


In [91]:
y_train.head()

11519    8.00
10241    1.45
15309    5.99
12025    4.49
2468     4.50
Name: selling_price, dtype: float64

In [92]:
x_test.shape

(3964, 17)

In [93]:
y_test.shape

(3964,)

In [94]:
df["make"].nunique()

41

In [95]:
df["model"].nunique()

3233

In [96]:
# Dependency: category_encoders. If it is missing, install it into the kernel's
# environment once with: %pip install category_encoders

In [97]:
from category_encoders import TargetEncoder
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge

In [98]:
# Polynomial ridge pipeline, applied in this order:
#   1. target-encode the two text columns (make, model) with smoothing,
#   2. standardise all features,
#   3. expand to degree-2 polynomial/interaction terms (no bias column),
#   4. fit a Ridge regressor on the expanded matrix.
poly_steps = [
    ("Target_encode", TargetEncoder(cols=["make", "model"], smoothing=0.3, min_samples_leaf=20)),
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("ridge", Ridge(alpha=5.5)),
]
pipeline = Pipeline(steps=poly_steps)

In [99]:
# Fit every step (encoder, scaler, polynomial expansion, Ridge) on the training split only.
pipeline.fit(x_train,y_train)

### Final transformed features after encoding, scaling, and polynomial expansion

In [102]:
# Push the training rows through every step except the final Ridge estimator
# to inspect the matrix the regressor is actually fitted on.
preprocessing_steps = pipeline[:-1]
transformed_x_train = preprocessing_steps.transform(x_train)
first_rows = transformed_x_train[:6]
print("Shape - ",transformed_x_train.shape)
print("transformed_x_trained data -> ",first_rows)

Shape -  (15856, 170)
transformed_x_trained data ->  [[-0.7965532   1.18108809 -1.53799669 ...  5.03348554 -5.28642077
   5.55206612]
 [-1.73239697 -0.15702706 -0.51714229 ...  0.19866949 -0.18916391
   0.18011313]
 [ 0.13929057 -0.0614474  -0.30633941 ...  0.19866949 -0.18916391
   0.18011313]
 [-0.48460528 -0.02319643  0.30475209 ...  5.03348554 -5.28642077
   5.55206612]
 [ 0.76318642  0.14882783  2.91491811 ...  0.19866949 -0.18916391
   0.18011313]
 [ 0.13929057  0.58012146  0.92768645 ...  0.19866949 -0.18916391
   0.18011313]]


## Evaluating the degree-2 polynomial model on the test data

In [103]:
# Predicted selling prices for the held-out rows. Despite the x_ prefix, this
# array holds target predictions, not features.
x_test_pred_poly = pipeline.predict(x_test)
x_test_pred_poly

array([4.03617851, 9.87317549, 5.8260795 , ..., 5.48661766, 9.90939232,
       3.45700076], shape=(3964,))

In [104]:
pipeline.score(x_train,y_train)

0.8965206852783371

In [105]:
pipeline.score(x_test,y_test)

0.8742290285016514

In [106]:
# Persist the whole fitted pipeline (encoder, scaler, polynomial expansion and
# Ridge) so raw feature rows can be scored later without redoing the
# preprocessing by hand. The file is written to the current working directory.
joblib.dump(pipeline, "poly_model.joblib")

['poly_model.joblib']

## Training a Linear Regression baseline

Encoding Train Data

In [107]:
# Learn the target encoding for "make" from the training rows only, then
# overwrite the text column in x_train with its numeric encoding.
encoder_1 = TargetEncoder(cols=["make"])
make_encoded_train = encoder_1.fit_transform(x_train["make"], y_train)
x_train["make"] = make_encoded_train

In [108]:
# Learn the target encoding for "model" from the training rows only, then
# overwrite the text column in x_train with its numeric encoding.
encoder_2 = TargetEncoder(cols=["model"])
model_encoded_train = encoder_2.fit_transform(x_train["model"], y_train)
x_train["model"] = model_encoded_train

In [109]:
x_train.head()

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark_Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
11519,2012.0,120000,12.99,2494.0,100.6,11.0,10.588717,8.339101,1,0,1,0,0,0,1,0,1
10241,2009.0,50000,17.3,1061.0,57.5,14.0,4.657152,4.761434,1,0,0,0,1,0,1,1,0
15309,2015.0,55000,18.19,1197.0,103.2,8.0,5.478849,6.607383,0,0,0,0,0,1,0,1,0
12025,2013.0,57001,20.77,1248.0,88.76,10.0,4.657152,6.339168,0,0,1,0,0,0,1,0,1
2468,2017.0,66000,31.79,998.0,58.2,6.0,4.657152,6.311732,1,0,0,0,0,0,1,1,0


Scaling Train Data

In [110]:
from sklearn.preprocessing import StandardScaler

In [111]:
# Standalone scaler for the manual linear-regression baseline; the pipeline
# above keeps its own, separately fitted scaler.
scaler = StandardScaler()

In [112]:
# Fit the scaler on the (already target-encoded) training rows and standardise
# them in one step. The result is a plain NumPy array without column labels.
scaled_train = scaler.fit_transform(x_train)
scaled_train

array([[-0.7965532 ,  1.18108809, -1.53799669, ...,  0.49800863,
        -2.24354308,  2.35628227],
       [-1.73239697, -0.15702706, -0.51714229, ...,  0.49800863,
         0.44572356, -0.42439737],
       [ 0.13929057, -0.0614474 , -0.30633941, ..., -2.00799734,
         0.44572356, -0.42439737],
       ...,
       [-0.7965532 ,  0.41645086, -0.04342795, ...,  0.49800863,
         0.44572356, -0.42439737],
       [ 0.13929057, -0.34818637,  1.05795789, ...,  0.49800863,
         0.44572356, -0.42439737],
       [ 1.38708227, -0.82608463, -0.31581369, ...,  0.49800863,
        -2.24354308,  2.35628227]], shape=(15856, 17))

In [113]:
# Write the standardised values back into x_train, keeping its index and column labels.
# NOTE(review): from here on x_train no longer holds the raw features (make/model
# are encoded, everything is scaled), so the earlier cells that pass x_train to
# `pipeline` must not be re-run without first re-creating the train/test split.
x_train[x_train.columns] = scaled_train
x_train.sample(1)

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark_Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
5425,-0.484605,0.034132,0.619772,-0.441043,-0.473935,0.484605,-1.08748,-0.235714,1.242365,-0.097727,1.016406,-0.015885,-0.059534,-0.977178,0.498009,0.445724,-0.424397


Modeling Data by Linear Regression

In [114]:
from sklearn.linear_model import LinearRegression

In [115]:
# Plain linear regression, used as the baseline to compare with the polynomial Ridge pipeline.
model = LinearRegression()

In [116]:
# x_train has been target-encoded and standardised in place by the cells above.
model.fit(x_train, y_train)

Encoding Test Data

In [117]:
# Apply the "make" encoding learned on the training rows to the test rows.
# transform() is deliberately called without y_test: held-out targets must
# never be supplied when building test-set features.
x_test["make"] = encoder_1.transform(x_test["make"])
x_test.head(2)

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark_Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
15946,2014.0,40000,18.9,1197.0,82.0,9.0,5.449941,GRAND I10 ASTA OPTION,0,0,0,0,0,1,1,1,0
18151,2019.0,7121,17.21,1197.0,103.5,4.0,5.478849,POLO 2015-2019 GT TSI,0,0,0,0,0,1,0,1,0


In [118]:
# Apply the "model" encoding learned on the training rows to the test rows.
# transform() is deliberately called without y_test: held-out targets must
# never be supplied when building test-set features.
x_test["model"] = encoder_2.transform(x_test["model"])
x_test.head(2)

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark_Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
15946,2014.0,40000,18.9,1197.0,82.0,9.0,5.449941,5.296543,0,0,0,0,0,1,1,1,0
18151,2019.0,7121,17.21,1197.0,103.5,4.0,5.478849,6.835641,0,0,0,0,0,1,0,1,0


Scaling Test Data

In [119]:
# Standardise the test rows with the statistics learned from the training rows
# (transform only, no refit).
scaled_test = scaler.transform(x_test)
scaled_test

array([[-0.17265735, -0.34818637, -0.13817082, ...,  0.49800863,
         0.44572356, -0.42439737],
       [ 1.38708227, -0.97669905, -0.53845943, ..., -2.00799734,
         0.44572356, -0.42439737],
       [ 1.38708227, -0.8069687 ,  1.0319036 , ...,  0.49800863,
         0.44572356, -0.42439737],
       ...,
       [ 0.13929057,  0.05324818,  1.49614366, ...,  0.49800863,
         0.44572356, -0.42439737],
       [ 0.45123849,  0.11059597, -0.96717091, ...,  0.49800863,
        -2.24354308,  2.35628227],
       [-0.48460528,  0.1297119 , -0.09079938, ...,  0.49800863,
         0.44572356, -0.42439737]], shape=(3964, 17))

In [120]:
# Write the standardised values back into x_test, keeping its index and column labels.
# NOTE(review): after this cell x_test no longer holds raw features, so the earlier
# `pipeline.predict` / `pipeline.score` cells must not be re-run against it
# without first re-creating the train/test split.
x_test[x_test.columns] = scaled_test

In [121]:
# Baseline predictions for the held-out rows (x_test is encoded and scaled at this point).
y_pred = model.predict(x_test)
y_pred

array([4.31100752, 9.4223334 , 5.76399655, ..., 5.3044308 , 9.69850734,
       2.89863071], shape=(3964,))

R<sup>2</sup> Score

In [122]:
model.score(x_train, y_train)

0.8534424484109444

In [123]:
model.score(x_test, y_test)

0.8457839405948762

### <b>Observation</b>: The degree-2 polynomial Ridge pipeline is chosen over plain Linear Regression because it achieves a higher R<sup>2</sup> on both the training data (≈0.897 vs. ≈0.853) and the test data (≈0.874 vs. ≈0.846).