In [1]:
import pandas as pd 
import numpy as np

# Importing the dataset

In [2]:
# Load the dataset from a CSV file; the path is resolved relative to the
# notebook's current working directory.
df=pd.read_csv("cardataset.csv")

In [3]:
# Print column names, non-null counts and dtypes to spot missing data and
# columns stored with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


# Checking the null values present in the dataset

In [4]:
# Number of missing values in each column.
df.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

Our dataset has about 8.1k rows and only a few hundred of them contain null values,
so we can afford to drop the rows with null values as well as any duplicate rows.

In [5]:
# Remove every row that has at least one missing value, then remove exact
# duplicate rows. Both calls mutate `df` in place, so the frame as originally
# loaded is no longer available after this cell runs.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [6]:
# Sanity check after cleaning: every column should now report zero nulls.
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
torque           0
seats            0
dtype: int64

In [7]:
# Preview the three specification columns that are stored as text with a
# unit suffix (e.g. "23.4 kmpl", "1248 CC", "74 bhp").
df[["mileage","engine","max_power"]]

Unnamed: 0,mileage,engine,max_power
0,23.4 kmpl,1248 CC,74 bhp
1,21.14 kmpl,1498 CC,103.52 bhp
2,17.7 kmpl,1497 CC,78 bhp
3,23.0 kmpl,1396 CC,90 bhp
4,16.1 kmpl,1298 CC,88.2 bhp
...,...,...,...
8121,18.9 kmpl,998 CC,67.1 bhp
8122,22.54 kmpl,1396 CC,88.73 bhp
8123,18.5 kmpl,1197 CC,82.85 bhp
8124,16.8 kmpl,1493 CC,110 bhp


The mileage, engine and max_power columns are of object type, but they hold numerical
values that could be important for us. So we will strip the unit suffix and convert each
of these columns to a numerical column.

In [8]:
# Pull the leading numeric value out of each unit-suffixed text column.
# A single raw-string pattern is shared by all three columns so that whole-number
# readings (e.g. "23 kmpl") are captured as well; a pattern that insists on a
# decimal point would turn those rows into NaN. The raw string also avoids the
# invalid-escape warning that "\d" triggers in a normal string literal.
# NOTE(review): the target column names assume one unit per source column
# (kmpl / CC / bhp) — confirm that the data does not mix units.
numeric_pattern = r"([-+]?\d*\.?\d+)"
for source_col, target_col in [("mileage", "mileage/kmpl"),
                               ("engine", "engine/cc"),
                               ("max_power", "max_power/bhp")]:
    # expand=False yields a Series for the single capture group rather than a
    # one-column DataFrame.
    df[target_col] = df[source_col].str.extract(numeric_pattern, expand=False)

In [9]:
# Convert the extracted text values to numeric dtypes: mileage and power keep
# their fractional part, engine displacement is stored as an integer.
spec_dtypes = {"mileage/kmpl": float, "engine/cc": int, "max_power/bhp": float}
for spec_col, spec_dtype in spec_dtypes.items():
    df[spec_col] = df[spec_col].astype(spec_dtype)

Now checking the correlation between the numerical columns

In [10]:
# Pairwise correlation between the numeric columns only.
# The frame still contains text columns (name, fuel, owner, ...); they are
# excluded explicitly because recent pandas releases raise on non-numeric data
# in corr() instead of silently dropping it.
df.select_dtypes(include="number").corr()

Unnamed: 0,year,selling_price,km_driven,seats,mileage/kmpl,engine/cc,max_power/bhp
year,1.0,0.427335,-0.387918,0.025021,0.366048,-0.019763,0.159889
selling_price,0.427335,1.0,-0.161265,0.158531,-0.108655,0.442772,0.692323
km_driven,-0.387918,-0.161265,1.0,0.20789,-0.196419,0.25346,0.04177
seats,0.025021,0.158531,0.20789,1.0,-0.459188,0.658711,0.259028
mileage/kmpl,0.366048,-0.108655,-0.196419,-0.459188,1.0,-0.579153,-0.378609
engine/cc,-0.019763,0.442772,0.25346,0.658711,-0.579153,1.0,0.683506
max_power/bhp,0.159889,0.692323,0.04177,0.259028,-0.378609,0.683506,1.0


In [11]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
# Enable plotly's offline notebook rendering.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it in the notebook, so figures need network access — confirm.
init_notebook_mode(connected=True)

Plotting a histogram to see how many cars fall into each selling-price range

In [12]:
# Histogram of selling prices, coloured by seller type, with a box plot drawn
# in the margin above the bars.
price_hist_options = dict(
    x='selling_price',
    color="seller_type",
    marginal='box',
    nbins=70,
    title='selling price distribution',
)
fig = px.histogram(df, **price_hist_options)
fig.update_layout(bargap=0.1)
fig.show()

Plotting another histogram to see dataset distribution among different types of owners

In [13]:
# Count of listings per owner category, with a marginal box plot.
owner_hist_options = dict(
    x='owner',
    marginal='box',
    nbins=5,
    title='vehicle sold by owner',
)
fig = px.histogram(df, **owner_hist_options)
fig.update_layout(bargap=0.1)
fig.show()

Plotting a scatter plot to see how selling price varies with engine/cc

In [14]:
# Selling price against engine displacement; marker colour encodes the year.
engine_scatter_options = dict(
    x='engine/cc',
    y='selling_price',
    color='year',
    title='Engine vs Selling price',
)
fig = px.scatter(df, **engine_scatter_options)
fig.update_traces(marker=dict(size=9))
fig.show()

plotting scatterplot to see how selling price vary with year

In [15]:
# Selling price against year, drawn with slightly transparent markers so that
# overlapping points remain visible.
year_scatter_options = dict(
    x='year',
    y='selling_price',
    opacity=0.8,
    title='year vs selling price',
)
fig = px.scatter(df, **year_scatter_options)
fig.update_traces(marker=dict(size=9))
fig.show()

In the graph below we can see that a lower km_driven is linked with a higher selling price. If
a car is listed by its first owner and km_driven is low, the selling price tends to be higher.

In [16]:
# Selling price against kilometres driven, coloured by owner category.
km_scatter_options = dict(
    x='km_driven',
    y='selling_price',
    color='owner',
    opacity=0.8,
    title='km_driven vs selling price',
)
fig = px.scatter(df, **km_scatter_options)
fig.update_traces(marker=dict(size=9))
# Restrict the visible x-axis to 1..600000 km.
fig.update_layout(xaxis=dict(range=[1, 600000]))
fig.show()

The dataset contains several categorical columns that could be important for prediction,
so we use one-hot encoding to turn these columns into numerical columns.

In [17]:
from sklearn import preprocessing
# A single encoder instance with default settings; it is re-fitted on a
# different categorical column in each of the following cells.
enc = preprocessing.OneHotEncoder()


In [18]:
# Fit the encoder on the fuel column and show the categories it learned
# (fit returns the encoder itself, so the attribute can be read directly).
enc.fit(df[['fuel']]).categories_


[array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]

In [19]:
# One-hot encode `fuel` with the encoder fitted in the previous cell and attach
# one indicator column per category.
# The column labels are read from the fitted encoder rather than typed by hand,
# so they always match the number and order of the columns produced by
# transform(), even if the categories in the data change.
one_hot = enc.transform(df[['fuel']]).toarray()
fuel_labels = list(enc.categories_[0])
df[fuel_labels] = one_hot


In [20]:
# Re-fit the shared encoder on the seller_type column and show the learned
# categories (fit returns the encoder itself).
enc.fit(df[['seller_type']]).categories_

[array(['Dealer', 'Individual', 'Trustmark Dealer'], dtype=object)]

In [21]:
# One-hot encode `seller_type` with the encoder fitted in the previous cell and
# attach one indicator column per category.
# Labels come from the fitted encoder instead of a hand-typed list, so they
# cannot get out of step with the columns that transform() returns.
one_hot = enc.transform(df[['seller_type']]).toarray()
seller_type_labels = list(enc.categories_[0])
df[seller_type_labels] = one_hot



In [22]:
# Re-fit the shared encoder on the owner column and show the learned
# categories (fit returns the encoder itself).
enc.fit(df[['owner']]).categories_

[array(['First Owner', 'Fourth & Above Owner', 'Second Owner',
        'Test Drive Car', 'Third Owner'], dtype=object)]

In [23]:
# One-hot encode `owner` with the encoder fitted in the previous cell and attach
# one indicator column per category.
# Labels come from the fitted encoder instead of a hand-typed list, so they
# cannot get out of step with the columns that transform() returns.
one_hot = enc.transform(df[['owner']]).toarray()
owner_labels = list(enc.categories_[0])
df[owner_labels] = one_hot


In [24]:
# Binary-encode the transmission column: Manual -> 0, Automatic -> 1.
# Any value not present in the mapping becomes NaN, which also means this cell
# must not be run twice on the same frame.
transmission_value = dict(Manual=0, Automatic=1)
df["transmission"] = df["transmission"].map(transmission_value)


In [25]:
# List all columns, including the indicator columns added by the encoding cells.
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque',
       'seats', 'mileage/kmpl', 'engine/cc', 'max_power/bhp', 'CNG', 'Diesel',
       'LPG', 'Petrol', 'Dealer', 'Individual', 'Trustmark Dealer',
       'First Owner', 'Fourth & Above Owner', 'Second Owner', 'Test Drive Car',
       'Third Owner'],
      dtype='object')

In [26]:
# Correlation matrix again, now including the encoded categorical columns.
# Only numeric columns are passed to corr(): the original text columns are
# still in the frame, and recent pandas releases raise on them instead of
# dropping them silently.
df.select_dtypes(include="number").corr()

Unnamed: 0,year,selling_price,km_driven,transmission,seats,mileage/kmpl,engine/cc,max_power/bhp,CNG,Diesel,LPG,Petrol,Dealer,Individual,Trustmark Dealer,First Owner,Fourth & Above Owner,Second Owner,Test Drive Car,Third Owner
year,1.0,0.427335,-0.387918,0.143997,0.025021,0.366048,-0.019763,0.159889,0.037774,0.053195,-0.057512,-0.051601,0.140091,-0.14725,0.046187,0.46727,-0.207548,-0.2851,0.037741,-0.261961
selling_price,0.427335,1.0,-0.161265,0.465538,0.158531,-0.108655,0.442772,0.692323,-0.034873,0.254024,-0.043617,-0.242279,0.26003,-0.260324,0.023266,0.230493,-0.086433,-0.158256,0.202151,-0.127216
km_driven,-0.387918,-0.161265,1.0,-0.118965,0.20789,-0.196419,0.25346,0.04177,-0.010304,0.279345,0.01906,-0.281045,-0.119168,0.125429,-0.040113,-0.264431,0.084031,0.181428,-0.027325,0.133485
transmission,0.143997,0.465538,-0.118965,1.0,-0.019314,-0.173667,0.219526,0.441681,-0.026763,-0.018312,-0.022144,0.026248,0.202953,-0.212855,0.064661,0.085057,-0.029304,-0.058737,0.069692,-0.047344
seats,0.025021,0.158531,0.20789,-0.019314,1.0,-0.459188,0.658711,0.259028,-0.040357,0.362669,-0.03195,-0.351944,-0.034041,0.038284,-0.023261,-0.010771,0.007749,0.011815,-0.012049,-0.003537
mileage/kmpl,0.366048,-0.108655,-0.196419,-0.173667,-0.459188,1.0,-0.579153,-0.378609,0.103233,0.073593,-0.015903,-0.089524,0.005474,-0.003511,-0.00898,0.177487,-0.08243,-0.103043,-0.017239,-0.103178
engine/cc,-0.019763,0.442772,0.25346,0.219526,0.658711,-0.579153,1.0,0.683506,-0.059767,0.537304,-0.058751,-0.519763,0.072777,-0.070498,-0.004838,-0.042198,0.003179,0.038802,0.016993,0.007979
max_power/bhp,0.159889,0.692323,0.04177,0.441681,0.259028,-0.378609,0.683506,1.0,-0.072703,0.329291,-0.064873,-0.308052,0.194103,-0.194538,0.018403,0.054209,-0.027113,-0.03288,0.06409,-0.035223
CNG,0.037774,-0.034873,-0.010304,-0.026763,-0.040357,0.103233,-0.059767,-0.072703,1.0,-0.09565,-0.00633,-0.077944,-0.029019,0.029667,-0.005557,0.004573,-0.00202,0.002537,-0.002387,-0.011464
Diesel,0.053195,0.254024,0.279345,-0.018312,0.362669,0.073593,0.537304,0.329291,-0.09565,1.0,-0.079143,-0.974455,0.033313,-0.021228,-0.055297,-0.034027,-0.008783,0.045765,-0.018885,-0.008577


defining the input and target columns

In [27]:

# Feature columns for the model, grouped by where they come from. The groups are
# concatenated in a fixed order so the resulting column order is stable.
numeric_features = ['year', 'km_driven', 'seats', 'mileage/kmpl', 'engine/cc', 'max_power/bhp']
fuel_features = ['CNG', 'Diesel', 'LPG', 'Petrol']
seller_features = ['Dealer', 'Individual', 'Trustmark Dealer']
owner_features = ['First Owner', 'Fourth & Above Owner', 'Second Owner',
                  'Test Drive Car', 'Third Owner']
input_cols = (numeric_features + fuel_features + seller_features
              + owner_features + ['transmission'])

# Model inputs and the value to predict.
inputs = df[input_cols]
targets = df['selling_price']



In [28]:
def rmse(targets, predictions):
    """Return the root-mean-square error between targets and predictions.

    Parameters
    ----------
    targets : array-like
        Observed values (list, NumPy array, pandas Series, ...).
    predictions : array-like
        Predicted values, compared with ``targets`` position by position.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((targets - predictions) ** 2))``.
    """
    # Convert both inputs to plain float arrays before subtracting. This makes
    # the comparison positional: two pandas Series with different indexes would
    # otherwise be aligned by label, yielding NaN or a silently partial result.
    # It also lets plain Python lists be passed in.
    errors = np.asarray(targets, dtype=float) - np.asarray(predictions, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))

In [29]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRegressor

In [30]:
# XGBoost regressor created with the library's default hyper-parameters.
model=XGBRegressor()

In [31]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    inputs,
    targets,
    test_size=0.3,
    random_state=0,
)

In [32]:
# Train the regressor on the training split.
model.fit(X_train,Y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [33]:
# Predict selling prices for the held-out test rows.
y_pred=model.predict(X_test)

The root mean square error on the test set is about 165,971 (in the same units as selling_price), which is not bad.

In [34]:
# Root-mean-square error of the predictions on the test split.
rmse(Y_test,y_pred)

165971.29005657262

An R² score of about 0.9025 on the test set is also good.

In [35]:
# Coefficient of determination (R²) of the predictions on the test split.
r2_value = metrics.r2_score(Y_test, y_pred)
print('r2score', r2_value)

r2score 0.9025535825180329


In [36]:
# Put actual and predicted prices side by side; the frame keeps the index of
# Y_test, so each row can be traced back to its source row in the dataset.
testout = pd.DataFrame(
    dict(actual_value=Y_test, predicted_value=y_pred)
)

In [37]:
# Display the actual-versus-predicted comparison table.
testout

Unnamed: 0,actual_value,predicted_value
5162,350000,375600.78125
3707,180000,181121.71875
2146,330000,366323.90625
2955,486000,458051.37500
5387,375000,329448.40625
...,...,...
5666,620000,631726.00000
4797,810000,756023.50000
2538,490000,466688.25000
6378,390000,395243.15625
