## Importing the libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

## Reading the data

In [2]:
data = pd.read_csv("Car details v3.csv")
data.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage (kmpl),engine (cc),max_power,torque,seats,selling_price
0,Maruti Swift Dzire VDI,2014,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74 bhp,190Nm@ 2000rpm,5.0,450000
1,Skoda Rapid 1.5 TDI Ambition,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52 bhp,250Nm@ 1500-2500rpm,5.0,370000
2,Honda City 2017-2020 EXi,2006,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,158000
3,Hyundai i20 Sportz Diesel,2010,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90 bhp,22.4 kgm at 1750-2750rpm,5.0,225000
4,Maruti Swift VXI BSIII,2007,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,130000


In [3]:
data.describe(include = 'all')

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage (kmpl),engine (cc),max_power,torque,seats,selling_price
count,8128,8128.0,8128.0,8128,8128,8128,8128,7907.0,7907.0,7913,7906,7907.0,8128.0
unique,2058,,,4,3,2,5,381.0,,322,441,,
top,Maruti Swift Dzire VDI,,,Diesel,Individual,Manual,First Owner,18.9,,74 bhp,190Nm@ 2000rpm,,
freq,129,,,4402,6766,7078,5289,225.0,,377,530,,
mean,,2013.804011,69819.51,,,,,,1458.625016,,,5.416719,638271.8
std,,4.044249,56550.55,,,,,,503.916303,,,0.959588,806253.4
min,,1983.0,1.0,,,,,,624.0,,,2.0,29999.0
25%,,2011.0,35000.0,,,,,,1197.0,,,5.0,254999.0
50%,,2015.0,60000.0,,,,,,1248.0,,,5.0,450000.0
75%,,2017.0,98000.0,,,,,,1582.0,,,5.0,675000.0


## Data pre-processing

In [4]:
data = data.dropna() 
data = data.drop(['name','torque'], axis = 1)
data.head()

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,mileage (kmpl),engine (cc),max_power,seats,selling_price
0,2014,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74 bhp,5.0,450000
1,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52 bhp,5.0,370000
2,2006,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78 bhp,5.0,158000
3,2010,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90 bhp,5.0,225000
4,2007,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2 bhp,5.0,130000


In [5]:
def cleaning(text):
    text = re.sub('[a-zA-Z]',' ', text)
    return(float(text))

In [6]:
data['max_power'] = data['max_power'].apply(cleaning)
data['max_power'].describe()

count    7906.000000
mean       91.587374
std        35.747216
min        32.800000
25%        68.050000
50%        82.000000
75%       102.000000
max       400.000000
Name: max_power, dtype: float64

In [7]:
data['mileage (kmpl)'] = data['mileage (kmpl)'].apply(cleaning)
data['mileage (kmpl)'].describe()

count    7906.000000
mean       19.419861
std         4.036263
min         0.000000
25%        16.780000
50%        19.300000
75%        22.320000
max        42.000000
Name: mileage (kmpl), dtype: float64

In [8]:
data = data.dropna(axis = 0)
data.describe(include = 'all')

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,mileage (kmpl),engine (cc),max_power,seats,selling_price
count,7906.0,7906.0,7906,7906,7906,7906,7906.0,7906.0,7906.0,7906.0,7906.0
unique,,,4,3,2,5,,,,,
top,,,Diesel,Individual,Manual,First Owner,,,,,
freq,,,4299,6563,6865,5215,,,,,
mean,2013.983936,69188.66,,,,,19.419861,1458.708829,91.587374,5.416393,649813.7
std,3.863695,56792.3,,,,,4.036263,503.893057,35.747216,0.959208,813582.7
min,1994.0,1.0,,,,,0.0,624.0,32.8,2.0,29999.0
25%,2012.0,35000.0,,,,,16.78,1197.0,68.05,5.0,270000.0
50%,2015.0,60000.0,,,,,19.3,1248.0,82.0,5.0,450000.0
75%,2017.0,95425.0,,,,,22.32,1582.0,102.0,5.0,690000.0


## Applying ML algorithms

In [9]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

CT = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [2,3,4,5])], remainder = 'passthrough')
X_train = CT.fit_transform(X_train)
X_test = CT.transform(X_test)

### Scaling the data

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

SS = StandardScaler()
X_train = SS.fit_transform(X_train)
X_test = SS.transform(X_test)

### Decision Tree model

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

DC = DecisionTreeRegressor()
DC.fit(X_train,y_train)
y_pred = DC.predict(X_test)
DCK = cross_val_score(DC, X_train, y_train, cv = 10)
print("Accuracy for Decision Tree Classifier: {:.2f} % ".format((DCK.mean())*100))
print("Standard Deviation for Decision Tree Classifier: {:.2f} %".format((DCK.std())*100))

Accuracy for Decision Tree Classifier: 94.74 % 
Standard Deviation for Decision Tree Classifier: 2.64 %


### Random Forest model

In [14]:
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor()
RF.fit(X_train,y_train)
y_pred = RF.predict(X_test)
RFK = cross_val_score(RF, X_train, y_train, cv = 10)
print("Accuracy for Random Forest Classifier: {:.2f} % ".format((RFK.mean())*100))
print("Standard Deviation for Random Forest Classifier: {:.2f} % ".format((RFK.std())*100))

Accuracy for Random Forest Classifier: 96.97 % 
Standard Deviation for Random Forest Classifier: 0.87 % 
