# Importing Dataset and Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
plt.rcParams['figure.figsize']=12,8

In [2]:
data=pd.read_csv('cleaned.csv')


In [3]:
data.head()

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


# Splitting Dataset into Numerical and Categorical

In [4]:
num=data.select_dtypes(include=np.number)
cat=data.select_dtypes(include=['object'])


In [5]:
num.columns

Index(['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower',
       'peak-rpm', 'city-mpg', 'highway-mpg', 'price'],
      dtype='object')

In [6]:
cat.columns

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
       'fuel-system'],
      dtype='object')

# Encoding Categorical Data

To check for the number unique values in the categorical dataframe

In [7]:
for col in cat.columns:
    print (col, ':', len(cat[col].unique()))

make : 21
fuel-type : 2
aspiration : 2
num-of-doors : 2
body-style : 5
drive-wheels : 3
engine-location : 2
engine-type : 5
num-of-cylinders : 6
fuel-system : 7


Encoding the categorical variable

In [8]:
cat.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [9]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
oe = OneHotEncoder()
le=LabelEncoder()
for i in list(data.columns):
    if data[i].dtype=='object':
        data[i]=le.fit_transform(data[i])       
    

In [10]:
data.head()

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,0,1,0,1,0,2,0,88.6,168.8,...,130,4,3.47,2.68,9.0,111,5000,21,27,13495
1,3,0,1,0,1,0,2,0,88.6,168.8,...,130,4,3.47,2.68,9.0,111,5000,21,27,16500
2,1,0,1,0,1,2,2,0,94.5,171.2,...,152,4,2.68,3.47,9.0,154,5000,19,26,16500
3,2,1,1,0,0,3,1,0,99.8,176.6,...,109,4,3.19,3.4,10.0,102,5500,24,30,13950
4,2,1,1,0,0,3,0,0,99.4,176.6,...,136,4,3.19,3.4,8.0,115,5500,18,22,17450


Splitting data into X and y variables

In [11]:
y=data['price']

In [12]:
X=data.drop('price', axis=1)

Feature importance

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

In [14]:
model=SelectFromModel(Lasso(alpha=0.1,random_state=0))

In [15]:
model.fit(X,y)

SelectFromModel(estimator=Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False),
        norm_order=1, prefit=False, threshold=None)

In [16]:
model.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

To check the columns selected by our model

In [17]:
cols=X.columns
selected_columns=cols[model.get_support()]

In [18]:
selected_columns

Index(['symboling', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
       'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',
       'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio',
       'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg'],
      dtype='object')

According to our Feature selection model, all our features are important to build the training model.

# Regression Models to Predict Automobile price

We would train our model using 4 different algorithms

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [49]:
models=[]
models.append(('Linear Regression', LinearRegression()))
models.append(('Decision Tree', DecisionTreeRegressor()))
models.append(('Random Forest', RandomForestRegressor()))
models.append(('KNN',KNeighborsRegressor()))



In [51]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import r2_score

In [24]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [50]:
for name, model in models:
    print(name)
    model.fit(X_train,y_train)
    prediction=model.predict(X_test)
    print(f'The R2 score % of {name} is:',round(r2_score(y_test,prediction)*100,2))
    print('\n') 


Linear Regression
The R2 score % of Linear Regression is: 87.21


Decision Tree
The R2 score % of Decision Tree is: 89.66


Random Forest
The R2 score % of Random Forest is: 96.03


KNN
The R2 score % of KNN is: 87.71




Random forest model yielded the hightest r2 score of 96.03%