# Importing libararies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Loading the dataset

In [2]:
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,MSRP
0,2011,335.0,6.0,0,0,2.0,0,0,26,19,46135
1,2011,300.0,6.0,0,0,2.0,0,1,28,19,40650
2,2011,300.0,6.0,0,0,2.0,0,0,28,20,36350
3,2011,230.0,6.0,0,0,2.0,0,0,28,18,29450
4,2011,230.0,6.0,0,0,2.0,0,1,28,18,34500


# Dataset information

### 1) Displaying rows and columns 

In [3]:
data.shape

(11914, 11)

### 2) Displaying the column names

In [4]:
data.columns

Index(['Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type',
       'Driven_Wheels', 'Number of Doors', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'MSRP'],
      dtype='object')

### 3) Displaying data type of each column  

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               11914 non-null  int64  
 1   Engine HP          11845 non-null  float64
 2   Engine Cylinders   11884 non-null  float64
 3   Transmission Type  11914 non-null  int64  
 4   Driven_Wheels      11914 non-null  int64  
 5   Number of Doors    11908 non-null  float64
 6   Vehicle Size       11914 non-null  int64  
 7   Vehicle Style      11914 non-null  int64  
 8   highway MPG        11914 non-null  int64  
 9   city mpg           11914 non-null  int64  
 10  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(8)
memory usage: 1024.0 KB


### 4) displaying the number of null values

In [6]:
data.isnull().sum()

Year                  0
Engine HP            69
Engine Cylinders     30
Transmission Type     0
Driven_Wheels         0
Number of Doors       6
Vehicle Size          0
Vehicle Style         0
highway MPG           0
city mpg              0
MSRP                  0
dtype: int64

### Dropping Null Values

In [7]:
data=data.dropna() #drop rows with atleast a column with missing values


In [8]:
data.isnull().sum()

Year                 0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      0
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
MSRP                 0
dtype: int64

### 5) Some stats on the dataset

In [9]:
data.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,MSRP
count,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0,11815.0
mean,2010.359966,249.480491,5.649513,0.858146,1.152433,3.43267,0.834109,4.570631,26.320609,19.32755,40554.37
std,7.594359,109.203463,1.751433,0.664606,0.965569,0.882853,0.776913,3.467384,7.442674,6.52701,60277.5
min,1990.0,55.0,0.0,0.0,0.0,2.0,0.0,0.0,12.0,7.0,2000.0
25%,2007.0,170.0,4.0,1.0,0.0,2.0,0.0,2.0,22.0,16.0,20990.0
50%,2015.0,227.0,6.0,1.0,1.0,4.0,1.0,4.0,26.0,18.0,29960.0
75%,2016.0,300.0,6.0,1.0,2.0,4.0,1.0,6.0,30.0,22.0,42200.0
max,2017.0,1001.0,16.0,3.0,3.0,4.0,2.0,15.0,354.0,137.0,2065902.0


### 6) Checking for duplicate values

In [10]:
print(data[data.duplicated()])

       Year  Engine HP  Engine Cylinders  Transmission Type  Driven_Wheels  \
14     2013      230.0               6.0                  0              0   
18     1992      172.0               6.0                  0              1   
20     1992      172.0               6.0                  0              1   
24     1993      172.0               6.0                  0              1   
25     1993      172.0               6.0                  0              1   
...     ...        ...               ...                ...            ...   
11481  1998       95.0               4.0                  0              3   
11603  2017      302.0               4.0                  1              2   
11604  2017      240.0               4.0                  1              1   
11708  2008      252.0               6.0                  1              2   
11717  2008      252.0               6.0                  1              1   

       Number of Doors  Vehicle Size  Vehicle Style  highway MP

### 7.) Checking the correlation of attributes with each other

In [11]:
data.corr()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,MSRP
Year,1.0,0.35298,-0.034494,0.316264,0.098368,0.262265,0.086846,-0.063135,0.28021,0.232439,0.227381
Engine HP,0.35298,1.0,0.77974,0.239824,-0.010711,-0.10207,0.397373,-0.12747,-0.414183,-0.465903,0.661833
Engine Cylinders,-0.034494,0.77974,1.0,0.109557,-0.02982,-0.137584,0.485464,0.087225,-0.620316,-0.637879,0.543971
Transmission Type,0.316264,0.239824,0.109557,1.0,0.066674,0.177031,0.140977,-0.053247,0.052832,0.044509,0.24188
Driven_Wheels,0.098368,-0.010711,-0.02982,0.066674,1.0,0.23501,0.082344,0.193519,-0.090888,-0.042024,-0.031782
Number of Doors,0.262265,-0.10207,-0.137584,0.177031,0.23501,1.0,0.246079,0.154347,0.120925,0.136574,-0.127367
Vehicle Size,0.086846,0.397373,0.485464,0.140977,0.082344,0.246079,1.0,0.194891,-0.326246,-0.346916,0.117891
Vehicle Style,-0.063135,-0.12747,0.087225,-0.053247,0.193519,0.154347,0.194891,1.0,-0.366084,-0.23367,-0.206249
highway MPG,0.28021,-0.414183,-0.620316,0.052832,-0.090888,0.120925,-0.326246,-0.366084,1.0,0.847022,-0.198942
city mpg,0.232439,-0.465903,-0.637879,0.044509,-0.042024,0.136574,-0.346916,-0.23367,0.847022,1.0,-0.225277


# Aplying Machine Learning Techniques

In [12]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

### Splitting data into train and test set

In [13]:
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)

### Feature scaling

In [15]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Logistic Regression

In [16]:
# lr = LogisticRegression()
# lr.fit(x_train, y_train)

### Decision tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)

In [20]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [21]:
dt = DecisionTreeClassifier()
dt = dt.fit(x_train, y_train)

In [22]:
y_pred = dt.predict(x_test)

In [23]:
metrics.accuracy_score(y_test, y_pred)

0.12780363944138806

### Linear regression

In [24]:
from sklearn.linear_model import LinearRegression
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression()

In [25]:
y_pred = regressor.predict(x_test)

#### checking accuacy of Linear regression

In [26]:
from sklearn.metrics import mean_squared_error, r2_score
r2score = r2_score(y_test, y_pred)

In [27]:
print(r2score)

0.45048225753653504


## Polynomial regression

In [28]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.model_selection import train_test_split
# x = data.iloc[:, :-1].values
# y = data.iloc[:,-1].values
# x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
# polynomial_features = PolynomialFeatures(degree = 3)

In [29]:
# x_poly = polynomial_features.fit_transform(x_train)

In [30]:
# regressor1 = LinearRegression()
# regressor1.fit(x_poly,y_train)

In [31]:
# y_pred = dt.predict(x_test)
# from sklearn.metrics import mean_squared_error, r2_score
# r2score = r2_score(y_test, y_pred)
# print(r2score)

## kNN

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [33]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)

In [34]:
classifier.fit(x_train, y_train)
KNeighborsClassifier()

KNeighborsClassifier()

In [35]:
y_pred = classifier.predict(sc.transform(x_test))

### accuracy

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
accuracy_score(y_test, y_pred)

0.09352517985611511

## SVM

In [38]:
# from sklearn.svm import SVC

In [39]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# x = data.iloc[:, :-1].values
# y = data.iloc[:,-1].values
# x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
# sc = StandardScaler()
# x_train = sc.fit_transform(x_train)
# x_test = sc.transform(x_test)
# classifier = SVC(kernel = 'linear', random_state = 0)
# classifier.fit(x_train, y_train)

In [40]:
# y_pred = classifier.predict(x_test)

## RANDOM FOREST

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
x = data.iloc[:, :-1].values
y = data.iloc[:,-1].values
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2,random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
classifier = RandomForestClassifier(n_estimators = 10 , criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [42]:
metrics.accuracy_score(y_test, y_pred)

0.09352517985611511