In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

#This script is implemented using Jupyter notebook. 
#Therefore, to draw graphs within the notebook, we have used the command %matplotlib inline. If you are using Spyder, you can remove the last line.

 

In [2]:
petrol_data = pd.read_csv("~/Desktop/Datasets/petrol_data.csv")
#this script reads the dataset and stores it in petrol_datadataframe

In [3]:
petrol_data.head()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [4]:
# the following script divides the data into feature and label set. 
features = petrol_data.iloc[:,0:4].values
labels =petrol_data.iloc[:,4].values

In [5]:
#The next thing is to divide the data into 80% training and 20% test sets:
from sklearn.model_selection import train_test_split
train_features,test_features,train_labels,test_labels = train_test_split(features,labels,test_size = 0.2,random_state = 0)

In [6]:
#If you look at the dataset you can see that it is not scaled properly.The feature Population_Driver_License has values between
#0 and 1 while Average_Income and Paved_Highways has values in thousands.
#Before feeding our data to the algorithm, we need to scale our features.
#To do that we run the following script
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler()
train_features_poly = feature_scaler.fit_transform(train_features)
test_features_poly = feature_scaler.transform(test_features)

In [29]:
train_features_poly

array([[-1.32525691e-01,  1.50577399e+00,  2.25854636e+00,
        -9.15791057e-01],
       [-6.28192647e-01, -1.83367022e+00,  2.12333843e-01,
        -1.12852703e+00],
       [-1.32525691e-01, -1.78897047e+00, -4.54194551e-01,
        -5.25775100e-01],
       [-1.32525691e-01, -8.78212954e-01,  8.77245103e-01,
         4.15208375e-02],
       [-6.28192647e-01,  1.27482526e+00,  1.07480487e+00,
         2.89712810e-01],
       [ 1.35447518e+00, -8.42825648e-01, -1.13743332e+00,
         5.92488356e-02],
       [-6.28192647e-01, -4.00925436e-02, -4.64436395e-01,
         8.21552752e-01],
       [-6.28192647e-01,  7.42153173e-01,  2.93282707e-02,
         2.61208055e+00],
       [-6.28192647e-01,  8.82232011e-04,  1.22196399e+00,
         1.65616824e-01],
       [ 3.63141265e-01, -2.41241442e-01,  4.54996025e-02,
        -2.42127131e-01],
       [ 1.35447518e+00,  1.07926383e+00, -9.04835665e-01,
        -1.16398303e+00],
       [-6.28192647e-01, -2.07716626e-01,  7.28199328e-01,
      

In [7]:
test_features_poly

array([[ 1.35447518, -1.33452296, -0.31161731, -1.48308699],
       [ 0.36314126,  0.15174391, -1.44873146, -0.57895909],
       [ 0.36314126, -1.6194839 , -0.10974518, -0.5080471 ],
       [-0.62819265, -1.26188585,  0.29615525, -1.03988704],
       [ 0.36314126, -2.33654247,  0.19346729,  0.02379284],
       [-0.62819265, -0.78322597,  0.1560037 ,  0.16561682],
       [ 0.36314126, -1.47048471, -0.62291545, -1.58945498],
       [-0.62819265,  0.2448684 , -0.31458205,  0.87473675],
       [ 0.36314126,  1.50577399, -0.98865707, -0.41940711],
       [ 0.36314126,  0.13684399,  0.0357968 , -0.82715107]])

In [8]:
#We have scaled our features down, now it is time to train the algorithm. 
from sklearn.tree import DecisionTreeClassifier
dt_reg = DecisionTreeClassifier()
dt_reg.fit(train_features,train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [9]:
predictions = dt_reg.predict(test_features)

In [10]:
predictions

array([591, 540, 574, 547, 574, 508, 591, 968, 460, 464])

In [12]:
comparison=pd.DataFrame({'Predictions':predictions, 'Real':test_labels, })

In [13]:
print(comparison)

Predictions  Real
0          591   534
1          540   410
2          574   577
3          547   571
4          574   577
5          508   704
6          591   487
7          968   587
8          460   467
9          464   580


In [14]:
test_labels

array([534, 410, 577, 571, 577, 704, 487, 587, 467, 580])

In [15]:
from sklearn import metrics 

In [16]:
print('MAE:', metrics.mean_absolute_error(test_labels, predictions))
print('MSE:', metrics.mean_squared_error(test_labels, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(test_labels, predictions)))

MAE: 102.1
MSE: 22864.1
RMSE: 151.20879604044202


In [17]:
#The value of MAE is 79, which is greater than MAE value calculated for Polynomial Regression algorithm in the last chapter. Similarly, the value of RMSE in the case of decision tree is 118 which
#MAE:84.2
#MSE:18823.4
#RMSE: 137.19