In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('petrol_consumption.csv')

In [3]:
df.head()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [4]:
df.shape

(48, 5)

In [5]:
df.isnull().sum()

Petrol_tax                      0
Average_income                  0
Paved_Highways                  0
Population_Driver_licence(%)    0
Petrol_Consumption              0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.dtypes

Petrol_tax                      float64
Average_income                    int64
Paved_Highways                    int64
Population_Driver_licence(%)    float64
Petrol_Consumption                int64
dtype: object

In [47]:
corr=df.corr()
corr

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
Petrol_tax,1.0,0.012665,-0.52213,-0.288037,-0.45128
Average_income,0.012665,1.0,0.050163,0.15707,-0.244862
Paved_Highways,-0.52213,0.050163,1.0,-0.064129,0.019042
Population_Driver_licence(%),-0.288037,0.15707,-0.064129,1.0,0.698965
Petrol_Consumption,-0.45128,-0.244862,0.019042,0.698965,1.0


In [8]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(36, 4)
(12, 4)
(36,)
(12,)


#### Building the Model

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [28]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [40]:
def gen_metrics(y_test,ypred):
    mae=mean_absolute_error(y_test,ypred)
    mse=mean_squared_error(y_test,ypred)
    rmse=np.sqrt(mean_squared_error(y_test,ypred))
    r2s=r2_score(y_test,ypred)
    print('MAE',mae)
    print('MSE',mse)
    print('RMSE',rmse)
    print('R2_Score',r2s)

#### Linear Regression

In [54]:
m1=LinearRegression()
m1.fit(x_train,y_train)

LinearRegression()

In [55]:
#R2 Score
print('Training Score',m1.score(x_train,y_train))
print('Testing Score',m1.score(x_test,y_test))

Training Score 0.6796067660574714
Testing Score 0.612841419418205


In [56]:
ypred_m1=m1.predict(x_test)

In [57]:
gen_metrics(y_test,ypred_m1)

MAE 56.42633308378459
MSE 8477.729737454743
RMSE 92.07458790271474
R2_Score 0.612841419418205


#### Decision Tree Regression

In [58]:
m2=DecisionTreeRegressor(criterion='mse',max_depth=9,min_samples_split=10)
m2.fit(x_train,y_train)

DecisionTreeRegressor(max_depth=9, min_samples_split=10)

In [59]:
#R2 Score
print('Training Score',m2.score(x_train,y_train))
print('Testing Score',m2.score(x_test,y_test))

Training Score 0.7458212297336692
Testing Score 0.5070599703836383


In [60]:
ypred_m2=m2.predict(x_test)

In [61]:
gen_metrics(y_test,ypred_m2)

MAE 74.9513888888889
MSE 10794.058449074077
RMSE 103.8944582211875
R2_Score 0.5070599703836383


#### Random Forest Regression

In [62]:
m3=RandomForestRegressor(n_estimators=150,criterion='mse')
m3.fit(x_train,y_train)

RandomForestRegressor(n_estimators=150)

In [63]:
#R2 Score
print('Training Score',m3.score(x_train,y_train))
print('Testing Score',m3.score(x_test,y_test))

Training Score 0.9172277035741397
Testing Score 0.5061817913501112


In [64]:
ypred_m3=m3.predict(x_test)

In [65]:
gen_metrics(y_test,ypred_m3)

MAE 66.8627777777778
MSE 10813.288203703702
RMSE 103.98696170051177
R2_Score 0.5061817913501112
