In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import style
style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv('CO2_Emissions_Canada.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Model                             7385 non-null   object 
 2   Vehicle Class                     7385 non-null   object 
 3   Engine Size(L)                    7385 non-null   float64
 4   Cylinders                         7385 non-null   int64  
 5   Transmission                      7385 non-null   object 
 6   Fuel Type                         7385 non-null   object 
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64  
 11  CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: flo

In [4]:
df.duplicated().sum()

1103

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
cols = ['Vehicle Class', 'Fuel Type', 'Transmission', 'Cylinders', 'Engine Size(L)', 'Fuel Consumption Comb (L/100 km)']
X = df[cols]
Y = df[['CO2 Emissions(g/km)']]

In [19]:
print('Independent Feature set shape : ', X.shape)
print('Dependent Feature shape       : ', Y.shape)

Independent Feature set shape :  (6282, 6)
Dependent Feature shape       :  (6282, 1)


In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [23]:
print('Training data shape   : ', x_train.shape)
print('Training labels shape : ', y_train.shape)
print('Testing data shape    : ', x_test.shape)
print('Testing labels shape  : ', y_test.shape )

Training data shape   :  (5025, 6)
Training labels shape :  (5025, 1)
Testing data shape    :  (1257, 6)
Testing labels shape  :  (1257, 1)


In [25]:
# Scaling 
cols = ['Engine Size(L)', 'Fuel Consumption Comb (L/100 km)']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train[cols])
x_train[cols] = scaler.transform(x_train[cols])
x_test[cols]  = scaler.transform(x_test[cols])

In [27]:
# Encoding
cols = ['Vehicle Class', 'Fuel Type', 'Transmission']
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
x_train[cols] = encoder.fit_transform(x_train[cols])
x_test[cols] = encoder.transform(x_test[cols])

In [29]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [31]:
pred_test  = regressor.predict(x_test)

In [33]:
pred_test

array([[185.00420288],
       [227.93350005],
       [195.56259913],
       ...,
       [253.04582965],
       [205.29434029],
       [290.90034832]])

In [35]:
regressor.score(x_test,y_test)

0.8956723382374651

In [39]:
df

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...
7380,VOLVO,XC40 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,9.4,30,219
7381,VOLVO,XC60 T5 AWD,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,9.9,29,232
7382,VOLVO,XC60 T6 AWD,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,10.3,27,240
7383,VOLVO,XC90 T5 AWD,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,9.9,29,232


In [43]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x_train)
X_poly

array([[ 1.        ,  1.        ,  3.        , ...,  0.12017778,
         0.12290909,  0.12570248],
       [ 1.        ,  0.        ,  4.        , ...,  0.17084444,
         0.13339394,  0.10415289],
       [ 1.        , 11.        ,  4.        , ...,  0.02151111,
         0.04066667,  0.07688017],
       ...,
       [ 1.        ,  2.        ,  3.        , ...,  0.02151111,
         0.028     ,  0.03644628],
       [ 1.        , 10.        ,  4.        , ...,  0.0784    ,
         0.09290909,  0.11010331],
       [ 1.        , 10.        ,  4.        , ...,  0.02151111,
         0.04066667,  0.07688017]])

In [55]:
regressor.fit(X_poly,y_train)

In [57]:
regressor.score(X_poly,y_train)

0.9793032462789657