In [4]:
# Make the project folder the working directory so the relative CSV path used when loading the data resolves.
# NOTE(review): the path looks like a Google Drive mount inside Colab; the mount step is not shown here — confirm it runs before this cell.
%cd "/content/drive/MyDrive/Predicting_CO2_emission_by_vehicles"

/content/drive/MyDrive/Predicting_CO2_emission_by_vehicles


# ***Dataset Description***

Model
- 4WD/4X4 = Four-wheel drive
- AWD = All-wheel drive
- FFV = Flexible-fuel vehicle
- SWB = Short wheelbase
- LWB = Long wheelbase
- EWB = Extended wheelbase

Transmission
- A = Automatic
- AM = Automated manual
- AS = Automatic with select shift
- AV = Continuously variable
- M = Manual
- 3 - 10 = Number of gears

Fuel type
- X = Regular gasoline
- Z = Premium gasoline
- D = Diesel
- E = Ethanol (E85)
- N = Natural gas

**Fuel Consumption**

City and highway fuel consumption ratings are shown in litres per 100 kilometres (L/100 km). The combined rating (55% city, 45% hwy) is shown in L/100 km and in miles per gallon (mpg).

**CO2 Emissions**

The tailpipe emissions of carbon dioxide (in grams per kilometre) for combined city and highway driving

# Importing Libraries

In [19]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
pd.options.plotting.backend = "plotly"

import textwrap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, f1_score, recall_score, roc_auc_score, classification_report

# Loading Dataset

In [5]:
# Read the emissions table from the current working directory and preview
# the first rows to confirm the columns parsed as expected.
csv_name = 'CO2_Emissions_Canada.csv'
df = pd.read_csv(csv_name)
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


## Basic Data Exploration

In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(7385, 12)

In [8]:
# Storage dtype of every column; the numeric/object distinction drives the column split further down.
df.dtypes

Make                                 object
Model                                object
Vehicle Class                        object
Engine Size(L)                      float64
Cylinders                             int64
Transmission                         object
Fuel Type                            object
Fuel Consumption City (L/100 km)    float64
Fuel Consumption Hwy (L/100 km)     float64
Fuel Consumption Comb (L/100 km)    float64
Fuel Consumption Comb (mpg)           int64
CO2 Emissions(g/km)                   int64
dtype: object

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
count,7385.0,7385.0,7385.0,7385.0,7385.0,7385.0,7385.0
mean,3.160068,5.61503,12.556534,9.041706,10.975071,27.481652,250.584699
std,1.35417,1.828307,3.500274,2.224456,2.892506,7.231879,58.512679
min,0.9,3.0,4.2,4.0,4.1,11.0,96.0
25%,2.0,4.0,10.1,7.5,8.9,22.0,208.0
50%,3.0,6.0,12.1,8.7,10.6,27.0,246.0
75%,3.7,6.0,14.6,10.2,12.6,32.0,288.0
max,8.4,16.0,30.6,20.6,26.1,69.0,522.0


In [7]:
# Count of missing values in each column.
df.isna().sum()

Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
dtype: int64

In [10]:
# Partition the column names by storage dtype: 64-bit ints/floats are treated
# as numerical features, object-typed columns as categorical ones.
numeric_dtypes = ['int64', 'float64']
numerical_cols = [name for name, col_dtype in df.dtypes.items() if col_dtype in numeric_dtypes]
categorical_cols = [name for name, col_dtype in df.dtypes.items() if col_dtype == "object"]

print('Numerical Columns: ', numerical_cols)
print('Categorical Columns: ', categorical_cols)

Numerical Columns:  ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)', 'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)', 'Fuel Consumption Comb (mpg)', 'CO2 Emissions(g/km)']
Categorical Columns:  ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']


In [11]:
# Number of rows for each cylinder count, most frequent first.
df['Cylinders'].value_counts()

4     3220
6     2446
8     1402
12     151
3       95
10      42
5       26
16       3
Name: Cylinders, dtype: int64

**Transmission**
- A = Automatic
- AM = Automated manual
- AS = Automatic with select shift
- AV = Continuously variable
- M = Manual
- 3 - 10 = Number of gears

In [12]:
# Number of rows for each transmission code (type letters plus optional gear count), most frequent first.
df['Transmission'].value_counts()

AS6     1324
AS8     1211
M6       901
A6       789
A8       490
AM7      445
A9       339
AS7      319
AV       295
M5       193
AS10     168
AM6      132
AV7      118
AV6      113
M7        91
A5        84
AS9       77
A4        65
AM8       62
A7        53
AV8       39
A10       31
AS5       26
AV10      11
AM5        4
AM9        3
AS4        2
Name: Transmission, dtype: int64

**Observation**

- We can separate the number of gears from the transmission code and use it as a separate feature.

In [15]:
# Number of rows per manufacturer, most frequent first.
df['Make'].value_counts()

FORD             628
CHEVROLET        588
BMW              527
MERCEDES-BENZ    419
PORSCHE          376
TOYOTA           330
GMC              328
AUDI             286
NISSAN           259
JEEP             251
DODGE            246
KIA              231
HONDA            214
HYUNDAI          210
MINI             204
VOLKSWAGEN       197
MAZDA            180
LEXUS            178
JAGUAR           160
CADILLAC         158
SUBARU           140
VOLVO            124
INFINITI         108
BUICK            103
RAM               97
LINCOLN           96
MITSUBISHI        95
CHRYSLER          88
LAND ROVER        85
FIAT              73
ACURA             72
MASERATI          61
ROLLS-ROYCE       50
ASTON MARTIN      47
BENTLEY           46
LAMBORGHINI       41
ALFA ROMEO        30
GENESIS           25
SCION             22
SMART              7
BUGATTI            3
SRT                2
Name: Make, dtype: int64

In [16]:
# Number of rows per model name, most frequent first.
df['Model'].value_counts()

F-150 FFV 4X4           32
F-150 FFV               32
MUSTANG                 27
FOCUS FFV               24
SONIC                   20
                        ..
AVENTADOR S ROADSTER     1
HURACAN AWD              1
HURACAN SPYDER AWD       1
LS 500                   1
XC40 T4 AWD              1
Name: Model, Length: 2053, dtype: int64

**Fuel Type**

- **X** = Regular gasoline
- **Z** = Premium gasoline
- **D** = Diesel
- **E** = Ethanol (E85)
- **N** = Natural gas

In [21]:
# Number of rows per fuel-type code, most frequent first.
df['Fuel Type'].value_counts()

X    3637
Z    3202
E     370
D     175
N       1
Name: Fuel Type, dtype: int64

***Observations***

- Most of the object columns have more than 10 unique values, so One Hot Encoding is impractical here: it would greatly increase the dimensionality of the data.

## Data Viz

In [59]:
# Histogram grid: one subplot per numerical column, laid out on a 2 x 4 grid.
# The traces are generated by looping over `numerical_cols` (the same list that
# supplies the subplot titles), so every histogram is guaranteed to sit under
# its matching title instead of relying on seven hand-ordered add_trace calls.
GRID_COLS = 4
fig = make_subplots(rows=2, cols=GRID_COLS, subplot_titles=list(numerical_cols))

for position, column in enumerate(numerical_cols):
    # divmod converts the running index into a zero-based (row, col) grid cell.
    grid_row, grid_col = divmod(position, GRID_COLS)
    fig.add_trace(
        go.Histogram(x=df[column], name=column),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Update layout
fig.update_layout(title='Individual Displots of Seven Columns', showlegend=False)
fig.update_xaxes(title_text='Value', row=2, col=1)
# go.Histogram plots raw bin counts unless `histnorm` is set, so the axis is
# labelled "Count"; the previous "Probability Density" label was incorrect.
fig.update_yaxes(title_text='Count', row=1, col=1)

# Show plot
fig.show()


In [32]:
# Box-plot grid: one horizontal box per numerical column on a 2 x 4 layout.
# Each entry pairs the dataframe column with the (shortened) trace name that is
# shown next to its box.
box_specs = [
    ('Engine Size(L)', 'Engine Size(L)'),
    ('Cylinders', 'Cylinders'),
    ('Fuel Consumption City (L/100 km)', 'FC City (L/100 km)'),
    ('Fuel Consumption Hwy (L/100 km)', 'FC Hwy (L/100 km)'),
    ('Fuel Consumption Comb (L/100 km)', 'FC Comb (L/100 km)'),
    ('Fuel Consumption Comb (mpg)', 'FC Comb (mpg)'),
    ('CO2 Emissions(g/km)', 'CO2 Emissions(g/km)'),
]

fig = make_subplots(rows=2, cols=4, subplot_titles=df[numerical_cols].columns)

for slot, (source_col, trace_name) in enumerate(box_specs):
    # Fill the grid left-to-right, top-to-bottom (four plots per row).
    row_idx, col_idx = divmod(slot, 4)
    fig.add_trace(go.Box(x=df[source_col], name=trace_name), row=row_idx + 1, col=col_idx + 1)

# Title the figure and drop the legend (each box is already named on its axis).
fig.update_layout(title='Individual Box Plots of Seven Columns', showlegend=False)

# Turn the y-axis tick labels by 90 degrees; as the cell's last expression this
# call also returns the figure, which is what renders it.
fig.update_yaxes(tickangle=90)

In [29]:
# Pie chart of how the rows are distributed across fuel types.
#
# The slice names are derived from the fuel-type codes themselves rather than
# from a positional list: value_counts() orders codes by frequency, so a
# hard-coded label list only lines up with the counts by coincidence and would
# silently mislabel slices if the frequency ranking ever changed.
fuel_type_names = {
    'X': 'Regular gasoline',
    'Z': 'Premium gasoline',
    'E': 'Ethanol (E85)',
    'D': 'Diesel',
    'N': 'Natural Gas',
}

counts = df['Fuel Type'].value_counts()

# Fall back to the raw code for any fuel type missing from the mapping.
labels = [fuel_type_names.get(code, code) for code in counts.index]

fig = px.pie(values=counts, names=labels, title='Distribution of Fuel Types')
fig.show()

In [52]:
# Scatter matrix of the fuel-consumption measures and CO2 emissions.
#
# The full column names and their short axis labels are declared once as a
# single mapping, so the `dimensions` and `labels` arguments cannot drift apart
# (previously the five-column list was written out twice).
scat_labels = {
    'Fuel Consumption City (L/100 km)': 'City L/100 km',
    'Fuel Consumption Hwy (L/100 km)': 'Hwy L/100 km',
    'Fuel Consumption Comb (L/100 km)': 'Comb L/100 km',
    'Fuel Consumption Comb (mpg)': 'Comb mpg',
    'CO2 Emissions(g/km)': 'CO2 g/km',
}
# Kept under its original name in case later cells refer to the short labels.
scat = list(scat_labels.values())

fig = px.scatter_matrix(
    df,
    dimensions=list(scat_labels),
    title="Scatter matrix of Fuel Consumption",
    labels=scat_labels,
)
# Hide the diagonal panels (a variable plotted against itself); this call used
# to be issued twice, once is sufficient.
fig.update_traces(diagonal_visible=False)
fig.update_layout(height=800)
fig.show()

In [53]:
# Pairwise correlations between the numerical columns (DataFrame.corr default
# method), rounded to two decimals so the cell annotations stay readable.
corr_matrix = df[numerical_cols].corr().round(2)

# The same feature names label both axes of the square matrix.
x_names = corr_matrix.columns.tolist()
y_names = corr_matrix.columns.tolist()

fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=x_names,
    y=y_names,
    colorscale='Viridis',
    showscale=True,  # draw the colour bar next to the heatmap
)

heatmap_layout = dict(
    title='Pearson Correlation Heatmap',
    xaxis=dict(title='Features'),
    yaxis=dict(title='Features'),
    font=dict(size=10),  # smaller font for the labels and annotations
    margin=dict(t=100, r=100),  # extra top/right margin to prevent overlapping
)
fig.update_layout(**heatmap_layout)

fig.show()
