# Install

In [1]:
%cd "/content/drive/MyDrive/Predicting_CO2_emission_by_vehicles"

/content/drive/MyDrive/Predicting_CO2_emission_by_vehicles


In [2]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m41.0/81.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


# **Dataset Description**

Model
- 4WD/4X4 = Four-wheel drive
- AWD = All-wheel drive
- FFV = Flexible-fuel vehicle
- SWB = Short wheelbase
- LWB = Long wheelbase
- EWB = Extended wheelbase

Transmission
- A = Automatic
- AM = Automated manual
- AS = Automatic with select shift
- AV = Continuously variable
- M = Manual
- 3 - 10 = Number of gears

Fuel type
- X = Regular gasoline
- Z = Premium gasoline
- D = Diesel
- E = Ethanol (E85)
- N = Natural gas

**Fuel Consumption**

City and highway fuel consumption ratings are shown in litres per 100 kilometres (L/100 km) - the combined rating (55% city, 45% hwy) is shown in L/100 km and in miles per gallon (mpg)

**CO2 Emissions**

The tailpipe emissions of carbon dioxide (in grams per kilometre) for combined city and highway driving

# Importing Libraries

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
pd.options.plotting.backend = "matplotlib"
import category_encoders as ce
from statsmodels.stats.outliers_influence import variance_inflation_factor

import textwrap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Loading Dataset

In [4]:
df = pd.read_csv('CO2_Emissions_Canada.csv')
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


# Basic Data Exploration

In [5]:
df.shape

(7385, 12)

In [6]:
df.dtypes

Make                                 object
Model                                object
Vehicle Class                        object
Engine Size(L)                      float64
Cylinders                             int64
Transmission                         object
Fuel Type                            object
Fuel Consumption City (L/100 km)    float64
Fuel Consumption Hwy (L/100 km)     float64
Fuel Consumption Comb (L/100 km)    float64
Fuel Consumption Comb (mpg)           int64
CO2 Emissions(g/km)                   int64
dtype: object

In [7]:
df.describe()

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
count,7385.0,7385.0,7385.0,7385.0,7385.0,7385.0,7385.0
mean,3.160068,5.61503,12.556534,9.041706,10.975071,27.481652,250.584699
std,1.35417,1.828307,3.500274,2.224456,2.892506,7.231879,58.512679
min,0.9,3.0,4.2,4.0,4.1,11.0,96.0
25%,2.0,4.0,10.1,7.5,8.9,22.0,208.0
50%,3.0,6.0,12.1,8.7,10.6,27.0,246.0
75%,3.7,6.0,14.6,10.2,12.6,32.0,288.0
max,8.4,16.0,30.6,20.6,26.1,69.0,522.0


In [8]:
df['CO2 Emissions(g/km)'].describe()

count    7385.000000
mean      250.584699
std        58.512679
min        96.000000
25%       208.000000
50%       246.000000
75%       288.000000
max       522.000000
Name: CO2 Emissions(g/km), dtype: float64

In [9]:
df.isna().sum()

Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
dtype: int64

In [10]:
numerical_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in df.columns if df[cname].dtype == "object"]

print('Numerical Columns: ', numerical_cols)
print('Categorical Columns: ', categorical_cols)

Numerical Columns:  ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)', 'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)', 'Fuel Consumption Comb (mpg)', 'CO2 Emissions(g/km)']
Categorical Columns:  ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']


In [11]:
df['Cylinders'].value_counts()

4     3220
6     2446
8     1402
12     151
3       95
10      42
5       26
16       3
Name: Cylinders, dtype: int64

**Transmission**
- A = Automatic
- AM = Automated manual
- AS = Automatic with select shift
- AV = Continuously variable
- M = Manual
- 3 - 10 = Number of gears

In [12]:
df['Transmission'].value_counts()

AS6     1324
AS8     1211
M6       901
A6       789
A8       490
AM7      445
A9       339
AS7      319
AV       295
M5       193
AS10     168
AM6      132
AV7      118
AV6      113
M7        91
A5        84
AS9       77
A4        65
AM8       62
A7        53
AV8       39
A10       31
AS5       26
AV10      11
AM5        4
AM9        3
AS4        2
Name: Transmission, dtype: int64

**Observation**

- We can seperate the gears numbers to make a seperate feature.

In [13]:
df.Make.value_counts()

FORD             628
CHEVROLET        588
BMW              527
MERCEDES-BENZ    419
PORSCHE          376
TOYOTA           330
GMC              328
AUDI             286
NISSAN           259
JEEP             251
DODGE            246
KIA              231
HONDA            214
HYUNDAI          210
MINI             204
VOLKSWAGEN       197
MAZDA            180
LEXUS            178
JAGUAR           160
CADILLAC         158
SUBARU           140
VOLVO            124
INFINITI         108
BUICK            103
RAM               97
LINCOLN           96
MITSUBISHI        95
CHRYSLER          88
LAND ROVER        85
FIAT              73
ACURA             72
MASERATI          61
ROLLS-ROYCE       50
ASTON MARTIN      47
BENTLEY           46
LAMBORGHINI       41
ALFA ROMEO        30
GENESIS           25
SCION             22
SMART              7
BUGATTI            3
SRT                2
Name: Make, dtype: int64

In [14]:
df.Model.value_counts()

F-150 FFV 4X4           32
F-150 FFV               32
MUSTANG                 27
FOCUS FFV               24
SONIC                   20
                        ..
AVENTADOR S ROADSTER     1
HURACAN AWD              1
HURACAN SPYDER AWD       1
LS 500                   1
XC40 T4 AWD              1
Name: Model, Length: 2053, dtype: int64

**Fuel Type**

- **X** = Regular gasoline
- **Z** = Premium gasoline
- **D** = Diesel
- **E** = Ethanol (E85)
- **N** = Natural gas

In [15]:
df['Fuel Type'].value_counts()

X    3637
Z    3202
E     370
D     175
N       1
Name: Fuel Type, dtype: int64

In [16]:
df['Vehicle Class'].value_counts()

SUV - SMALL                 1217
MID-SIZE                    1133
COMPACT                     1022
SUV - STANDARD               735
FULL-SIZE                    639
SUBCOMPACT                   606
PICKUP TRUCK - STANDARD      538
TWO-SEATER                   460
MINICOMPACT                  326
STATION WAGON - SMALL        252
PICKUP TRUCK - SMALL         159
MINIVAN                       80
SPECIAL PURPOSE VEHICLE       77
VAN - PASSENGER               66
STATION WAGON - MID-SIZE      53
VAN - CARGO                   22
Name: Vehicle Class, dtype: int64

***Observations***

- Most of the object columns has more than 10 unique values. So, we can't use One Hot Encoding which will increase the dimension of this data.

# Feature Extraction

In [17]:
df['NumOfGears'] = df['Transmission'].str.extract(r'(\d+)')
df['Transmission_type'] = df['Transmission'].str.replace(r'\d+', '')
df.head()

  df['Transmission_type'] = df['Transmission'].str.replace(r'\d+', '')


Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km),NumOfGears,Transmission_type
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196,5,AS
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221,6,M
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136,7,AV
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255,6,AS
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244,6,AS


In [18]:
df.Transmission_type.value_counts()

AS    3127
A     1851
M     1185
AM     646
AV     576
Name: Transmission_type, dtype: int64

In [19]:
df.NumOfGears.value_counts()

6     3259
8     1802
7     1026
9      419
5      307
10     210
4       67
Name: NumOfGears, dtype: int64

# Data Viz

In [20]:
fig = make_subplots(rows=2, cols=4, subplot_titles=df[numerical_cols].columns)

fig.add_trace(go.Histogram(x=df['Engine Size(L)'], name='Engine Size(L)'), row=1, col=1)
fig.add_trace(go.Histogram(x=df['Cylinders'], name='Cylinders'), row=1, col=2)
fig.add_trace(go.Histogram(x=df['Fuel Consumption City (L/100 km)'], name='Fuel Consumption City (L/100 km)'), row=1, col=3)
fig.add_trace(go.Histogram(x=df['Fuel Consumption Hwy (L/100 km)'], name='Fuel Consumption Hwy (L/100 km)'), row=1, col=4)
fig.add_trace(go.Histogram(x=df['Fuel Consumption Comb (L/100 km)'], name='Fuel Consumption Comb (L/100 km)'), row=2, col=1)
fig.add_trace(go.Histogram(x=df['Fuel Consumption Comb (mpg)'], name='Fuel Consumption Comb (mpg)'), row=2, col=2)
fig.add_trace(go.Histogram(x=df['CO2 Emissions(g/km)'], name='CO2 Emissions(g/km)'), row=2, col=3)

# Update layout
fig.update_layout(title='Individual Displots of Seven Columns', showlegend=False)
fig.update_xaxes(title_text='Value', row=2, col=2)
fig.update_yaxes(title_text='Probability Density', row=1, col=1)

# Show plot
fig.show()


In [21]:
fig = make_subplots(rows=2, cols=4, subplot_titles=df[numerical_cols].columns)

fig.add_trace(go.Box(x=df['Engine Size(L)'], name='Engine Size(L)'), row=1, col=1)
fig.add_trace(go.Box(x=df['Cylinders'], name='Cylinders'), row=1, col=2)
fig.add_trace(go.Box(x=df['Fuel Consumption City (L/100 km)'], name='FC City (L/100 km)'), row=1, col=3)
fig.add_trace(go.Box(x=df['Fuel Consumption Hwy (L/100 km)'], name='FC Hwy (L/100 km)'), row=1, col=4)
fig.add_trace(go.Box(x=df['Fuel Consumption Comb (L/100 km)'], name='FC Comb (L/100 km)'), row=2, col=1)
fig.add_trace(go.Box(x=df['Fuel Consumption Comb (mpg)'], name='FC Comb (mpg)'), row=2, col=2)
fig.add_trace(go.Box(x=df['CO2 Emissions(g/km)'], name='CO2 Emissions(g/km)'), row=2, col=3)

# Update layout
fig.update_layout(title='Individual Box Plots of Seven Columns', showlegend=False)

# Rotate y-axis labels
fig.update_yaxes(tickangle=90)

In [22]:
counts = df['Fuel Type'].value_counts()

labels = ['Regular gasoline', 'Premium gasoline', 'Ethanol (E85)', 'Diesel', 'Natural Gas']

fig = px.pie(values=counts, names=labels, title='Distribution of Fuel Types')
fig.show()

In [23]:
counts = df['Vehicle Class'].value_counts()

fig = px.pie(values=counts, names=counts.index, title='Distribution of Vehicle Class')
fig.show()

In [24]:
scat = ['City L/100 km', 'Hwy L/100 km', 'Comb L/100 km', 'Comb mpg', 'CO2 g/km']

fig = px.scatter_matrix(df,
    dimensions=['Fuel Consumption City (L/100 km)',
                'Fuel Consumption Hwy (L/100 km)',
                'Fuel Consumption Comb (L/100 km)',
                'Fuel Consumption Comb (mpg)',
                'CO2 Emissions(g/km)'],
    title="Scatter matrix of Fuel Consumption",
    labels=dict(zip(['Fuel Consumption City (L/100 km)',
                     'Fuel Consumption Hwy (L/100 km)',
                     'Fuel Consumption Comb (L/100 km)',
                     'Fuel Consumption Comb (mpg)',
                     'CO2 Emissions(g/km)'], scat))
)
fig.update_traces(diagonal_visible=False)
fig.update_layout(height=800)
fig.update_traces(diagonal_visible=False)
fig.show()

In [25]:
fig = px.scatter(df, x='Fuel Consumption City (L/100 km)', y='CO2 Emissions(g/km)', color='Fuel Type')

label_mapping = {'Z': 'Premium gasoline', 'X': 'Regular gasoline', 'D': 'Diesel', 'E': 'Ethanol (E85)', 'N': 'Natural gas'}

fig.for_each_trace(lambda trace: trace.update(name=label_mapping.get(trace.name, trace.name)))

fig.show()

In [26]:
fig = px.scatter(df, x='Fuel Consumption Hwy (L/100 km)', y='CO2 Emissions(g/km)', color='Fuel Type')

label_mapping = {'Z': 'Premium gasoline', 'X': 'Regular gasoline', 'D': 'Diesel', 'E': 'Ethanol (E85)', 'N': 'Natural gas'}

fig.for_each_trace(lambda trace: trace.update(name=label_mapping.get(trace.name, trace.name)))

fig.show()

- A = Automatic
- AM = Automated manual
- AS = Automatic with select shift
- AV = Continuously variable
- M = Manual

In [27]:
fig = px.scatter(df, x='NumOfGears', y='CO2 Emissions(g/km)', color='Transmission')

label_mapping = {'AS': 'Automatic with select shift', 'M': 'Manual', 'AV': 'Continuously variable', 'AM': 'Automated manual', 'A': 'Automatic'}

fig.for_each_trace(lambda trace: trace.update(name=label_mapping.get(trace.name, trace.name)))

fig.show()

In [28]:
corr_matrix = df[numerical_cols].corr().round(2)

fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.columns.tolist(),
    colorscale='Viridis',
    showscale=True  # Show color scale
)

fig.update_layout(
    title='Pearson Correlation Heatmap',
    xaxis=dict(title='Features'),
    yaxis=dict(title='Features'),
    font=dict(size=10),  # Adjust font size
    margin=dict(t=100, r=100),  # Adjust margins to prevent overlapping
)

fig.show()


# Data Preprocessing

***Data Spliting***

In [29]:
X = df.drop(columns=['CO2 Emissions(g/km)'])
y = df['CO2 Emissions(g/km)']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_train_og, X_test_og, y_train_og, y_test_og = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()

In [32]:
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)

X_train shape (5908, 13)
X_test shape (1477, 13)
y_train shape (5908,)
y_test shape (1477,)


**Categorical Encoding**

- James-Stein Encoder is used

In [33]:
cont_cols = ['Engine Size(L)',
             'Fuel Consumption City (L/100 km)',
             'Fuel Consumption Hwy (L/100 km)',
             'Fuel Consumption Comb (L/100 km)',
             'Fuel Consumption Comb (mpg)',
            ]

In [34]:
categorical_cols

['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']

In [35]:
# Target Encoder
encoder = ce.JamesSteinEncoder(sigma=0.1)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', encoder, ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type', 'Transmission_type']),
        ('num', StandardScaler(), cont_cols)
    ])

# Model

## Linear Regression

In [36]:
lin_reg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lin_reg', LinearRegression())
])

scores = -1 * cross_val_score(lin_reg_pipe, X_train, y_train,
                              cv=10,
                              scoring='neg_mean_absolute_error')

print("Mean MAE scores:\n", scores)

Mean MAE scores:
 [11.77767115 12.38450585 13.75364898 12.37496404 14.02555611 13.52263942
 12.94275962 13.40285095 11.39882192 12.5386245 ]


In [37]:
lin_reg = lin_reg_pipe.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

print("R2 score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))

R2 score:  0.8900786557643702
Mean Absolute Error:  12.300258482008903


## Ridge Regression

In [38]:
def get_score(alpha):
    """Return the average MAE over 10 CV folds of Ridge Regression model.

    Keyword argument:
    alpha -- Regularization parameter for Ridge Regression
    """
    RidgeCV_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('Ridge_reg', Ridge(alpha))
    ])

    scores = -1 * cross_val_score(RidgeCV_pipe, X_train, y_train,
                              cv=10,
                              scoring='neg_mean_absolute_error')
    return scores.mean().round(5)

In [39]:
alpha_list = [1e-3, 1e-2, 1e-1, 1, 10]

results = {}
for alpha in alpha_list:
    results[alpha] = get_score(alpha)

In [40]:
results

{0.001: 12.8122, 0.01: 12.81218, 0.1: 12.81202, 1: 12.81225, 10: 12.82132}

In [41]:
RidgeCV_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('Ridge_reg', Ridge(alpha=0.1))
])

ridge_reg = RidgeCV_pipe.fit(X_train, y_train)
y_pred = ridge_reg.predict(X_test)


print("R2 score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))

R2 score:  0.8900883517544558
Mean Absolute Error:  12.300670211305185


## SGD Regression

In [42]:
SGD_reg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('sgd_reg', SGDRegressor(loss="huber", max_iter=1000, shuffle=True, learning_rate='optimal', alpha=0.1))
])

scores = -1 * cross_val_score(SGD_reg_pipe, X_train, y_train,
                              cv=10,
                              scoring='neg_mean_absolute_error')

print("Mean MAE scores:\n", scores)

Mean MAE scores:
 [14.34846454 15.06027504 16.84757829 14.68732519 16.57680916 17.16268252
 15.9751468  16.60161861 13.66715473 14.88972289]


In [43]:
SGD_reg = SGD_reg_pipe.fit(X_train, y_train)
y_pred = SGD_reg.predict(X_test)

print("R2 score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))

R2 score:  0.8198000765214826
Mean Absolute Error:  15.176316885034236


## SVM

In [44]:
svr_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svr', SVR(kernel='rbf'))
])

scores = -1 * cross_val_score(svr_pipe, X_train, y_train,
                              cv=10,
                              scoring='neg_mean_absolute_error')

print("Mean MAE scores:\n", scores)

Mean MAE scores:
 [14.28388734 15.42914423 17.35678361 15.262201   16.75039219 17.19963
 16.62630967 16.32819777 14.02869992 15.75531837]


In [45]:
sv_reg = svr_pipe.fit(X_train, y_train)
y_pred = sv_reg.predict(X_test)

print("R2 score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))

R2 score:  0.822457657871754
Mean Absolute Error:  15.222515396138881


## Random Forest

In [46]:
encoder = ce.JamesSteinEncoder(sigma=0.1)
X_train_rf = encoder.fit_transform(X_train_og, y_train_og)
X_test_rf = encoder.transform(X_test_og)

In [47]:
rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, scoring='r2', n_iter=100, cv=5, verbose=2, random_state=42)
random_search.fit(X_train_rf, y_train_og)
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

*  **Best parameters:** {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}
*  **Best score:**  0.9967127408264066



In [48]:
rf_reg = RandomForestRegressor(n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=20)
rf_reg.fit(X_train_rf, y_train_og)
y_pred = rf_reg.predict(X_test_rf)

print("R2 score: ", r2_score(y_test_og, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test_og, y_pred))

R2 score:  0.9890645688810518
Mean Absolute Error:  2.904938447264603
