# Vehicle CO<sub>2</sub> emissions estimator

In this notebook an ML model will be created to estimate the CO<sub>2</sub> grams per km of each car

In [3]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
pd.set_option("display.max_columns", None)

In [4]:
data = pd.read_parquet(r"../files/vehicle_data_prepared.parquet")

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29111 entries, 0 to 29110
Data columns (total 30 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   make                                     29111 non-null  object 
 1   basemodel                                29111 non-null  object 
 2   model                                    29111 non-null  object 
 3   year                                     29111 non-null  int64  
 4   vehicle_size_class                       29111 non-null  object 
 5   cylinders                                29111 non-null  float64
 6   engine_displacement_liters               29111 non-null  float64
 7   t_charger                                29111 non-null  bool   
 8   s_charger                                29111 non-null  bool   
 9   electric_motor                           29111 non-null  object 
 10  transmission                             29111

In [6]:
data.head()

Unnamed: 0,make,basemodel,model,year,vehicle_size_class,cylinders,engine_displacement_liters,t_charger,s_charger,electric_motor,transmission,drive,start_stop,fuel_type,fuel_type1,fuel_type2,phev_blended,city_electricity_consumption_kwhpkm,city_kmpl_for_fuel_type1,city_kmpl_for_fuel_type2,highway_electricity_consumption_kwhpkm,highway_kmpl_for_fuel_type1,highway_kmpl_for_fuel_type2,combined_electricity_consumption_kwhpkm,combined_kmpl_for_fuel_type1,combined_kmpl_for_fuel_type2,hours_to_charge_at_120v,hours_to_charge_at_240v,co2_tailpipe_for_fuel_type1_gpkm,co2_tailpipe_for_fuel_type2_gpkm
0,Kia,Sedona,Sedona,2017,Minivan - 2WD,6.0,3.3,False,False,,Automatic (S6),Front-Wheel Drive,False,Regular,Regular Gasoline,,False,0.0,6.372108,0.0,0.0,8.496144,0.0,0.0,7.08012,0.0,0.0,0.0,275.268122,0.0
1,Genesis,G80,G80 AWD,2017,Large Cars,6.0,3.8,False,False,,Automatic (S8),All-Wheel Drive,False,Regular,Regular Gasoline,,False,0.0,5.664096,0.0,0.0,8.85015,0.0,0.0,6.726114,0.0,0.0,0.0,285.210086,0.0
2,Roush Performance,F150,F150 Pickup 2WD,2016,Standard Pickup Trucks 2WD,8.0,5.0,False,True,,Automatic (S6),Rear-Wheel Drive,False,Premium,Premium Gasoline,,False,0.0,4.248072,0.0,0.0,5.31009,0.0,0.0,4.602078,0.0,0.0,0.0,398.299924,0.0
3,Infiniti,Q50,Q50 AWD,2016,Midsize Cars,6.0,3.0,True,False,,Automatic (S7),All-Wheel Drive,False,Premium,Premium Gasoline,,False,0.0,6.726114,0.0,0.0,9.558162,0.0,0.0,7.788132,0.0,0.0,0.0,252.277331,0.0
4,Porsche,Macan,Macan Turbo,2017,Small Sport Utility Vehicle 4WD,6.0,3.6,True,False,,Automatic (AM-S7),All-Wheel Drive,True,Premium,Premium Gasoline,,False,0.0,6.018102,0.0,0.0,8.142138,0.0,0.0,6.726114,0.0,0.0,0.0,290.181068,0.0


## Feature Selection

In this case, mutual information algorithm will be used to select the features for the ML model.

Mutual information is based on the entropy reduction/uncertainty of the target knowing the feature. It does not measure covariance/correlation between features 

All the columns in X are either ordinal (well establish numbers) or categorical

In [7]:
X = [
    "make",
    "basemodel",
    "year",
    "vehicle_size_class",
    "cylinders",
    "engine_displacement_liters",
    "t_charger",
    "s_charger",
    "electric_motor",
    "transmission",
    "drive",
    "start_stop",
    "fuel_type",
    "fuel_type1",
    "fuel_type2",
    "phev_blended"
     ]

In [8]:
X_data = data[X]
Y = data["co2_tailpipe_for_fuel_type1_gpkm"]

In [9]:
x_cat_index = [X_data.columns.tolist().index(x) for x in X_data.columns]

In [10]:
encoder = OrdinalEncoder()
x_cat_encoded = encoder.fit_transform(X_data) #only categorical and ordinal variables

In [11]:
X_cat_encoded = pd.DataFrame(x_cat_encoded, columns = X_data.columns)

In [12]:
mutual_info_score = mutual_info_regression(X=x_cat_encoded,y=Y, discrete_features=x_cat_index) #discrete features must be the index of the columns that are discrete
#in this case, all the X values are discrete
mutual_info_score = pd.DataFrame(mutual_info_score, index = X_data.columns.tolist())

In [13]:
X_cat_encoded["co2_tailpipe_gpkm"] = Y

In [14]:
X_cat_encoded.head()

Unnamed: 0,make,basemodel,year,vehicle_size_class,cylinders,engine_displacement_liters,t_charger,s_charger,electric_motor,transmission,drive,start_stop,fuel_type,fuel_type1,fuel_type2,phev_blended,co2_tailpipe_gpkm
0,33.0,751.0,17.0,5.0,5.0,25.0,0.0,0.0,391.0,19.0,4.0,0.0,12.0,6.0,3.0,0.0,275.268122
1,22.0,390.0,17.0,1.0,5.0,30.0,0.0,0.0,391.0,21.0,3.0,0.0,12.0,6.0,3.0,0.0,285.210086
2,66.0,349.0,16.0,18.0,6.0,42.0,0.0,1.0,391.0,19.0,6.0,0.0,8.0,5.0,3.0,0.0,398.299924
3,27.0,631.0,16.0,2.0,5.0,22.0,1.0,0.0,391.0,20.0,3.0,0.0,8.0,5.0,3.0,0.0,252.277331
4,60.0,535.0,17.0,10.0,5.0,28.0,1.0,0.0,391.0,4.0,3.0,1.0,8.0,5.0,3.0,0.0,290.181068


In [15]:
mutual_info_score.rename(columns = {0:"mutual_info"}, inplace=True)
mutual_info_score.sort_values("mutual_info", ascending=False)

Unnamed: 0,mutual_info
basemodel,1.934201
engine_displacement_liters,1.223996
transmission,0.885302
make,0.786948
cylinders,0.742397
vehicle_size_class,0.688567
year,0.684961
drive,0.478261
fuel_type,0.347656
electric_motor,0.311798


In [16]:
selected_features = mutual_info_score.query("mutual_info > 0.2").sort_values("mutual_info", ascending=False)
selected_features

Unnamed: 0,mutual_info
basemodel,1.934201
engine_displacement_liters,1.223996
transmission,0.885302
make,0.786948
cylinders,0.742397
vehicle_size_class,0.688567
year,0.684961
drive,0.478261
fuel_type,0.347656
electric_motor,0.311798


In [17]:
selected_features = selected_features.index

In [78]:
pearson_corr = X_cat_encoded.corr(method="pearson", numeric_only=True)["co2_tailpipe_gpkm"].sort_values(ascending=False).reset_index(drop=False)
pearson_corr.query("co2_tailpipe_gpkm >= 0.5 and index != 'co2_tailpipe_gpkm'")

Unnamed: 0,index,co2_tailpipe_gpkm
1,cylinders,0.836443
2,engine_displacement_liters,0.814443
3,electric_motor,0.511839


In [79]:
spearman_corr = X_cat_encoded.corr(method="spearman", numeric_only=True)["co2_tailpipe_gpkm"].sort_values(ascending=False).reset_index(drop=False)
spearman_corr.query("co2_tailpipe_gpkm >= 0.5 and index != 'co2_tailpipe_gpkm'")

Unnamed: 0,index,co2_tailpipe_gpkm
1,engine_displacement_liters,0.857617
2,cylinders,0.830772


## ML Model: Multiple Linear Regression

### Data encoding

In [81]:
X = X_data[selected_features]
Y = Y

In [82]:
num_cols = [x for x in X._get_numeric_data().columns]
cat_cols = [x for x in X.columns if x not in num_cols]
preprocessor = ColumnTransformer(
                                transformers = [
                                                ("cat_cols", OneHotEncoder(handle_unknown="ignore"),cat_cols),#only set the col names
                                                ("num_cols", MinMaxScaler(), num_cols) 
                                                ]
                                )

In [83]:
X_processed = preprocessor.fit_transform(X)

### Cross validation

In [84]:
cross_val_score(
                estimator = LinearRegression(),
                X = X_processed,
                y = Y,
                cv = 5,
                )

array([0.95527286, 0.94893497, 0.95255398, 0.95453295, 0.95417616])

### **Final model construction**

In [85]:
model = Pipeline(steps=[
                ("preprocessing_data", preprocessor),
                ("model", LinearRegression())    
                ]
         )

model.fit(X, Y)

## Deploying the model

In [22]:
import joblib

In [87]:
joblib.dump(model, '../models/co2_emission_estimator.joblib')  # Save model

['../models/co2_emission_estimator.joblib']

## Saving the training dataset

In [88]:
X_data["co2_tailpipe_for_fuel_type1_gpkm"] = Y

X_data.to_parquet("../files/co2_emission_estimator_training_data.parquet", index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_data["co2_tailpipe_for_fuel_type1_gpkm"] = Y


## Testing the deployment

In [23]:
model = joblib.load("../models/co2_emission_estimator.joblib")

In [90]:
selected_features.tolist()

['basemodel',
 'engine_displacement_liters',
 'transmission',
 'make',
 'cylinders',
 'vehicle_size_class',
 'year',
 'drive',
 'fuel_type',
 'electric_motor',
 'fuel_type1',
 'start_stop']

In [91]:
X_data.query("basemodel == 'Prius' and year == 2024")[selected_features]

Unnamed: 0,basemodel,engine_displacement_liters,transmission,make,cylinders,vehicle_size_class,year,drive,fuel_type,electric_motor,fuel_type1,start_stop
6294,Prius,2.0,Automatic (variable gear ratios),Toyota,4.0,Midsize Cars,2024,Front-Wheel Drive,Regular,222V Li-Ion,Regular Gasoline,True
10146,Prius,2.0,Automatic (variable gear ratios),Toyota,4.0,Midsize Cars,2024,Front-Wheel Drive,Regular,222V Li-Ion,Regular Gasoline,True
12599,Prius,2.0,Automatic (variable gear ratios),Toyota,4.0,Midsize Cars,2024,Part-time 4-Wheel Drive,Regular,222V Li-Ion,Regular Gasoline,True
18872,Prius,2.0,Automatic (variable gear ratios),Toyota,4.0,Midsize Cars,2024,Part-time 4-Wheel Drive,Regular,222V Li-Ion,Regular Gasoline,True


In [27]:
predict_vals = pd.DataFrame([[
                "Prius",
                2.0,
                "Automatic (variable gear ratios)",
                "Toyota",
                "4.0",
                "Midsize Cars",
                2024,
                "Front-Wheel Drive",
                "Regular",
                "222V Li-Ion",
                "Regular Gasoline",
                True
                ]], columns = selected_features)

In [20]:
selected_features

Index(['basemodel', 'engine_displacement_liters', 'transmission', 'make',
       'cylinders', 'vehicle_size_class', 'year', 'drive', 'fuel_type',
       'electric_motor', 'fuel_type1', 'start_stop'],
      dtype='object')

In [28]:
model.predict(predict_vals)

array([107.61522024])