# ML to predict EMISIONES_EURO
The objective of this notebook is no create a ML model capable pf predicting EMISIONES_EURO in order to fill the _null_ values in the dataset

In [2]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint
import polars as pl
import numpy as np
import pickle
import random
import os

from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

### Test of a rf with a numerical variable

In [24]:
from utils.dictionaries import types_parque_post

path = os.path.join("..","Data", "DGT")
clean_park = os.path.join(path,'Parque_exacto','clean_park.csv')

parke = pl.scan_csv(clean_park,separator='|', schema=types_parque_post).select(['EMISIONES_CO2','EMISIONES_EURO']).head(1000000)
parke = parke.filter(pl.col('EMISIONES_CO2').is_not_null(),
             pl.col('EMISIONES_EURO').is_not_null()).head(100000)
parke = parke.collect()

In [25]:
X = parke.select('EMISIONES_CO2').to_numpy()
y = parke.select('EMISIONES_EURO').to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [26]:
rf = RandomForestClassifier(n_estimators=100,max_depth=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.31842424242424244


### Test of a rf with a categorical variable

In [6]:
from utils.dictionaries import types_parque_post

path = os.path.join("..","Data", "DGT")
clean_park = os.path.join(path,'Parque_exacto','clean_park.csv')

parke = pl.scan_csv(clean_park,separator='|', schema=types_parque_post).select(['MARCA','EMISIONES_EURO']).head(1000000)
parke = parke.filter(pl.col('MARCA').is_not_null(),
             pl.col('EMISIONES_EURO').is_not_null()).head(100000)
parke = parke.collect()

In [7]:
X = parke.select('MARCA').to_dummies(drop_first=True).to_numpy()
y = parke.select('EMISIONES_EURO').to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [11]:
rf = RandomForestClassifier(n_estimators=100,max_depth=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.24881818181818183


### Test of a rf with a numerical and a categorical variable

In [2]:
from utils.dictionaries import types_parque_post

path = os.path.join("..","Data", "DGT")
clean_park = os.path.join(path,'Parque_exacto','clean_park.csv')

parke = pl.scan_csv(clean_park,separator='|', schema=types_parque_post).select(['MARCA','EMISIONES_CO2','EMISIONES_EURO']).head(1000000)
parke = parke.filter(pl.col('MARCA').is_not_null(),
             pl.col('EMISIONES_EURO').is_not_null(),
             pl.col('EMISIONES_CO2').is_not_null()).head(100000)
parke = parke.collect()

In [3]:
X = parke.select(['MARCA','EMISIONES_CO2']).to_dummies('MARCA',drop_first=True).to_numpy()
y = parke.select('EMISIONES_EURO').to_numpy().ravel()
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [49]:
rf = RandomForestClassifier(n_estimators=100,max_depth=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4876969696969697


### Building of a workflow for posterioir scaling of the model

We need:
- A file from which we read the data (_clean_park for us_)
- List of the columns we will use *features and target included* (_columns for us_)
- List of the feature columns (_features_ for us)
- Name of the target variable (_'EMISIONES_EURO'_ for us)
- List of the features columns that are categorical (_categorical_features_ for us)

In [None]:
from utils.dictionaries import types_parque_post

path = os.path.join("..","Data", "DGT")
clean_park = os.path.join(path,'Parque_exacto','clean_park.csv')
rf_file = os.path.join("..","Models","rf_model.pkl")

columns = ['MARCA','EMISIONES_CO2','EMISIONES_EURO']
target = 'EMISIONES_EURO'
features = [col for col in columns if col != target]
categorical_features = []
for key in types_parque_post:
    if key in columns:
        if (types_parque_post[key] == pl.String) & (key!=target):
            categorical_features.append(key)

parke = pl.scan_csv(clean_park,separator='|', schema=types_parque_post).select(columns).head(1000000)
parke = parke.drop_nulls().head(100000)
parke = parke.collect()

X = parke.select(features).to_dummies(categorical_features,drop_first=True).to_numpy()
y = parke.select(target).to_numpy().ravel()
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

rf = RandomForestClassifier(n_estimators=100,
                            max_depth=20,
                            n_jobs=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
with open(rf_file,'wb') as f:
    pickle.dump(rf,f)

with open(rf_file, 'rb') as f:
    rf = pickle.load(f)

### Hyperparameters tunning

Hyperparameters of the model:
- n_estimators: number of trees
  - n_estimators = randint(50,500)
- max_depth: maximum depth of each tree
  - max_depth = randint(5,40)
- max_features: number of features to consider when looking for the best split
  - max_features = ["log2","sqrt",None]
- min_samples_split: The minimum number of samples required to split an internal node
  - min_samples_split = [2, 5, 10]
- min_samples_leaf: The minimum number of samples required to be at a leaf node
  - min_samples_leaf = [1, 2, 4]

In [9]:
random_grid = {'n_estimators': randint(50,500),
               'max_features': ['log2', 'sqrt', None],
               'max_depth': randint(5,40),
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4]}
rf = RandomForestClassifier()
rand_search = RandomizedSearchCV(rf,param_distributions = random_grid,n_iter=5, cv=5, n_jobs=-1)
rand_search.fit(X_train, y_train)
best_rf = rand_search.best_estimator_
with open(rf_file,'wb') as f:
    pickle.dump(best_rf,f)



In [None]:
rf = RandomForestClassifier(n_estimators=100,
                            max_depth=20,
                            n_jobs=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [11]:
best_rf = rand_search.best_estimator_

print('Best hyperparameters:',  rand_search.best_params_)
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Best hyperparameters: {'max_depth': 33, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 451}
Accuracy: 0.5183636363636364


'max_depth': 26, 'max_features': None, 'n_estimators': 117
'max_depth': 33, 'max_features': 'sqrt','n_estimators': 451, 'min_samples_leaf': 1, 'min_samples_split': 2, 

In [14]:
import time
start = time.time()
rf = RandomForestClassifier(n_estimators=117,
                            max_depth=26,
                            max_features=None,
                            n_jobs=-1,)
start = time.time()
rf.fit(X_train, y_train)
fit = time.time() - start
start = time.time()
y_pred = rf.predict(X_test)
pred = time.time() - start
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy premodel:", accuracy, "fit time:", fit, "pred time:", pred)

rf = RandomForestClassifier(n_estimators=451,
                            max_depth=33,
                            max_features='sqrt',
                            n_jobs=-1,)
start = time.time()
rf.fit(X_train, y_train)
fit = time.time() - start
start = time.time()
y_pred = rf.predict(X_test)
pred = time.time() - start
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy premodel:", accuracy, "fit time:", fit, "pred time:", pred)

Accuracy premodel: 0.5176666666666667 fit time: 162.038104057312 pred time: 1.4816601276397705
Accuracy premodel: 0.5189393939393939 fit time: 130.41594076156616 pred time: 5.27468729019165


In [None]:
from utils.dictionaries import types_parque_post

path = os.path.join("..","Data", "DGT")
clean_park = os.path.join(path,'Parque_exacto','clean_park.csv')

columns = ['MARCA','EMISIONES_CO2','EMISIONES_EURO']
target = 'EMISIONES_EURO'
features = [col for col in columns if col != target]
categorical_features = []
for key in types_parque_post:
    if key in columns:
        if (types_parque_post[key] == pl.String) & (key!=target):
            categorical_features.append(key)

parke = pl.scan_csv(clean_park,separator='|', schema=types_parque_post).select(columns).head(1000000)
parke = parke.drop_nulls().head(100000)
parke = parke.collect()

X = parke.select(features).to_dummies(categorical_features,drop_first=True).to_numpy()
y = parke.select(target).to_numpy().ravel()
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

random_grid = {'n_estimators': randint(50,1000),
               'max_features': ['log2', 'sqrt', None],
               'max_depth': randint(5,100),
               'min_samples_split': [2, 5, 10,12],
               'min_samples_leaf': [1, 2, 4,6]}
rf = RandomForestClassifier()
rand_search = RandomizedSearchCV(rf,param_distributions = random_grid,n_iter=5, cv=5, n_jobs=-1)
rand_search.fit(X_train, y_train)
best_rf = rand_search.best_estimator_




NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [5]:
rf_file = os.path.join("..","Models","rf_model2.pkl")
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Best hyperparameters:", best_rf.get_params())
with open(rf_file,'wb') as f:
    pickle.dump(best_rf,f)


Accuracy: 0.5202424242424243
Best hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 57, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 82, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [106]:
best_rf = rand_search.best_estimator_

print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'max_depth': 26, 'max_features': None, 'n_estimators': 117}


In [5]:
from utils.dictionaries import types_parque_post

path = os.path.join("..","Data", "DGT")
clean_park = os.path.join(path,'Parque_exacto','clean_park.csv')

columns = ['MARCA','EMISIONES_CO2','EMISIONES_EURO']
target = 'EMISIONES_EURO'
features = [col for col in columns if col != target]
categorical_features = []
for key in types_parque_post:
    if key in columns:
        if (types_parque_post[key] == pl.String) & (key!=target):
            categorical_features.append(key)

parke = pl.scan_csv(clean_park,separator='|', schema=types_parque_post).select(columns).head(1000000)
parke = parke.drop_nulls().head(100000)
parke = parke.collect()

X = parke.select(features).to_dummies(categorical_features,drop_first=True).to_numpy()
y = parke.select(target).to_numpy().ravel()
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

rf = RandomForestClassifier(n_estimators=117,
                            max_depth=26,
                            n_jobs=2,
                            max_features =  None)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with non max features:", accuracy)

rf = RandomForestClassifier(n_estimators=117,
                            max_depth=26,
                            n_jobs=2,
                            max_features =  'sqrt')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with sqrt max features:", accuracy)

Accuracy with non max features: 0.5184242424242425
Accuracy with sqrt max features: 0.5055757575757576


### Building of the acutual model

Load and divide the dataset

In [2]:
path = os.path.join("..","Data", "DGT")
complete_park = os.path.join(path,'Parque_exacto','clean_park.csv')

In [40]:
columns = ['FABRICANTE','MARCA','MODELO','TIPO','VARIANTE','VERSION','FECHA_MATR','FECHA_PRIM_MATR',
           'CLASE_MATR','SUBTIPO_DGT','TIPO_DGT','CAT_EURO','TARA','PESO_MAX','MOM','MMTA',
           'CILINDRADA','PROPULSION','TIPO_DISTINTIVO','EMISIONES_CO2','EMISIONES_EURO']