In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

import datetime

In [2]:
df = pd.read_pickle("datos/3.reg_enc_est.pkl")
df.head()

Unnamed: 0,seasons,yryr_0,yryr_1,mnthmnth_1,mnthmnth_2,mnthmnth_3,mnthmnth_4,mnthmnth_5,mnthmnth_6,mnthmnth_7,...,weekday_numweekday_num_6,workingday_numworkingday_num_0,workingday_numworkingday_num_1,weathersitweathersit_1,weathersitweathersit_2,weathersitweathersit_3,atemp,hum,windspeed,registered
0,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-0.680818,1.267606,-0.363437,-1.927745
1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,-0.741507,0.480415,0.874656,-1.91748
2,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,-1.750344,-1.376017,0.871424,-1.558846
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,-1.610886,-0.27741,-0.365545,-1.414494
4,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,-1.505615,-1.378284,0.008358,-1.373434


In [3]:
# Separar nuestro dataframe en X e y

X = df.drop("registered", axis = 1)
y = df["registered"]

In [4]:
# y dividir nuestros datos en train y test para poder evaluar la bondad de nuestro modelo

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [5]:
# creamos el objeto del modelo, al igual que hacíamos en la regresión lineal
arbol = DecisionTreeRegressor(random_state =0)

# ajustamos el modelo, igual que en la regresión lienal. 
arbol.fit(x_train, y_train)

DecisionTreeRegressor(random_state=0)

In [6]:
# max depth
print(arbol.tree_.max_depth)

23


In [7]:
# max features. Como vemos, debemos poner en nuestro modelo una profudidad máxima de 4. 

max_features = np.sqrt(len(x_train.columns))
max_features

5.656854249492381

In [8]:
# hacemos las predicciones sobre los dos set de datos el X_test y el X_train
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [9]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [10]:
# sacamos las métricas para ver si hay overfitting o unerfitting, para modificar la profundidad en función de estos resultados

dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Reg_enc_est_Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.327075,0.213017,0.461538,0.760245,test,Reg_enc_est_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Reg_enc_est_Decission Tree I


Overfitting

In [11]:
# lo primero que tenemos que hacer es definir un diccionario con los hiperparámetros que queremos modificar y los valores que queremos 

param = {"max_depth": [5,6,8], # teniendo en cuenta que teníamos overfitting tendremos que reducir la profundidad del modelo, la nuestra anterior era de 17. Bajaremos mucho este valor ya que teníamos un overfitting muy claro
        "max_features": [5,6,7],# Ponemos como límite el 4 ya que es el resultado de la raiz cuadrada. 
        # estos dos hiperparámetros son más difíciles de definir, pero usualmente se suelen elegir los siguientes valores
        "min_samples_split": [5, 10, 20],
        "min_samples_leaf": [5,10,20]} 

In [12]:
# una vez creado el diccionario iniciaremos el modelo con GridSearch

gs = GridSearchCV(
            estimator=DecisionTreeRegressor(), # tipo de modelo que queremos hacer
            param_grid= param, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error") # la métrica que queremos que nos devuelva

In [13]:
# ajustamos el modelo que acabamos de definir en el GridSearch

gs.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [5, 6, 8], 'max_features': [5, 6, 7],
                         'min_samples_leaf': [5, 10, 20],
                         'min_samples_split': [5, 10, 20]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [14]:
# este método nos esta diciendo que el mejor modelo es aquel que tiene una profundidad de 6, que usa 4 variables predictoras para construir el modelo y que tiene  un min_samples_leaf y un min_samples_split de 10. 
mejor_modelo = gs.best_estimator_
mejor_modelo

DecisionTreeRegressor(max_depth=8, max_features=7, min_samples_leaf=5,
                      min_samples_split=10)

In [15]:
y_pred_test_dt2 = mejor_modelo.predict(x_test)
y_pred_train_dt2 = mejor_modelo.predict(x_train)

In [24]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Reg_enc_est_3_Decision tree II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.380791,0.266523,0.516258,0.700023,test,Reg_enc_est_3_Decision tree II
1,0.313225,0.180147,0.424437,0.824447,train,Reg_enc_est_3_Decision tree II


In [17]:
# vamos  a juntar los dataframes de los resultados de los modelos para poder compararlos mejor

df_decision_results = pd.concat([dt_results1, dt_results2], axis = 0).reset_index(drop=True)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.327075,0.213017,0.461538,0.760245,test,Reg_enc_est_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Reg_enc_est_Decission Tree I
2,0.380791,0.266523,0.516258,0.700023,test,Reg_enc_est_3_Decision tree II
3,0.313225,0.180147,0.424437,0.824447,train,Reg_enc_est_3_Decision tree II


In [25]:
df_res_DT_1 = pd.read_csv("datos/resultados/1.reg_enc_DT_2.csv", index_col=0)
df_res_DT_1 

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
2,1014.973032,1419733.0,1191.52542,0.342281,test,Reg_enc_2_Decision tree II
3,935.141311,1239350.0,1113.261188,0.502884,train,Reg_enc_2_Decision tree II
4,548.013699,684980.8,827.635698,0.682669,test,Reg_enc_Decission Tree I
5,0.0,0.0,0.0,1.0,train,Reg_enc_Decission Tree I
6,549.175471,581592.9,762.622368,0.730566,test,Reg_enc_Decision tree II
7,480.115576,403152.8,634.943177,0.838291,train,Reg_enc_Decision tree II


In [26]:
df_decision_1_2 = pd.concat([df_decision_results, df_res_DT_1 ], axis = 0).reset_index(drop=True)
df_decision_1_2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.327075,0.2130171,0.461538,0.760245,test,Reg_enc_est_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Reg_enc_est_Decission Tree I
2,0.380791,0.2665228,0.516258,0.700023,test,Reg_enc_est_3_Decision tree II
3,0.313225,0.1801467,0.424437,0.824447,train,Reg_enc_est_3_Decision tree II
4,1014.973032,1419733.0,1191.52542,0.342281,test,Reg_enc_2_Decision tree II
5,935.141311,1239350.0,1113.261188,0.502884,train,Reg_enc_2_Decision tree II
6,548.013699,684980.8,827.635698,0.682669,test,Reg_enc_Decission Tree I
7,0.0,0.0,0.0,1.0,train,Reg_enc_Decission Tree I
8,549.175471,581592.9,762.622368,0.730566,test,Reg_enc_Decision tree II
9,480.115576,403152.8,634.943177,0.838291,train,Reg_enc_Decision tree II


In [27]:
df_decision_1_2.drop([0,1,6,7], axis=0, inplace=True)
df_decision_1_2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
2,0.380791,0.2665228,0.516258,0.700023,test,Reg_enc_est_3_Decision tree II
3,0.313225,0.1801467,0.424437,0.824447,train,Reg_enc_est_3_Decision tree II
4,1014.973032,1419733.0,1191.52542,0.342281,test,Reg_enc_2_Decision tree II
5,935.141311,1239350.0,1113.261188,0.502884,train,Reg_enc_2_Decision tree II
8,549.175471,581592.9,762.622368,0.730566,test,Reg_enc_Decision tree II
9,480.115576,403152.8,634.943177,0.838291,train,Reg_enc_Decision tree II


In [28]:
# ademas vamos a guardar este dataframe en un csv para 

df_decision_1_2.to_csv("datos/resultados/1.reg_enc_est_DT_3.csv")

In [22]:
# vamos a crearnos un dataframe con las variables predictoras según su importancia
importancia_predictores = pd.DataFrame(
                            {'predictor': x_train.columns,
                             'importancia': arbol.feature_importances_}
                            )


# ordenamos de mayor a menor los resultados
importancia_predictores.sort_values(by=["importancia"], ascending=False, inplace = True)

# printeamos los resultados
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia
2,yryr_1,0.337222
29,atemp,0.281736
0,seasons,0.129511
31,windspeed,0.063593
30,hum,0.054442
17,weekday_numweekday_num_0,0.028487
23,weekday_numweekday_num_6,0.018756
28,weathersitweathersit_3,0.0112
11,mnthmnth_9,0.008687
12,mnthmnth_10,0.008398
