In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

import datetime

In [2]:
# Load the dataset from a local pickle file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df = pd.read_pickle("datos/4.casual_encoding_oh_final.pkl")
df.head()

Unnamed: 0,dteday,yr,mnth,dia_anual,holiday_num,weekday_num,workingday_num,weathersit,temp,atemp,hum,windspeed,casual,seasons_autumn,seasons_spring,seasons_summer,seasons_winter
0,2018-01-01,0,1,1,1,0,0,2,14.110847,18.18125,80.5833,10.749882,331,0.0,0.0,0.0,1.0
1,2018-01-02,0,1,2,0,1,1,2,14.902598,17.68695,69.6087,16.652113,131,0.0,0.0,0.0,1.0
2,2018-01-03,0,1,3,0,2,1,1,8.050924,9.47025,43.7273,16.636703,120,0.0,0.0,0.0,1.0
3,2018-01-04,0,1,4,0,3,1,1,8.2,10.6061,59.0435,10.739832,108,0.0,0.0,0.0,1.0
4,2018-01-05,0,1,5,0,4,1,1,9.305237,11.4635,43.6957,12.5223,82,0.0,0.0,0.0,1.0


In [3]:
# Inspect the dtype of every column.
df.dtypes

dteday              object
yr                category
mnth              category
dia_anual         category
holiday_num       category
weekday_num       category
workingday_num    category
weathersit        category
temp               float64
atemp              float64
hum                float64
windspeed          float64
casual               int64
seasons_autumn    category
seasons_spring    category
seasons_summer    category
seasons_winter    category
dtype: object

In [4]:
# List the column names, from which the predictors and the response are chosen below.
df.columns

Index(['dteday', 'yr', 'mnth', 'dia_anual', 'holiday_num', 'weekday_num',
       'workingday_num', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'seasons_autumn', 'seasons_spring', 'seasons_summer',
       'seasons_winter'],
      dtype='object')

In [5]:
# As with linear regression, split the dataframe into predictors (X) and response (y).

columnas_predictoras = [
    'holiday_num', 'weekday_num', 'workingday_num', 'weathersit',
    'temp', 'atemp', 'hum', 'windspeed',
    'seasons_autumn', 'seasons_spring', 'seasons_summer', 'seasons_winter',
]

X = df[columnas_predictoras]
y = df["casual"]

In [6]:
# Split the data into train and test sets (20% held out for test, fixed seed)
# so the model can be evaluated on rows it was not fitted on.

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Create the model object, as we did for linear regression.
# No tree hyperparameters are restricted here; only the random seed is fixed.
arbol = DecisionTreeRegressor(random_state =0)

# Fit the model on the training split, as in linear regression.
arbol.fit(x_train, y_train)

In [8]:
# max_features heuristic: square root of the number of predictor columns.
# The result (~3.46) suggests a max_features value of about 4.
# Note this is the number of features, not the maximum depth of the tree.

max_features = np.sqrt(len(x_train.columns))
max_features

3.4641016151377544

In [9]:
# Depth actually reached by the fitted tree; used as a reference when choosing max_depth values.

print(arbol.tree_.max_depth)

21


In [10]:
# Predict on both data sets (x_test and x_train) so train and test metrics can be compared.
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [11]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics: test row first, then train.

    Parameters
    ----------
    y_test, y_train
        Observed target values of the test and train sets.
    y_test_pred, y_train_pred
        Model predictions for the test and train sets.
    tipo_modelo
        Label written to the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``.
    """
    # (observed, predicted) pairs, in the order the rows appear in the output.
    pares = [(y_test, y_test_pred), (y_train, y_train_pred)]

    tabla = pd.DataFrame({
        'MAE': [mean_absolute_error(real, pred) for real, pred in pares],
        'MSE': [mean_squared_error(real, pred) for real, pred in pares],
        'RMSE': [np.sqrt(mean_squared_error(real, pred)) for real, pred in pares],
        'R2': [r2_score(real, pred) for real, pred in pares],
        "set": ["test", "train"],
    })
    tabla["modelo"] = tipo_modelo
    return tabla

In [12]:
# Compute the metrics to check for overfitting or underfitting, and adjust the tree depth based on these results.

dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Casual_Cod_Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,314.856164,260293.239726,510.189416,0.458201,test,Casual_Cod_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Casual_Cod_Decission Tree I


In [13]:
# Number of rows and columns in the full dataset.
df.shape

(730, 17)

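Overfitting: el árbol sin restricciones ajusta perfectamente el set de entrenamiento (R2 = 1.0, MAE = 0), pero en test solo alcanza un R2 ≈ 0.46.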

In [14]:
# First, define a dictionary with the hyperparameters we want to tune and the candidate values to try for each one.

param = {"max_depth": [4,6,8],
        "max_features": [4,5,6,7],
        "min_samples_split": [5, 20, 75], 
        "min_samples_leaf": [5,20,75]} 

In [15]:
# Once the dictionary is defined, set up the hyperparameter search with GridSearchCV.

gs = GridSearchCV(
            # Model type to tune. The seed is fixed because the grid uses max_features
            # values below the number of predictor columns, so without it the selected
            # model can change from one run to the next.
            estimator=DecisionTreeRegressor(random_state=0),
            param_grid=param, # hyperparameters (and candidate values) to test
            cv=10, # 10-fold cross-validation
            verbose=0, # no progress messages; must be >= 0 (negative values are rejected by recent scikit-learn versions)
            return_train_score=True, # also report the metric on the training folds
            scoring="neg_mean_squared_error") # metric used to rank the candidates

In [16]:
# Fit the grid search defined above on the training split.

gs.fit(x_train, y_train)

In [17]:
# best_estimator_ holds the model selected by the grid search; displaying it shows the chosen hyperparameters.
# NOTE(review): an earlier note here cited max_depth=6, max_features=4 and min_samples_leaf/min_samples_split of 10,
# but 10 is not among the values in the search grid (5, 20, 75) — confirm against gs.best_params_.
mejor_modelo = gs.best_estimator_
mejor_modelo

In [18]:
# veamos ahora que pinta tiene nuestro árbol


# fig = plt.figure(figsize=(40, 20))
# tree.plot_tree(mejor_modelo, feature_names=x_train.columns, filled=True);

In [19]:
# Predict on the test and train sets with the model selected by the grid search.
y_pred_test_dt2 = mejor_modelo.predict(x_test)
y_pred_train_dt2 = mejor_modelo.predict(x_train)

In [20]:
# Compute the same metrics for the tuned tree so it can be compared with the first one.
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Casual_Cod_Decision tree II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,329.171448,224042.723748,473.331516,0.533656,test,Casual_Cod_Decision tree II
1,309.201099,193764.367713,440.186742,0.585713,train,Casual_Cod_Decision tree II


In [21]:
# Preview the first two rows of the dataset again.
df.head(2)

Unnamed: 0,dteday,yr,mnth,dia_anual,holiday_num,weekday_num,workingday_num,weathersit,temp,atemp,hum,windspeed,casual,seasons_autumn,seasons_spring,seasons_summer,seasons_winter
0,2018-01-01,0,1,1,1,0,0,2,14.110847,18.18125,80.5833,10.749882,331,0.0,0.0,0.0,1.0
1,2018-01-02,0,1,2,0,1,1,2,14.902598,17.68695,69.6087,16.652113,131,0.0,0.0,0.0,1.0


In [22]:
# Stack the metric tables of both models into a single dataframe, with a fresh
# 0..n-1 index, so they are easier to compare.

tablas_metricas = [dt_results1, dt_results2]
df_decision_results = pd.concat(tablas_metricas, ignore_index=True)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,314.856164,260293.239726,510.189416,0.458201,test,Casual_Cod_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Casual_Cod_Decission Tree I
2,329.171448,224042.723748,473.331516,0.533656,test,Casual_Cod_Decision tree II
3,309.201099,193764.367713,440.186742,0.585713,train,Casual_Cod_Decision tree II


In [23]:
#df_decision_results.style.background_gradient()

In [24]:
# Pair every predictor with its importance in the tuned tree and rank them
# from most to least important (original row labels are kept).
importancia_predictores = (
    pd.DataFrame({'predictor': x_train.columns,
                  'importancia': mejor_modelo.feature_importances_})
    .sort_values(by=["importancia"], ascending=False)
)
importancia_predictores

Unnamed: 0,predictor,importancia
5,atemp,0.488073
1,weekday_num,0.384371
4,temp,0.048562
6,hum,0.043446
0,holiday_num,0.022545
11,seasons_winter,0.013003
2,workingday_num,0.0
3,weathersit,0.0
7,windspeed,0.0
8,seasons_autumn,0.0


In [25]:
# Also save the combined metrics dataframe to a CSV file.
df_decision_results.to_csv("datos/6.casual_resultados_1_cod.csv")