In [79]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('once')

In [80]:
#Abrimos el df registrados con encoding 
df= pd.read_csv("../data/03_casuales_cod.csv", index_col=0)
df.head(2)

Unnamed: 0,año,vacaciones,sensacion_termica,humedad,viento,casuales,mes_map,estacion_map,dia_semana_map,clima_map,laborable_map
0,0,1,18.18125,80.5833,10.749882,331,0,0,1,1,0
1,0,0,17.68695,69.6087,16.652113,131,0,0,1,1,1


In [81]:
# separamos nuestro dataframe en X e y

X = df.drop("casuales", axis = 1)
y = df["casuales"]

In [82]:
# dividimos nuestros datos en train y test para poder evaluar la validez de nuestro modelo

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [83]:
# creamos el objeto del modelo
arbol = DecisionTreeRegressor(random_state =0)

# ajustamos el modelo
arbol.fit(x_train, y_train)

In [84]:
max_features = np.sqrt(len(x_train.columns))
max_features

3.1622776601683795

In [85]:
# max depth

print(arbol.tree_.max_depth)

22


In [86]:
# hacemos las predicciones sobre los dos set de datos el X_test y el X_train
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [87]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [88]:
# sacamos las métricas para ver si hay overfitting o unerfitting, para modificar la profundidad en función de estos resultados

dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,269.705479,175963.486301,419.480019,0.633733,test,Decission Tree I
1,0.0,0.0,0.0,1.0,train,Decission Tree I


In [89]:
# lo primero que tenemos que hacer es definir un diccionario con los hiperparámetros que queremos modificar y los valores que queremos 

param = {"max_depth": [8,10,11,13], 
        "max_features": [1,2,3,4],
        "min_samples_split": [20,30,50],
        "min_samples_leaf": [5,20,35]} 

In [90]:
# una vez creado el diccionario iniciaremos el modelo con GridSearch

gs = GridSearchCV(
            estimator=DecisionTreeRegressor(), # tipo de modelo que queremos hacer
            param_grid= param, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error") # la métrica que queremos que nos devuelva

In [91]:
# ajustamos el modelo que acabamos de definir en el GridSearch

gs.fit(x_train, y_train)

In [92]:
# este método nos esta diciendo que el mejor modelo es aquel que tiene una profundidad de 10, que usa 3 variables predictoras para construir el modelo y que tiene  un min_samples_leaf y un min_samples_split de 10. 
mejor_modelo = gs.best_estimator_
mejor_modelo

In [93]:
y_pred_test_dt2 = mejor_modelo.predict(x_test)
y_pred_train_dt2 = mejor_modelo.predict(x_train)

In [94]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision tree II")
dt_results2


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,262.061331,137897.710452,371.34581,0.712967,test,Decision tree II
1,293.128339,177811.460636,421.676962,0.619822,train,Decision tree II


In [95]:
# vamos  a juntar los dataframes de los resultados de los modelos para poder compararlos mejor

df_decTree_cod_results = pd.concat([dt_results1, dt_results2,], axis = 0)
df_decTree_cod_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,269.705479,175963.486301,419.480019,0.633733,test,Decission Tree I
1,0.0,0.0,0.0,1.0,train,Decission Tree I
0,262.061331,137897.710452,371.34581,0.712967,test,Decision tree II
1,293.128339,177811.460636,421.676962,0.619822,train,Decision tree II


In [103]:
# volvemos adefinir un diccionario con los hiperparámetros que queremos modificar y los valores que queremos 

param = {"max_depth": [9,10,11,12], 
        "max_features": [1,2,3],
        "min_samples_split": [20,30,55],
        "min_samples_leaf": [5, 20, 40]} 

In [104]:
gs_1 = GridSearchCV(
            estimator=DecisionTreeRegressor(), 
            param_grid= param,
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 


In [105]:
gs_1.fit(x_train, y_train)

In [106]:
mejor_modelo_1 = gs_1.best_estimator_
mejor_modelo_1

In [107]:
y_pred_test_dt3 = mejor_modelo_1.predict(x_test)
y_pred_train_dt3 = mejor_modelo_1.predict(x_train)

In [108]:
dt_results3 = metricas(y_test, y_train, y_pred_test_dt3, y_pred_train_dt3, "Decision tree III")
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,310.074155,198346.378518,445.360953,0.587143,test,Decision tree III
1,276.616506,165458.610834,406.766039,0.646234,train,Decision tree III


In [109]:
df_decision_results = pd.concat([dt_results1, dt_results2,dt_results3], axis = 0)
df_decision_results


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,269.705479,175963.486301,419.480019,0.633733,test,Decission Tree I
1,0.0,0.0,0.0,1.0,train,Decission Tree I
0,262.061331,137897.710452,371.34581,0.712967,test,Decision tree II
1,293.128339,177811.460636,421.676962,0.619822,train,Decision tree II
0,310.074155,198346.378518,445.360953,0.587143,test,Decision tree III
1,276.616506,165458.610834,406.766039,0.646234,train,Decision tree III


In [111]:
df_esta = pd.read_csv("../data/04_casuales_cod_sta.csv", index_col = 0)

In [113]:

X1 = df_esta.drop("casuales", axis = 1)
y1 = df_esta["casuales"]

In [114]:
x1_train, x1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [115]:
# creamos el objeto del modelo
arbol = DecisionTreeRegressor(random_state =0)

# ajustamos el modelo
arbol.fit(x1_train, y1_train)

In [116]:
max_features = np.sqrt(len(x1_train.columns))
max_features

3.1622776601683795

In [117]:
print(arbol.tree_.max_depth)

22


In [123]:
# hacemos las predicciones sobre los dos set de datos el X_test y el X_train
y1_pred_test_dt = arbol.predict(x1_test)
y1_pred_train_dt = arbol.predict(x1_train)

In [134]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [135]:
dt_results1 = metricas(y1_test, y1_train, y1_pred_test_dt, y1_pred_train_dt, "Decission Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,273.59589,179402.212329,423.558983,0.626575,test,Decission Tree I
1,0.0,0.0,0.0,1.0,train,Decission Tree I


In [136]:
# lo primero que tenemos que hacer es definir un diccionario con los hiperparámetros que queremos modificar y los valores que queremos 

param = {"max_depth": [8,10,11,13], 
        "max_features": [1,2,3,4],
        "min_samples_split": [20,30,50],
        "min_samples_leaf": [5,20,35]} 

In [137]:
gs = GridSearchCV(
            estimator=DecisionTreeRegressor(), # tipo de modelo que queremos hacer
            param_grid= param, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error") # la métrica que queremos que nos devuelva

In [139]:
# ajustamos el modelo que acabamos de definir en el GridSearch

gs.fit(x1_train, y1_train)

In [140]:
# este método nos esta diciendo que el mejor modelo es aquel que tiene una profundidad de 10, que usa 3 variables predictoras para construir el modelo y que tiene  un min_samples_leaf y un min_samples_split de 10. 
mejor_modelo = gs.best_estimator_
mejor_modelo

In [141]:
y1_pred_test_dt2 = mejor_modelo.predict(x1_test)
y1_pred_train_dt2 = mejor_modelo.predict(x1_train)

In [142]:
dt_results2 = metricas(y1_test, y1_train, y1_pred_test_dt2, y1_pred_train_dt2, "Decision tree II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,310.753752,194737.058136,441.29022,0.594656,test,Decision tree II
1,266.558861,147392.405507,383.917186,0.684861,train,Decision tree II


In [143]:
# vamos  a juntar los dataframes de los resultados de los modelos para poder compararlos mejor

df_decTree_cod_results = pd.concat([dt_results1, dt_results2,], axis = 0)
df_decTree_cod_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,273.59589,179402.212329,423.558983,0.626575,test,Decission Tree I
1,0.0,0.0,0.0,1.0,train,Decission Tree I
0,310.753752,194737.058136,441.29022,0.594656,test,Decision tree II
1,266.558861,147392.405507,383.917186,0.684861,train,Decision tree II


In [None]:
#df_decision_results.to_csv('../data/05_dectre_casuales.csv')