In [3]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Barra de progreso de un proceso
# ------------------------------------------------------------------------------
from tqdm import tqdm

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

In [4]:
# Para ejecutar en google colab
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
# Para conectar con Drive
%cd /content/drive/My Drive/Colab Notebooks/

/content/drive/My Drive/Colab Notebooks


In [6]:
df = pd.read_csv("project-da-promo-C-module-3-team-5/datos/4.reg_encoding_oh_final.csv", index_col=0)
df.head()

Unnamed: 0,yr,mnth,dia_anual,holiday_num,weekday_num,workingday_num,weathersit,atemp,registered,seasons_autumn,seasons_spring,seasons_summer,seasons_winter
0,0,1,1,1,0,0,2,18.18125,654,0.0,0.0,0.0,1.0
1,0,1,2,0,1,1,2,17.68695,670,0.0,0.0,0.0,1.0
2,0,1,3,0,2,1,1,9.47025,1229,0.0,0.0,0.0,1.0
3,0,1,4,0,3,1,1,10.6061,1454,0.0,0.0,0.0,1.0
4,0,1,5,0,4,1,1,11.4635,1518,0.0,0.0,0.0,1.0


In [7]:

X = df.drop("registered", axis = 1)
y = df["registered"]

In [8]:
# Generamos los conjuntos de train y test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 43)

In [36]:
# Definimos un diccionario con los hiperparámetros que deberemos ocupar en el random forest

param = {"max_depth": [4, 6, 8, 9], 
        "max_features": [5, 6, 7, 8],
        "min_samples_split": [5, 10, 25],
        "min_samples_leaf": [5, 10, 25]} 

In [37]:
# Aplicamos el GridSearch con estimador de RandomForestRegressor
gs_rf = GridSearchCV(
            estimator=RandomForestRegressor(), # tipo de modelo que queremos hacer
            param_grid= param, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error") # la métrica que queremos que nos devuelva

In [38]:
# Entrenamos el modelo con .fit()
gs_rf.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 6, 8, 9],
                         'max_features': [5, 6, 7, 8],
                         'min_samples_leaf': [5, 10, 25],
                         'min_samples_split': [5, 10, 25]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [39]:
# método para que elija el mejor estimador o combinación de valores
bosque = gs_rf.best_estimator_
bosque

RandomForestRegressor(max_depth=9, max_features=7, min_samples_leaf=5,
                      min_samples_split=5)

In [40]:
# Aplicamos el método .predict() para calcular las métricas del mejor modelo.
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [41]:
# Medimos las métricas del random forest
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [42]:
# sacamos las métricas para ver si hay overfitting o unerfitting, para modificar la profundidad en función de estos resultados

dt_results = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Reg_Cod_Random Forest")
dt_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,437.106265,333635.651427,577.612025,0.849891,test,Reg_Cod_Random Forest
1,363.786806,284968.402993,533.824318,0.885141,train,Reg_Cod_Random Forest


In [43]:
# Unimos estos resultados con los obtenidos en los modelos anteriores

df_RF_DT_results = pd.read_csv("project-da-promo-C-module-3-team-5/datos/6.reg_resultados_1_cod.csv", index_col = 0)
df_RF_DT_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,214.321918,101141.787671,318.027967,0.953144,test,Reg_Cod_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Reg_Cod_Decission Tree I
2,403.047749,264899.404086,514.68379,0.87728,test,Reg_Cod_Decision tree II
3,387.159792,244561.02274,494.531114,0.901904,train,Reg_Cod_Decision tree II


In [44]:
#Unimos los resultados del Random Forest y el Decision Tree:

df_ran_dec = pd.concat([df_RF_DT_results, dt_results], axis=0)
df_ran_dec

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,214.321918,101141.787671,318.027967,0.953144,test,Reg_Cod_Decission Tree I
1,0.0,0.0,0.0,1.0,train,Reg_Cod_Decission Tree I
2,403.047749,264899.404086,514.68379,0.87728,test,Reg_Cod_Decision tree II
3,387.159792,244561.02274,494.531114,0.901904,train,Reg_Cod_Decision tree II
0,437.106265,333635.651427,577.612025,0.849891,test,Reg_Cod_Random Forest
1,363.786806,284968.402993,533.824318,0.885141,train,Reg_Cod_Random Forest


Cotejamos todas las opciones. Tras deliberar y valorar las métricas, optamos por utilizar el modelo de Random Forest por sus puntuaciones tan similares en el R2 y el RMSE, evitando así un modelo con over o underfitting.

De esta manera, con el modelo de Random Forest nuestras variables predictoras o independientes explican un 82% de la varianza en la media de las puntuaciones de la variable resultado o dependiente. Además, cuenta con el RMSE más bajo de todos los modelos, por lo que es el modelo que más se parece a los datos originales.


In [47]:
# vamos a crearnos un dataframe
importancia_predictores3 = pd.DataFrame({'predictor': x_train.columns,
    'importancia': bosque.feature_importances_})
# ordenamos de mayor a menor los resultados
importancia_predictores3.sort_values(by=["importancia"], ascending=False, inplace = True)
importancia_predictores3

Unnamed: 0,predictor,importancia
0,yr,0.390389
7,atemp,0.266993
2,dia_anual,0.134924
11,seasons_winter,0.075883
4,weekday_num,0.067223
6,weathersit,0.024048
1,mnth,0.021518
8,seasons_autumn,0.0131
5,workingday_num,0.004098
9,seasons_spring,0.00121
