In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm

import warnings
warnings.filterwarnings('once')

In [2]:
df=pd.read_csv('../data/03_casuales_cod.csv', index_col=0)
df.head(2)

Unnamed: 0,año,vacaciones,sensacion_termica,humedad,viento,casuales,mes_map,estacion_map,dia_semana_map,clima_map,laborable_map
0,0,1,18.18125,80.5833,10.749882,331,0,0,1,1,0
1,0,0,17.68695,69.6087,16.652113,131,0,0,1,1,1


In [3]:
#  separamos nuestro dataframe en X e y

X = df.drop("casuales", axis = 1)
y = df["casuales"]

In [4]:
# dividimos nuestros datos en train y test para poder evaluar la validez de nuestro modelo

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:

param = {"max_depth": [9,10,11,12], 
        "max_features": [1,2.5,3],
        "min_samples_split": [19,20,25],
        "min_samples_leaf": [4,5,6,7,15]} 

In [10]:
gs_rf = GridSearchCV(
            estimator=RandomForestRegressor(), # tipo de modelo que queremos hacer
            param_grid= param, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error")

In [11]:
# ajustamos el modelo que acabamos de definir en el GridSearch

gs_rf.fit(x_train, y_train)

Traceback (most recent call last):
  File "/Users/ju/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ju/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/ju/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/ju/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/ju/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/ju/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "/Users/ju/op

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [9, 10, 11, 12],
                         'max_features': [1, 2.5, 3],
                         'min_samples_leaf': [4, 5, 6, 7, 15],
                         'min_samples_split': [19, 20, 25]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [12]:
# al igual que el decision tree podemos sacar cual es nuestro mejor bosque
# En este caso,  nuestro mejor bosque es aquel que esta formado por arboles de 6 de profundidad, usa 4 variables y que tiene  un min_samples_leaf y un min_samples_split de 10. 

bosque = gs_rf.best_estimator_
bosque

RandomForestRegressor(max_depth=10, max_features=3, min_samples_leaf=4,
                      min_samples_split=20)

In [13]:
# Calculamos sus métricas
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [14]:
# Con esta función calculamos las métricas y las convertimos en dataframe
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [15]:
# sacamos las métricas para ver si hay overfitting o unerfitting, para modificar la profundidad en función de estos resultados
df_results = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest")
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,245.205378,118248.747873,343.873157,0.753866,test,Random Forest
1,231.431306,115122.631556,339.297261,0.753857,train,Random Forest


In [None]:
#df_todos_resultados.to_csv('06_randForest_registrados.csv')

In [26]:
# import pickle
# with open("rf_75-75_343-339_casauales.pkl", "wb") as fp:
#     pickle.dump(gs_rf, fp)