# DECISION TREE Y RANDOM FOREST
---

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('once')

In [11]:
df_est = pd.read_csv("datos/07-bikes_encoding_est_sinout.csv", index_col = 0)
df_est.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered
0,0,0,0,0,0,0,2,14.110847,-0.691698,1.254842,-0.360109,654
1,0,0,1,1,2,1,2,14.902598,-0.752316,0.465708,0.867064,670
2,0,0,1,1,2,1,3,8.050924,-1.759976,-1.395307,0.86386,1229
3,2,0,1,1,0,0,3,8.2,-1.620681,-0.293988,-0.362199,1454
4,2,0,2,1,0,1,3,9.305237,-1.515533,-1.397579,0.008406,1518


In [12]:
df_est.season.unique()

array([0, 2, 3], dtype=int64)

In [13]:
df_est.isnull().sum()

season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
registered    0
dtype: int64

In [14]:
# Creamos una clase para poder hacer el decision tree y el random forest más rápido
class Regres_lineal:
    def __init__(self, dataf, col_pred):
        
        self.dataf = dataf
        self.col_pred = col_pred

    def tree_param(self):
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        arbol = DecisionTreeRegressor(random_state =0)
    
        arbol.fit(x_train, y_train)

        max_feat = np.sqrt(len(x_train.columns))
        max_dep = arbol.tree_.max_depth
        return(max_feat, max_dep)

    def metricas(self, y_te, y_tr, y_te_pred, y_tr_pred, tipo_modelo):
        resultados = {'MAE': [mean_absolute_error(y_te, y_te_pred), mean_absolute_error(y_tr, y_tr_pred)],
                    'MSE': [mean_squared_error(y_te, y_te_pred), mean_squared_error(y_tr, y_tr_pred)],
                    'RMSE': [np.sqrt(mean_squared_error(y_te, y_te_pred)), np.sqrt(mean_squared_error(y_tr, y_tr_pred))],
                    'R2':  [r2_score(y_te, y_te_pred), r2_score(y_tr, y_tr_pred)],
                    "set": ["test", "train"]}
        df = pd.DataFrame(resultados)
        df["modelo"] = tipo_modelo
        return df

    def grid_s_decision(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= GradientBoostingRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

    def grid_s_forest(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= RandomForestRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

In [15]:

rg_est = Regres_lineal(df_est, "registered")

In [16]:
features_est, depth_est = rg_est.tree_param()

In [17]:
features_est #El número máx será 3

3.3166247903554

In [18]:
depth_est # La profundidad máxima será 22

18

In [19]:
param1 = {"max_depth": [16, 18, 20], 
        "max_features": [3,4],
        "min_samples_split": [50, 100],
        "min_samples_leaf": [50,100]} 

In [20]:
best_model1, result1 = rg_est.grid_s_decision(param1, "GradientBoosting Estand I")

In [21]:
best_model1

In [22]:
result1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,561.474018,489593.248636,699.709403,0.794348,test,GradientBoosting Estand I
1,476.683306,379651.808366,616.158915,0.843748,train,GradientBoosting Estand I


In [23]:
param2 = {"max_depth": [4,6,8,10, 12, 14, 16, 18, 20], 
        "max_features": [2, 3,4],
        "min_samples_split": [10, 20, 40, 50, 60],
        "min_samples_leaf": [10, 20, 40, 50, 60]} 

In [24]:
best_model2, result2 = rg_est.grid_s_decision(param2, "GradientBoosting Estand II")

In [25]:
best_model2

In [26]:
result2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,519.216193,464426.453406,681.48841,0.804919,test,GradientBoosting Estand II
1,397.279588,266061.248751,515.811253,0.890498,train,GradientBoosting Estand II


In [27]:
param3 = {"max_depth": [6, 8, 10, 12, 14, 16, 18], 
        "max_features": [3,4],
        "min_samples_split": [10, 20, 40],
        "min_samples_leaf": [10, 20, 40]} 

In [28]:
best_model3, result3 = rg_est.grid_s_forest(param3, "RandForest Estand I")

In [29]:
best_model3

In [30]:
result3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,602.385004,561496.544446,749.330731,0.764145,test,RandForest Estand I
1,555.033284,494550.745833,703.24302,0.796459,train,RandForest Estand I


In [77]:
param4 = {"max_depth": [4], 
        "max_features": [3],
        "min_samples_split": [10],
        "min_samples_leaf": [50]} 

In [78]:
best_model4, result4 = rg_est.grid_s_forest(param4, "RandForest Estand II")

In [79]:
best_model4

In [80]:
result4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,814.008949,955029.228928,977.255969,0.598843,test,RandForest Estand II
1,795.98811,962954.857566,981.302633,0.603679,train,RandForest Estand II


In [35]:
df_sinest = pd.read_csv("datos/06-bikes_encoding_sinest_sinout.csv", index_col = 0)
df_sinest.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered
0,0,0,0,0,0,0,2,14.110847,18.18125,80.5833,10.749882,654
1,0,0,1,1,2,1,2,14.902598,17.68695,69.6087,16.652113,670
2,0,0,1,1,2,1,3,8.050924,9.47025,43.7273,16.636703,1229
3,2,0,1,1,0,0,3,8.2,10.6061,59.0435,10.739832,1454
4,2,0,2,1,0,1,3,9.305237,11.4635,43.6957,12.5223,1518


In [36]:
rg_sinest = Regres_lineal(df_sinest, "registered")

In [37]:
features_sinest, depth_sinest = rg_sinest.tree_param()

In [38]:
features_sinest #El número máx será 3

3.3166247903554

In [39]:
depth_sinest # La profundidad máxima será 22

18

In [40]:
best_model5, result5 = rg_sinest.grid_s_decision(param1, "GradientBoosting SinEstand I")

In [41]:
result5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,558.976679,486798.318758,697.709337,0.795522,test,GradientBoosting SinEstand I
1,471.3323,371310.695551,609.352686,0.84718,train,GradientBoosting SinEstand I


In [42]:
best_model6, result6 = rg_sinest.grid_s_decision(param2, "GradientBoosting SinEstand II")

In [43]:
result6

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,564.853554,516591.258699,718.742832,0.783007,test,GradientBoosting SinEstand II
1,474.122232,373900.206137,611.473798,0.846115,train,GradientBoosting SinEstand II


In [44]:
best_model7, result7 = rg_sinest.grid_s_forest(param3, "RandForest SinEstand I")

In [45]:
result7

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,589.578504,552915.851731,743.583117,0.767749,test,RandForest SinEstand I
1,555.745954,499932.590823,707.059114,0.794244,train,RandForest SinEstand I


In [46]:
best_model8, result8 = rg_sinest.grid_s_forest(param4, "RandForest SinEstand II")

In [47]:
result8

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,573.234133,540373.892945,735.101281,0.773017,test,RandForest SinEstand II
1,539.134922,478625.892237,691.827935,0.803013,train,RandForest SinEstand II


In [48]:
param5 = {"max_depth": [22], 
        "max_features": [4],
        "min_samples_split": [8],
        "min_samples_leaf": [8]}

In [49]:
best_model9, result9 = rg_sinest.grid_s_forest(param5, "RandForest SinEstand III")

In [50]:
result9

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,579.155368,543103.892224,736.955828,0.771871,test,RandForest SinEstand III
1,527.373433,455565.603196,674.956001,0.812504,train,RandForest SinEstand III


In [51]:
best_model10, result10 = rg_est.grid_s_forest(param5, "RandForest Estand III")

In [52]:
result10

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,574.03303,538265.537789,733.665822,0.773903,test,RandForest Estand III
1,528.274165,457969.167272,676.734192,0.811515,train,RandForest Estand III


In [53]:
param6 = {"max_depth": [18,20], 
        "max_features": [3,4],
        "min_samples_split": [8, 15],
        "min_samples_leaf": [8, 15]}

In [54]:
best_model11, result11 = rg_est.grid_s_forest(param6, "RandForest Estand IV")

In [55]:
best_model11

In [56]:
result11

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,578.157615,537917.072027,733.428301,0.774049,test,RandForest Estand IV
1,524.631779,453632.996264,673.522825,0.813299,train,RandForest Estand IV


In [57]:
best_model12, result12 = rg_est.grid_s_decision(param5, "GradientBoosting Estand III")

In [58]:
result12

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,540.440509,495840.812779,704.15965,0.791723,test,GradientBoosting Estand III
1,92.625315,18258.51395,135.124069,0.992485,train,GradientBoosting Estand III


In [59]:
best_model13, result13 = rg_sinest.grid_s_decision(param5, "GradientBoosting SinEstand III")

In [60]:
result13

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,551.511822,523461.555319,723.506431,0.780121,test,GradientBoosting SinEstand III
1,93.72443,18736.201686,136.880246,0.992289,train,GradientBoosting SinEstand III


In [61]:
param7 = {"max_depth": [4], 
        "max_features": [3],
        "min_samples_split": [50],
        "min_samples_leaf": [10]}

In [62]:
best_model14, result14 = rg_est.grid_s_decision(param7, "GradientBoosting Estand IV")

In [63]:
best_model14

In [64]:
result14

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,534.971212,495642.189693,704.0186,0.791807,test,GradientBoosting Estand IV
1,403.168946,275056.155982,524.457964,0.886796,train,GradientBoosting Estand IV


In [112]:
param8 = {"max_depth": [4], 
        "max_features": [3],
        "min_samples_split": [40],
        "min_samples_leaf": [9]}

In [113]:
best_model15, result15 = rg_est.grid_s_decision(param8, "GradientBoosting Estand V")

In [87]:
best_model15

In [114]:
result15

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,540.459827,487648.047927,698.318013,0.795165,test,GradientBoosting Estand V
1,394.909889,263909.421862,513.721152,0.891383,train,GradientBoosting Estand V
