# DECISION TREE Y RANDOM FOREST
---

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('once')

In [4]:
df_est = pd.read_csv("datos/09-bikes_encoding_est_mediana.csv", index_col = 0)
df_est.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered
0,0,0,0,0,0,0,2,14.110847,-0.680818,1.267606,-0.363437,654
1,0,0,1,1,2,1,2,14.902598,-0.741507,0.480415,0.874656,670
2,0,0,1,1,2,1,3,8.050924,-1.750344,-1.376017,0.871424,1229
3,2,0,1,1,0,0,3,8.2,-1.610886,-0.27741,-0.365545,1454
4,2,0,2,1,0,1,3,9.305237,-1.505615,-1.378284,0.008358,1518


In [5]:
df_est.season.unique()

array([0, 2, 3])

In [6]:
df_est.isnull().sum()

season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
registered    0
dtype: int64

In [7]:
# Creamos una clase para poder hacer el decision tree y el random forest más rápido
class Regres_lineal:
    def __init__(self, dataf, col_pred):
        
        self.dataf = dataf
        self.col_pred = col_pred

    def tree_param(self):
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        arbol = DecisionTreeRegressor(random_state =0)
    
        arbol.fit(x_train, y_train)

        max_feat = np.sqrt(len(x_train.columns))
        max_dep = arbol.tree_.max_depth
        return(max_feat, max_dep)

    def metricas(self, y_te, y_tr, y_te_pred, y_tr_pred, tipo_modelo):
        resultados = {'MAE': [mean_absolute_error(y_te, y_te_pred), mean_absolute_error(y_tr, y_tr_pred)],
                    'MSE': [mean_squared_error(y_te, y_te_pred), mean_squared_error(y_tr, y_tr_pred)],
                    'RMSE': [np.sqrt(mean_squared_error(y_te, y_te_pred)), np.sqrt(mean_squared_error(y_tr, y_tr_pred))],
                    'R2':  [r2_score(y_te, y_te_pred), r2_score(y_tr, y_tr_pred)],
                    "set": ["test", "train"]}
        df = pd.DataFrame(resultados)
        df["modelo"] = tipo_modelo
        return df

    def grid_s_decision(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= GradientBoostingRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

    def grid_s_forest(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= RandomForestRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

In [8]:

rg_est = Regres_lineal(df_est, "registered")

In [9]:
features_est, depth_est = rg_est.tree_param()

In [10]:
features_est #El número máx será 3

3.3166247903554

In [11]:
depth_est # La profundidad máxima será 22

21

In [12]:
param1 = {"max_depth": [16, 18, 20], 
        "max_features": [3,4],
        "min_samples_split": [50, 100],
        "min_samples_leaf": [50,100]} 

In [13]:
best_model1, result1 = rg_est.grid_s_decision(param1, "GradientBoosting Estand I")

In [14]:
best_model1

In [15]:
result1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,518.08456,420070.289479,648.128297,0.805394,test,GradientBoosting Estand I
1,475.313919,378641.69622,615.338684,0.848123,train,GradientBoosting Estand I


In [16]:
param2 = {"max_depth": [4,6,8,10, 12, 14, 16, 18, 20], 
        "max_features": [2, 3,4],
        "min_samples_split": [10, 20, 40, 50, 60],
        "min_samples_leaf": [10, 20, 40, 50, 60]} 

In [17]:
best_model2, result2 = rg_est.grid_s_decision(param2, "GradientBoosting Estand II")

In [18]:
best_model2

In [19]:
result2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,555.12463,484985.665556,696.409122,0.775321,test,GradientBoosting Estand II
1,409.400866,290007.797384,538.52372,0.883675,train,GradientBoosting Estand II


In [20]:
param3 = {"max_depth": [6, 8, 10, 12, 14, 16, 18], 
        "max_features": [3,4],
        "min_samples_split": [10, 20, 40],
        "min_samples_leaf": [10, 20, 40]} 

In [21]:
best_model3, result3 = rg_est.grid_s_forest(param3, "RandForest Estand I")

In [22]:
best_model3

In [23]:
result3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,570.373264,526960.401713,725.920382,0.755875,test,RandForest Estand I
1,557.750645,511128.619918,714.932598,0.794981,train,RandForest Estand I


In [24]:
param4 = {"max_depth": [12, 14, 16, 18], 
        "max_features": [4,5],
        "min_samples_split": [10, 20, 40, 50],
        "min_samples_leaf": [10, 20, 40, 50]} 

In [25]:
best_model4, result4 = rg_est.grid_s_forest(param4, "RandForest Estand II")

In [26]:
best_model4

In [27]:
result4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,569.057537,527088.187709,726.008394,0.755816,test,RandForest Estand II
1,549.377105,502798.206885,709.082652,0.798323,train,RandForest Estand II


In [28]:
df_sinest = pd.read_csv("datos/08-bikes_encoding_sinest_mediana.csv", index_col = 0)
df_sinest.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered
0,0,0,0,0,0,0,2,14.110847,18.18125,80.5833,10.749882,654
1,0,0,1,1,2,1,2,14.902598,17.68695,69.6087,16.652113,670
2,0,0,1,1,2,1,3,8.050924,9.47025,43.7273,16.636703,1229
3,2,0,1,1,0,0,3,8.2,10.6061,59.0435,10.739832,1454
4,2,0,2,1,0,1,3,9.305237,11.4635,43.6957,12.5223,1518


In [29]:
rg_sinest = Regres_lineal(df_sinest, "registered")

In [30]:
features_sinest, depth_sinest = rg_sinest.tree_param()

In [31]:
features_sinest #El número máx será 3

3.3166247903554

In [32]:
depth_sinest # La profundidad máxima será 22

21

In [33]:
best_model5, result5 = rg_sinest.grid_s_decision(param1, "GradientBoosting SinEstand I")

In [62]:
best_model5

In [34]:
result5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,523.921648,451293.945631,671.784151,0.790929,test,GradientBoosting SinEstand I
1,488.857285,395599.134112,628.966719,0.841321,train,GradientBoosting SinEstand I


In [35]:
best_model6, result6 = rg_sinest.grid_s_decision(param2, "GradientBoosting SinEstand II")

In [61]:
best_model6

In [36]:
result6

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,538.681797,451792.055005,672.154785,0.790699,test,GradientBoosting SinEstand II
1,412.533642,295223.781809,543.344993,0.881583,train,GradientBoosting SinEstand II


In [37]:
best_model7, result7 = rg_sinest.grid_s_forest(param3, "RandForest SinEstand I")

In [38]:
result7

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,566.278453,518685.914569,720.198524,0.759709,test,RandForest SinEstand I
1,560.802561,517203.377556,719.168532,0.792545,train,RandForest SinEstand I


In [39]:
best_model8, result8 = rg_sinest.grid_s_forest(param4, "RandForest SinEstand II")

In [40]:
result8

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,558.847373,518577.927042,720.12355,0.759759,test,RandForest SinEstand II
1,548.210704,500030.761436,707.128532,0.799433,train,RandForest SinEstand II


In [41]:
param5 = {"max_depth": [22], 
        "max_features": [4],
        "min_samples_split": [8],
        "min_samples_leaf": [8]}

In [42]:
best_model9, result9 = rg_sinest.grid_s_forest(param5, "RandForest SinEstand III")

In [43]:
result9

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,569.254381,525185.071327,724.696537,0.756698,test,RandForest SinEstand III
1,534.509586,474740.867707,689.014418,0.809577,train,RandForest SinEstand III


In [44]:
best_model10, result10 = rg_est.grid_s_forest(param5, "RandForest Estand III")

In [45]:
result10

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,574.757589,534156.191695,730.859899,0.752542,test,RandForest Estand III
1,535.304957,481582.626811,693.961546,0.806832,train,RandForest Estand III


In [46]:
param6 = {"max_depth": [18,20], 
        "max_features": [3,4],
        "min_samples_split": [8, 15],
        "min_samples_leaf": [8, 15]}

In [47]:
best_model11, result11 = rg_est.grid_s_forest(param6, "RandForest Estand IV")

In [48]:
best_model11

In [49]:
result11

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,576.927478,528619.886226,727.062505,0.755107,test,RandForest Estand IV
1,534.149255,475817.911209,689.795558,0.809145,train,RandForest Estand IV


In [50]:
best_model12, result12 = rg_est.grid_s_decision(param5, "GradientBoosting Estand III")

In [59]:
best_model12

In [51]:
result12

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,564.677578,541206.064443,735.667088,0.749276,test,GradientBoosting Estand III
1,92.874601,18227.101239,135.007782,0.992689,train,GradientBoosting Estand III


In [52]:
best_model13, result13 = rg_sinest.grid_s_decision(param5, "GradientBoosting SinEstand III")

In [53]:
result13

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,587.95201,555889.092003,745.579702,0.742474,test,GradientBoosting SinEstand III
1,91.975441,17972.147606,134.060239,0.992791,train,GradientBoosting SinEstand III


In [60]:
best_model13

In [54]:
param7 = {"max_depth": [18,20], 
        "max_features": [4],
        "min_samples_split": [8, 16],
        "min_samples_leaf": [8]}

In [55]:
best_model14, result14 = rg_est.grid_s_decision(param7, "GradientBoosting Estand IV")

In [56]:
best_model14

In [57]:
result14

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,575.277974,525663.66041,725.026662,0.756476,test,GradientBoosting Estand IV
1,91.177103,17707.076208,133.067938,0.992898,train,GradientBoosting Estand IV


In [77]:
param17 = {"max_depth": [10], 
        "max_features": [3],
        "min_samples_split": [50],
        "min_samples_leaf": [10]}

best_model15, result15 = rg_est.grid_s_decision(param17, "GradientBoosting Estand IV")

result15

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,547.711454,496937.367849,704.937847,0.769784,test,GradientBoosting Estand IV
1,274.658506,135655.240835,368.314052,0.945587,train,GradientBoosting Estand IV


In [85]:
param17 = {"max_depth": [16], 
        "max_features": [5],
        "min_samples_split": [50],
        "min_samples_leaf": [30]}

best_model16, result16 = rg_sinest.grid_s_decision(param17, "GradientBoosting SinEstand IV")

result16

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,507.387185,412276.037283,642.087251,0.809005,test,GradientBoosting SinEstand IV
1,381.582046,248433.480424,498.431019,0.900351,train,GradientBoosting SinEstand IV


In [None]:
best_model14