# DECISION TREE Y RANDOM FOREST
---

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('once')

In [17]:
df_est = pd.read_csv("datos/05-bikes_encoding_est.csv", index_col = 0)
df_est.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,atemp,hum,windspeed,registered
0,0,0,0,1,0,0,1,-0.691698,1.254842,-0.360109,654
1,0,0,1,0,2,1,1,-0.752316,0.465708,0.867064,670
2,0,0,1,0,2,1,2,-1.759976,-1.395307,0.86386,1229
3,1,0,1,0,0,0,2,-1.620681,-0.293988,-0.362199,1454
4,1,0,2,0,0,1,2,-1.515533,-1.397579,0.008406,1518


In [18]:
df_est.isnull().sum()

season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
atemp         0
hum           0
windspeed     0
registered    0
dtype: int64

In [19]:
# Creamos una clase para poder hacer el decision tree y el random forest más rápido
class Regres_lineal:
    def __init__(self, dataf, col_pred):
        
        self.dataf = dataf
        self.col_pred = col_pred

    def tree_param(self):
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        arbol = DecisionTreeRegressor(random_state =0)
    
        arbol.fit(x_train, y_train)

        max_feat = np.sqrt(len(x_train.columns))
        max_dep = arbol.tree_.max_depth
        return(max_feat, max_dep)

    def metricas(self, y_te, y_tr, y_te_pred, y_tr_pred, tipo_modelo):
        resultados = {'MAE': [mean_absolute_error(y_te, y_te_pred), mean_absolute_error(y_tr, y_tr_pred)],
                    'MSE': [mean_squared_error(y_te, y_te_pred), mean_squared_error(y_tr, y_tr_pred)],
                    'RMSE': [np.sqrt(mean_squared_error(y_te, y_te_pred)), np.sqrt(mean_squared_error(y_tr, y_tr_pred))],
                    'R2':  [r2_score(y_te, y_te_pred), r2_score(y_tr, y_tr_pred)],
                    "set": ["test", "train"]}
        df = pd.DataFrame(resultados)
        df["modelo"] = tipo_modelo
        return df

    def grid_s_decision(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= DecisionTreeRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

    def grid_s_forest(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= DecisionTreeRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

In [20]:

rg_est = Regres_lineal(df_est, "registered")

In [21]:
features_est, depth_est = rg_est.tree_param()

In [22]:
features_est #El número máx será 3

3.1622776601683795

In [23]:
depth_est # La profundidad máxima será 22

22

In [24]:
param1 = {"max_depth": [16, 18, 20], 
        "max_features": [3,4],
        "min_samples_split": [50, 100],
        "min_samples_leaf": [50,100]} 

In [25]:
best_model1, result1 = rg_est.grid_s_decision(param1, "DecTree Estand I")

In [26]:
best_model1

In [27]:
result1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,818.911805,958043.1,978.796765,0.597577,test,DecTree Estand I
1,846.768477,1084407.0,1041.348534,0.553693,train,DecTree Estand I


In [32]:
param2 = {"max_depth": [4,6,8,10, 12, 14, 16, 18, 20], 
        "max_features": [2, 3,4],
        "min_samples_split": [10, 20, 40, 50, 60],
        "min_samples_leaf": [10, 20, 40, 50, 60]} 

In [33]:
best_model2, result2 = rg_est.grid_s_decision(param2, "DecTree Estand II")

In [34]:
best_model2

In [35]:
result2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,729.004709,990470.667614,995.223928,0.583955,test,DecTree Estand II
1,646.655301,686308.875862,828.437611,0.717537,train,DecTree Estand II


In [36]:
param3 = {"max_depth": [6, 8, 10, 12, 14, 16, 18], 
        "max_features": [3,4],
        "min_samples_split": [10, 20, 40],
        "min_samples_leaf": [10, 20, 40]} 

In [54]:
best_model3, result3 = rg_est.grid_s_forest(param3, "RandForest Estand I")

In [55]:
best_model3

In [56]:
result3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,846.382108,1174267.0,1083.636009,0.506752,test,RandForest Estand I
1,732.879274,915570.1,956.854259,0.623181,train,RandForest Estand I


In [41]:
param4 = {"max_depth": [12, 14, 16, 18], 
        "max_features": [4,5],
        "min_samples_split": [10, 20, 40, 50],
        "min_samples_leaf": [10, 20, 40, 50]} 

In [57]:
best_model4, result4 = rg_est.grid_s_forest(param4, "RandForest Estand II")

In [58]:
best_model4

In [59]:
result4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,625.92612,666415.644077,816.342847,0.720074,test,RandForest Estand II
1,536.162959,519182.131844,720.542942,0.786321,train,RandForest Estand II


In [45]:
df_sinest = pd.read_csv("datos/04-bikes_encoding_sinest.csv", index_col = 0)
df_sinest.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,atemp,hum,windspeed,registered
0,0,0,0,1,0,0,1,18.18125,80.5833,10.749882,654
1,0,0,1,0,2,1,1,17.68695,69.6087,16.652113,670
2,0,0,1,0,2,1,2,9.47025,43.7273,16.636703,1229
3,1,0,1,0,0,0,2,10.6061,59.0435,10.739832,1454
4,1,0,2,0,0,1,2,11.4635,43.6957,12.5223,1518


In [46]:
rg_sinest = Regres_lineal(df_sinest, "registered")

In [47]:
features_sinest, depth_sinest = rg_sinest.tree_param()

In [48]:
features_sinest #El número máx será 3

3.1622776601683795

In [49]:
depth_sinest # La profundidad máxima será 22

22

In [50]:
best_model5, result5 = rg_sinest.grid_s_decision(param1, "DecTree SinEstand I")

In [51]:
result5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,820.988324,1091229.0,1044.619188,0.541632,test,DecTree SinEstand I
1,811.624896,1102107.0,1049.812879,0.546408,train,DecTree SinEstand I


In [52]:
best_model6, result6 = rg_sinest.grid_s_decision(param2, "DecTree SinEstand II")

In [53]:
result6

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,807.375825,1131005.0,1063.487147,0.524924,test,DecTree SinEstand II
1,676.944797,794552.7,891.376869,0.672988,train,DecTree SinEstand II


In [60]:
best_model7, result7 = rg_sinest.grid_s_forest(param3, "RandForest SinEstand I")

In [61]:
result7

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,748.605174,911238.443834,954.588102,0.617237,test,RandForest SinEstand I
1,642.073934,694288.829311,833.239959,0.714253,train,RandForest SinEstand I


In [62]:
best_model8, result8 = rg_sinest.grid_s_forest(param4, "RandForest SinEstand II")

In [63]:
result8

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,779.297474,1012193.0,1006.07822,0.574831,test,RandForest SinEstand II
1,581.362588,584189.1,764.322668,0.759567,train,RandForest SinEstand II


In [64]:
param5 = {"max_depth": [22], 
        "max_features": [4],
        "min_samples_split": [8],
        "min_samples_leaf": [8]}

In [65]:
best_model9, result9 = rg_sinest.grid_s_forest(param5, "RandForest SinEstand III")

In [66]:
result9

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,849.0485,1128998.0,1062.543348,0.525767,test,RandForest SinEstand III
1,627.041464,692354.5,832.078393,0.715049,train,RandForest SinEstand III


In [67]:
best_model10, result10 = rg_est.grid_s_forest(param5, "RandForest Estand III")

In [68]:
result10

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,628.242922,671020.96846,819.158696,0.718139,test,RandForest Estand III
1,549.712761,529505.676528,727.671407,0.782072,train,RandForest Estand III


In [73]:
param6 = {"max_depth": [18,20], 
        "max_features": [3,4],
        "min_samples_split": [8, 15],
        "min_samples_leaf": [8, 15]}

In [74]:
best_model11, result11 = rg_est.grid_s_forest(param6, "RandForest Estand IV")

In [75]:
best_model11

In [76]:
result11

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,730.548591,871404.967424,933.490743,0.633969,test,RandForest Estand IV
1,700.544422,850547.572346,922.251361,0.649942,train,RandForest Estand IV


In [77]:
best_model12, result12 = rg_est.grid_s_decision(param5, "Decision Estand III")

In [78]:
result12

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,687.644868,786111.440357,886.629258,0.669796,test,Decision Estand III
1,541.19365,516839.638863,718.915599,0.787285,train,Decision Estand III


In [79]:
best_model13, result13 = rg_sinest.grid_s_decision(param5, "Decision SinEstand III")

In [80]:
result13

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,674.581336,749212.771442,865.570778,0.685295,test,Decision Estand III
1,534.556869,482993.122686,694.977066,0.801216,train,Decision Estand III


In [82]:
param7 = {"max_depth": [18,20], 
        "max_features": [4],
        "min_samples_split": [8, 16],
        "min_samples_leaf": [8]}

In [85]:
best_model14, result14 = rg_est.grid_s_decision(param7, "Decision Estand IV")

In [87]:
best_model14

In [86]:
result14

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,752.19763,923329.21196,960.900209,0.612158,test,Decision Estand IV
1,608.384794,627582.211735,792.200866,0.741707,train,Decision Estand IV
