# DECISION TREE Y RANDOM FOREST
---

In [307]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

import pickle 
import warnings
warnings.filterwarnings('once')

In [222]:
df_est = pd.read_csv("datos/07-bikes_encoding_est_sinout.csv", index_col = 0)
df_est.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered
0,0,0,0,0,0,1,2,14.110847,-0.691698,1.254842,-0.360109,654
1,0,0,0,7,2,2,2,14.902598,-0.752316,0.465708,0.867064,670
2,0,0,1,7,2,2,4,8.050924,-1.759976,-1.395307,0.86386,1229
3,1,0,1,7,1,1,4,8.2,-1.620681,-0.293988,-0.362199,1454
4,1,0,2,7,1,2,4,9.305237,-1.515533,-1.397579,0.008406,1518


In [223]:
df_est.drop("temp", axis = 1, inplace= True)

In [224]:
df_est.season.unique()

array([0, 1, 2], dtype=int64)

In [225]:
df_est.isnull().sum()

season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
atemp         0
hum           0
windspeed     0
registered    0
dtype: int64

In [226]:
# Creamos una clase para poder hacer el decision tree y el random forest más rápido
class Regres_lineal:
    def __init__(self, dataf, col_pred):
        
        self.dataf = dataf
        self.col_pred = col_pred

    def tree_param(self):
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]

        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

        arbol = DecisionTreeRegressor(random_state =0)
    
        arbol.fit(x_train, y_train)

        max_feat = np.sqrt(len(x_train.columns))
        max_dep = arbol.tree_.max_depth
        return(max_feat, max_dep)

    def metricas(self, y_te, y_tr, y_te_pred, y_tr_pred, tipo_modelo):
        resultados = {'MAE': [mean_absolute_error(y_te, y_te_pred), mean_absolute_error(y_tr, y_tr_pred)],
                    'MSE': [mean_squared_error(y_te, y_te_pred), mean_squared_error(y_tr, y_tr_pred)],
                    'RMSE': [np.sqrt(mean_squared_error(y_te, y_te_pred)), np.sqrt(mean_squared_error(y_tr, y_tr_pred))],
                    'R2':  [r2_score(y_te, y_te_pred), r2_score(y_tr, y_tr_pred)],
                    "set": ["test", "train"]}
        df = pd.DataFrame(resultados)
        df["modelo"] = tipo_modelo
        return df

    def grid_s_decision(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= GradientBoostingRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

    def grid_s_forest(self, dict, model):
        self.dict = dict
        self.model = model
        
        X = self.dataf.drop(self.col_pred, axis = 1)
        y = self.dataf[self.col_pred]
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
        
        gs = GridSearchCV(
            estimator= RandomForestRegressor(), 
            param_grid= self.dict, 
            cv=10, 
            verbose=-1,
            return_train_score = True,
            scoring="neg_mean_squared_error")
        gs.fit(x_train, y_train)
        mejor_modelo = gs.best_estimator_
        y_pred_test_dt2 = mejor_modelo.predict(x_test)
        y_pred_train_dt2 = mejor_modelo.predict(x_train)
        dt_results = self.metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, self.model)
        return mejor_modelo, dt_results

In [227]:

rg_est = Regres_lineal(df_est, "registered")

In [228]:
features_est, depth_est = rg_est.tree_param()

In [229]:
features_est #El número máx será 3

3.1622776601683795

In [230]:
depth_est # La profundidad máxima será 22

18

In [231]:
param1 = {"max_depth": [5], 
        "max_features": [4],
        "min_samples_split": [50],
        "min_samples_leaf": [11]} 

In [232]:
best_model1, result1 = rg_est.grid_s_decision(param1, "GradientBoosting Estand I")

In [233]:
best_model1

In [234]:
result1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,564.452507,515552.932398,718.020148,0.783443,test,GradientBoosting Estand I
1,370.339247,232962.386792,482.661773,0.90412,train,GradientBoosting Estand I


In [235]:
param2 = {"max_depth": [8], 
        "max_features": [4],
        "min_samples_split": [40],
        "min_samples_leaf": [5]} 

In [236]:
best_model2, result2 = rg_est.grid_s_decision(param2, "GradientBoosting Estand II")

In [237]:
best_model2

In [238]:
result2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,546.906135,514019.430489,716.951484,0.784088,test,GradientBoosting Estand II
1,244.406219,103427.506522,321.601472,0.957433,train,GradientBoosting Estand II


In [239]:
param31 = {"max_depth": [4], 
        "max_features": [6],
        "min_samples_split": [50],
        "min_samples_leaf": [10]} 

In [240]:
best_model31, result31 = rg_est.grid_s_decision(param31, "GradientBoosting Estand III")

In [241]:
best_model31

In [308]:
with open ("datos/model_prediccion.pkl", "wb") as fp:
    pickle.dump(best_model31, fp)

In [242]:
result31

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,567.584823,517269.79968,719.21471,0.782722,test,GradientBoosting Estand III
1,400.000568,267136.076189,516.852083,0.890055,train,GradientBoosting Estand III


In [243]:
param32 = {"max_depth": [7], 
        "max_features": [5],
        "min_samples_split": [50],
        "min_samples_leaf": [10]} 

In [244]:
best_model32, result32 = rg_est.grid_s_decision(param32, "GradientBoosting Estand IV")

In [245]:
best_model32

In [246]:
result32

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,550.414196,506533.852446,711.711917,0.787232,test,GradientBoosting Estand IV
1,305.289044,159606.084557,399.507302,0.934311,train,GradientBoosting Estand IV


In [247]:
param3 = {"max_depth": [10], 
        "max_features": [5],
        "min_samples_split": [60],
        "min_samples_leaf": [15]} 

In [248]:
best_model3, result3 = rg_est.grid_s_forest(param3, "RandForest Estand I")

In [249]:
best_model3

In [250]:
result3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,662.713716,665470.325239,815.763645,0.720471,test,RandForest Estand I
1,651.620775,660120.493954,812.477996,0.728316,train,RandForest Estand I


In [251]:
param4 = {"max_depth": [4], 
        "max_features": [3],
        "min_samples_split": [10],
        "min_samples_leaf": [50]} 

In [252]:
best_model4, result4 = rg_est.grid_s_forest(param4, "RandForest Estand II")

In [253]:
best_model4

In [254]:
result4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,844.838712,1031229.0,1015.494659,0.566835,test,RandForest Estand II
1,826.377248,1042658.0,1021.106051,0.570876,train,RandForest Estand II


In [255]:
df_sinest = pd.read_csv("datos/06-bikes_encoding_sinest_sinout.csv", index_col = 0)
df_sinest.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,registered
0,0,0,0,0,0,1,2,14.110847,18.18125,80.5833,10.749882,654
1,0,0,0,7,2,2,2,14.902598,17.68695,69.6087,16.652113,670
2,0,0,1,7,2,2,4,8.050924,9.47025,43.7273,16.636703,1229
3,1,0,1,7,1,1,4,8.2,10.6061,59.0435,10.739832,1454
4,1,0,2,7,1,2,4,9.305237,11.4635,43.6957,12.5223,1518


In [256]:
df_sinest.drop("temp", axis= 1, inplace=True)

In [257]:
rg_sinest = Regres_lineal(df_sinest, "registered")

In [258]:
features_sinest, depth_sinest = rg_sinest.tree_param()

In [259]:
features_sinest #El número máx será 3

3.1622776601683795

In [260]:
depth_sinest # La profundidad máxima será 22

18

In [261]:
best_model5, result5 = rg_sinest.grid_s_decision(param1, "GradientBoosting SinEstand I")

In [262]:
result5

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,549.077826,514883.825812,717.554058,0.783724,test,GradientBoosting SinEstand I
1,374.592709,239196.244361,489.076931,0.901555,train,GradientBoosting SinEstand I


In [263]:
best_model6, result6 = rg_sinest.grid_s_decision(param2, "GradientBoosting SinEstand II")

In [264]:
result6

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,550.093791,510498.836824,714.492013,0.785566,test,GradientBoosting SinEstand II
1,241.795444,99570.760946,315.54835,0.95902,train,GradientBoosting SinEstand II


In [265]:
best_model7, result7 = rg_sinest.grid_s_forest(param3, "RandForest SinEstand I")

In [266]:
result7

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,671.068206,666565.993652,816.434929,0.720011,test,RandForest SinEstand I
1,652.691788,659672.210146,812.202075,0.7285,train,RandForest SinEstand I


In [267]:
best_model8, result8 = rg_sinest.grid_s_forest(param4, "RandForest SinEstand II")

In [268]:
result8

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,828.11129,994847.1,997.420215,0.582117,test,RandForest SinEstand II
1,815.407659,1012561.0,1006.261114,0.583262,train,RandForest SinEstand II


In [292]:
param5 = {"max_depth": [4], 
        "max_features": [6],
        "min_samples_split": [50],
        "min_samples_leaf": [10]}

In [270]:
best_model9, result9 = rg_sinest.grid_s_forest(param5, "RandForest SinEstand III")

In [271]:
result9

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,603.122138,574542.496175,757.985815,0.758665,test,RandForest SinEstand III
1,537.485207,468344.206483,684.356783,0.807245,train,RandForest SinEstand III


In [293]:
best_model10, result10 = rg_est.grid_s_forest(param5, "RandForest Estand III")

In [294]:
result10

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,643.559213,631563.684979,794.709812,0.734713,test,RandForest Estand III
1,630.98166,619449.713353,787.051277,0.745054,train,RandForest Estand III


In [274]:
param6 = {"max_depth": [18,20], 
        "max_features": [3,4],
        "min_samples_split": [8, 15],
        "min_samples_leaf": [8, 15]}

In [275]:
best_model11, result11 = rg_est.grid_s_forest(param6, "RandForest Estand IV")

In [276]:
best_model11

In [277]:
result11

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,603.561013,565292.389302,751.859288,0.76255,test,RandForest Estand IV
1,539.256861,467109.554529,683.454135,0.807753,train,RandForest Estand IV


In [304]:
best_model12, result12 = rg_est.grid_s_decision(param5, "GradientBoosting Estand III")

In [305]:
result12

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,544.19033,489101.476653,699.357903,0.794554,test,GradientBoosting Estand III
1,396.560299,264991.840242,514.773582,0.890938,train,GradientBoosting Estand III


In [280]:
best_model13, result13 = rg_sinest.grid_s_decision(param5, "GradientBoosting SinEstand III")

In [281]:
result13

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,557.424388,513469.601138,716.567932,0.784318,test,GradientBoosting SinEstand III
1,104.497214,23155.549108,152.169475,0.99047,train,GradientBoosting SinEstand III


In [282]:
param7 = {"max_depth": [4], 
        "max_features": [3],
        "min_samples_split": [50],
        "min_samples_leaf": [10]}

In [283]:
best_model14, result14 = rg_est.grid_s_decision(param7, "GradientBoosting Estand IV")

In [284]:
best_model14

In [285]:
result14

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,541.966598,505631.041807,711.077381,0.787611,test,GradientBoosting Estand IV
1,418.156422,296353.394897,544.3835,0.87803,train,GradientBoosting Estand IV


In [286]:
param8 = {"max_depth": [4], 
        "max_features": [3],
        "min_samples_split": [40],
        "min_samples_leaf": [9]}

In [287]:
best_model15, result15 = rg_est.grid_s_decision(param8, "GradientBoosting Estand V")

In [288]:
best_model15

In [289]:
result15

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,566.091662,530862.228087,728.602929,0.777013,test,GradientBoosting Estand V
1,413.285845,283483.715671,532.431888,0.883327,train,GradientBoosting Estand V
