In [36]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [37]:
df = pd.read_csv("datos/bikes_stand.csv", index_col=0)
df.head(5)

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,registered
0,2018-01-01,winter,0,1,1,Monday,0,2,-0.486274,0.855582,-0.208909,654
1,2018-01-02,winter,0,1,0,Tuesday,1,2,-0.42569,0.332722,0.68755,670
2,2018-01-03,winter,0,1,0,Wednesday,1,1,-0.94997,-0.900337,0.68521,1229
3,2018-01-04,winter,0,1,0,Thursday,1,1,-0.938563,-0.170632,-0.210435,1454
4,2018-01-05,winter,0,1,0,Friday,1,1,-0.853992,-0.901843,0.060294,1518


In [38]:
lista_categoricas = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

In [39]:
df["season_encoding"] = df["season"].map({"winter": 1, "autumn": 2, "spring": 2, "summer": 2})
df['mnth_encoding'] = df['mnth'].map({1:1, 2:1, 3:1, 4:2, 5:2, 6:2, 7:2, 8:2, 9:2, 10:2, 11:2, 12:2 })
df['holiday_encoding'] = df['holiday'].map({0:1, 1:0})
df['weekday_encoding'] = df['weekday'].map({'Monday':1, 'Tuesday':1, 'Wednesday':2, 'Thursday':2, 'Friday':2, 'Saturday':2, 'Sunday':2})
df['weathersit_encoding'] = df['weathersit'].map({1:3, 2:2.5, 3:1})

In [40]:
oh = OneHotEncoder()
transformados_wd = oh.fit_transform(df[["workingday"]])
oh_df = pd.DataFrame(transformados_wd.toarray())
oh_df.columns = oh.get_feature_names_out()
df = pd.concat([df,oh_df],axis=1)

In [41]:
df.drop(columns=['dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit'], inplace=True)

In [42]:
df.head(1)

Unnamed: 0,temp,hum,windspeed,registered,season_encoding,mnth_encoding,holiday_encoding,weekday_encoding,weathersit_encoding,workingday_0,workingday_1
0,-0.486274,0.855582,-0.208909,654,1,1,0,1,2.5,1.0,0.0


In [43]:
X = df.drop("registered", axis = 1)
y = df["registered"]

In [44]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 23)

In [45]:
arbol = DecisionTreeRegressor(random_state = 23)

arbol.fit(x_train, y_train)

In [46]:
# fig = plt.figure(figsize = (10,6))
# tree.plot_tree(arbol, feature_names = x_train.columns, filled = True)
# plt.show()

In [47]:
max_features = np.sqrt(len(x_train.columns))
max_features

3.1622776601683795

In [48]:
print(arbol.tree_.max_depth)

19


In [49]:
y_pred_test_dt = arbol.predict(x_test)
y_pred_train_dt = arbol.predict(x_train)

In [50]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [51]:
dt_results1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decision Tree I")
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1039.979452,1755755.0,1325.048986,0.148978,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I


In [52]:
param = {"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10], 
        "max_features": [1,2,3],
        "min_samples_split": [10, 50, 150, 200, 250, 300, 350, 400],
        "min_samples_leaf": [10, 50, 150, 200, 250, 300, 350, 400]} 

In [53]:
gs = GridSearchCV(
            estimator=DecisionTreeRegressor(), 
            param_grid= param, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [54]:
gs.fit(x_train, y_train)

In [55]:
mejor_modelo = gs.best_estimator_
mejor_modelo

In [56]:
y_pred_test_dt2 = mejor_modelo.predict(x_test)
y_pred_train_dt2 = mejor_modelo.predict(x_train)

In [57]:
dt_results2 = metricas(y_test, y_train, y_pred_test_dt2, y_pred_train_dt2, "Decision tree II")
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1002.271692,1450769.0,1204.47861,0.296806,test,Decision tree II
1,914.683268,1197037.0,1094.092005,0.521944,train,Decision tree II


In [58]:
df_decision_results = pd.concat([dt_results1, dt_results2], axis = 0)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1039.979452,1755755.0,1325.048986,0.148978,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,1002.271692,1450769.0,1204.47861,0.296806,test,Decision tree II
1,914.683268,1197037.0,1094.092005,0.521944,train,Decision tree II


In [59]:
param2 = {"max_depth": [4, 5, 6, 7, 8, 9], 
        "max_features": [1,2,3],
        "min_samples_split": [5, 10, 20, 50, 150, 200],
        "min_samples_leaf": [5, 10, 20, 50, 150, 200]} 

In [60]:
gs2 = GridSearchCV(
            estimator=DecisionTreeRegressor(), 
            param_grid= param2, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [61]:
gs2.fit(x_train, y_train)

In [62]:
mejor_modelo2 = gs2.best_estimator_
mejor_modelo2

In [63]:
y_pred_test_dt3 = mejor_modelo2.predict(x_test)
y_pred_train_dt3 = mejor_modelo2.predict(x_train)

In [64]:
dt_results3 = metricas(y_test, y_train, y_pred_test_dt3, y_pred_train_dt3, "Decision tree III")
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,947.678114,1208085.0,1099.129236,0.414436,test,Decision tree III
1,877.807606,1143607.0,1069.395606,0.543283,train,Decision tree III


In [65]:
df_decision_results = pd.concat([df_decision_results, dt_results3], axis = 0)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1039.979452,1755755.0,1325.048986,0.148978,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,1002.271692,1450769.0,1204.47861,0.296806,test,Decision tree II
1,914.683268,1197037.0,1094.092005,0.521944,train,Decision tree II
0,947.678114,1208085.0,1099.129236,0.414436,test,Decision tree III
1,877.807606,1143607.0,1069.395606,0.543283,train,Decision tree III


In [71]:
bosque = RandomForestRegressor(max_depth=6, max_features=3, min_samples_leaf=5,min_samples_split=20, random_state = 23)
bosque.fit(x_train, y_train)
                

In [72]:
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [73]:
rf_results = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest")
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,919.527618,1138429.0,1066.971911,0.448198,test,Random Forest
1,882.21589,1043543.0,1021.539494,0.583245,train,Random Forest


In [74]:
results = pd.concat([df_decision_results, rf_results], axis = 0)
results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1039.979452,1755755.0,1325.048986,0.148978,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,1002.271692,1450769.0,1204.47861,0.296806,test,Decision tree II
1,914.683268,1197037.0,1094.092005,0.521944,train,Decision tree II
0,947.678114,1208085.0,1099.129236,0.414436,test,Decision tree III
1,877.807606,1143607.0,1069.395606,0.543283,train,Decision tree III
0,919.527618,1138429.0,1066.971911,0.448198,test,Random Forest
1,882.21589,1043543.0,1021.539494,0.583245,train,Random Forest


In [75]:
gs_rf = GridSearchCV(
            estimator=RandomForestRegressor(), # tipo de modelo que queremos hacer
            param_grid= param2, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error") # la métrica que queremos que nos devuelva

In [76]:
gs_rf.fit(x_train, y_train)

In [77]:
bosque2 = gs_rf.best_estimator_
bosque2

In [78]:
y_pred_test_rf2 = bosque.predict(x_test)
y_pred_train_rf2 = bosque.predict(x_train)

In [79]:

dt_results_rf2 = metricas(y_test, y_train, y_pred_test_rf2, y_pred_train_rf2, "Random Forest 2")
dt_results_rf2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,919.527618,1138429.0,1066.971911,0.448198,test,Random Forest 2
1,882.21589,1043543.0,1021.539494,0.583245,train,Random Forest 2


In [80]:
results = pd.concat([results, dt_results_rf2], axis = 0)
results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1039.979452,1755755.0,1325.048986,0.148978,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,1002.271692,1450769.0,1204.47861,0.296806,test,Decision tree II
1,914.683268,1197037.0,1094.092005,0.521944,train,Decision tree II
0,947.678114,1208085.0,1099.129236,0.414436,test,Decision tree III
1,877.807606,1143607.0,1069.395606,0.543283,train,Decision tree III
0,919.527618,1138429.0,1066.971911,0.448198,test,Random Forest
1,882.21589,1043543.0,1021.539494,0.583245,train,Random Forest
0,919.527618,1138429.0,1066.971911,0.448198,test,Random Forest 2
1,882.21589,1043543.0,1021.539494,0.583245,train,Random Forest 2
