In [90]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [91]:
# Load the bike dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    "datos/bikes_stand.csv",
    index_col=0,
)
df.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,registered
0,2018-01-01,winter,0,1,1,Monday,0,2,-0.486274,0.855582,-0.208909,654
1,2018-01-02,winter,0,1,0,Tuesday,1,2,-0.42569,0.332722,0.68755,670
2,2018-01-03,winter,0,1,0,Wednesday,1,1,-0.94997,-0.900337,0.68521,1229
3,2018-01-04,winter,0,1,0,Thursday,1,1,-0.938563,-0.170632,-0.210435,1454
4,2018-01-05,winter,0,1,0,Friday,1,1,-0.853992,-0.901843,0.060294,1518


In [92]:
# Names of the categorical predictor columns in the raw frame.
lista_categoricas = [
    'season',
    'yr',
    'mnth',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]

In [93]:
# Ordinal encodings for the categorical predictors. Several categories share a
# weight; the weights are presumably taken from earlier exploratory analysis --
# TODO confirm. Any value missing from a mapping becomes NaN under Series.map.
df["mnth_encoding"] = df["mnth"].map({
    1: 1, 2: 1, 3: 1,
    4: 2, 5: 2,
    6: 2.5, 7: 2.5, 8: 2.5, 9: 2.5, 10: 2.5,
    11: 2, 12: 2,
})

# Flip the flag: 0 maps to 1 and 1 maps to 0.
df["holiday_encoding"] = df["holiday"].map({0: 1, 1: 0})

# Monday and Tuesday form one group, the rest of the week another.
df["weekday_encoding"] = df["weekday"].map({
    "Monday": 1,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 2,
    "Friday": 2,
    "Saturday": 2,
    "Sunday": 2,
})

# Winter is separated from the other three seasons.
df["season_encoding"] = df["season"].map({
    "winter": 1,
    "autumn": 3,
    "spring": 3,
    "summer": 3,
})

df["weathersit_encoding"] = df["weathersit"].map({1: 3, 2: 2.5, 3: 1})

In [94]:
# One-hot encode `workingday` into one indicator column per category.
oh = OneHotEncoder()
transformados_wd = oh.fit_transform(df[["workingday"]])

# Reuse df's index so the column-wise concat below lines up row for row.
# A default RangeIndex only matches when df's index is exactly 0..n-1;
# otherwise concat aligns on labels and silently introduces NaN rows.
oh_df = pd.DataFrame(
    transformados_wd.toarray(),
    columns=oh.get_feature_names_out(),
    index=df.index,
)
df = pd.concat([df, oh_df], axis=1)

In [95]:
# Remove the date and the raw categorical columns from df in place.
df.drop(
    columns=[
        'dteday',
        'season',
        'yr',
        'mnth',
        'holiday',
        'weekday',
        'workingday',
        'weathersit',
    ],
    inplace=True,
)

In [96]:
# Separate the `registered` target column from the predictor matrix.
y = df["registered"]
X = df.drop(columns="registered")

In [97]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

In [98]:
# Baseline model: a decision tree with default hyperparameters and a fixed seed.
arbol = DecisionTreeRegressor(random_state=23)
arbol.fit(X=x_train, y=y_train)

In [99]:
# Square root of the number of predictor columns; presumably used as a guide
# for the `max_features` values tried in the grids below -- TODO confirm.
max_features = np.sqrt(x_train.shape[1])
max_features

3.1622776601683795

In [100]:
# Depth reached by the baseline tree.
print(arbol.get_depth())

19


In [101]:
# Baseline-tree predictions for the test and train partitions (in that order).
y_pred_test_dt, y_pred_train_dt = map(arbol.predict, (x_test, x_train))

In [102]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics (test row first, then train).

    Args:
        y_test: True target values for the test partition.
        y_train: True target values for the training partition.
        y_test_pred: Predictions for the test partition.
        y_train_pred: Predictions for the training partition.
        tipo_modelo: Label written to the ``modelo`` column of both rows.

    Returns:
        pandas.DataFrame with the columns MAE, MSE, RMSE, R2, set and modelo.
    """
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    filas = []
    for nombre, reales, predichos in particiones:
        # RMSE is the square root of the MSE, so compute the MSE once per partition.
        mse = mean_squared_error(reales, predichos)
        filas.append({
            "MAE": mean_absolute_error(reales, predichos),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2": r2_score(reales, predichos),
            "set": nombre,
        })

    tabla = pd.DataFrame(filas)
    tabla["modelo"] = tipo_modelo
    return tabla

In [103]:
# Metrics for the baseline tree on both partitions.
dt_results1 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt,
    y_train_pred=y_pred_train_dt,
    tipo_modelo="Decision Tree I",
)
dt_results1

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1132.938356,2135422.0,1461.308241,-0.035049,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I


In [104]:
# First, coarse hyperparameter grid for the decision tree search.
param = {
    "max_depth": list(range(2, 11)),
    "max_features": [1, 2, 3],
    "min_samples_split": [10, 50, 150, 200, 250, 300, 350, 400],
    "min_samples_leaf": [10, 50, 150, 200, 250, 300, 350, 400],
}

In [105]:
# Exhaustive search over the `param` grid for a single decision tree, using
# 10-fold cross-validation and negative MSE as the selection score.
gs = GridSearchCV(
    # Seed the tree: `max_features` in the grid is smaller than the number of
    # columns, so candidate features are sampled at each split and an unseeded
    # estimator makes the selected model change from run to run.
    estimator=DecisionTreeRegressor(random_state=23),
    param_grid=param,
    cv=10,
    # 0 is the documented silent level; negative values are rejected by
    # scikit-learn releases that validate constructor parameters.
    verbose=0,
    # Keep the training-fold scores in cv_results_ as well.
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [106]:
# Run the first grid search on the training partition only.
gs.fit(X=x_train, y=y_train)

In [107]:
# Estimator selected by the first grid search; shown as the cell output.
mejor_modelo = gs.best_estimator_
mejor_modelo

In [108]:
# Predictions of the first tuned tree for the test and train partitions (in that order).
y_pred_test_dt2, y_pred_train_dt2 = map(mejor_modelo.predict, (x_test, x_train))

In [109]:
# Metrics for the first tuned tree on both partitions.
dt_results2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt2,
    y_train_pred=y_pred_train_dt2,
    tipo_modelo="Decision tree II",
)
dt_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1139.815349,1835115.0,1354.664121,0.110512,test,Decision tree II
1,877.868539,1141510.0,1068.414935,0.54412,train,Decision tree II


In [110]:
# Stack the two metric tables row-wise; each table keeps its own 0/1 index labels.
df_decision_results = pd.concat([dt_results1, dt_results2])
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1132.938356,2135422.0,1461.308241,-0.035049,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,1139.815349,1835115.0,1354.664121,0.110512,test,Decision tree II
1,877.868539,1141510.0,1068.414935,0.54412,train,Decision tree II


In [111]:
# Second, narrower grid with smaller split and leaf sizes.
param2 = {
    "max_depth": list(range(4, 10)),
    "max_features": [1, 2, 3],
    "min_samples_split": [5, 10, 15, 20, 50],
    "min_samples_leaf": [5, 10, 15, 20, 50],
}

In [112]:
# Exhaustive search over the `param2` grid for a single decision tree, using
# 10-fold cross-validation and negative MSE as the selection score.
gs2 = GridSearchCV(
    # Seed the tree: `max_features` in the grid is smaller than the number of
    # columns, so candidate features are sampled at each split and an unseeded
    # estimator makes the selected model change from run to run.
    estimator=DecisionTreeRegressor(random_state=23),
    param_grid=param2,
    cv=10,
    # 0 is the documented silent level; negative values are rejected by
    # scikit-learn releases that validate constructor parameters.
    verbose=0,
    # Keep the training-fold scores in cv_results_ as well.
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [113]:
# Run the second grid search on the training partition only.
gs2.fit(X=x_train, y=y_train)

In [114]:
# Estimator selected by the second grid search; shown as the cell output.
mejor_modelo2 = gs2.best_estimator_
mejor_modelo2

In [115]:
# Predictions of the second tuned tree for the test and train partitions (in that order).
y_pred_test_dt3, y_pred_train_dt3 = map(mejor_modelo2.predict, (x_test, x_train))

In [116]:
# Metrics for the second tuned tree on both partitions.
dt_results3 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt3,
    y_train_pred=y_pred_train_dt3,
    tipo_modelo="Decision tree III",
)
dt_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1036.511859,1535340.0,1239.08824,0.255814,test,Decision tree III
1,858.387938,1086342.0,1042.277444,0.566152,train,Decision tree III


In [117]:
# Rebuild the comparison table from the three per-model tables instead of
# appending to the previous value of `df_decision_results`: the cell is then
# idempotent, so re-running it cannot duplicate the "Decision tree III" rows.
# On a top-to-bottom run the result matches appending dt_results3 to the
# earlier two-model table.
df_decision_results = pd.concat([dt_results1, dt_results2, dt_results3], axis=0)
df_decision_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,1132.938356,2135422.0,1461.308241,-0.035049,test,Decision Tree I
1,0.0,0.0,0.0,1.0,train,Decision Tree I
0,1139.815349,1835115.0,1354.664121,0.110512,test,Decision tree II
1,877.868539,1141510.0,1068.414935,0.54412,train,Decision tree II
0,1036.511859,1535340.0,1239.08824,0.255814,test,Decision tree III
1,858.387938,1086342.0,1042.277444,0.566152,train,Decision tree III


In [121]:
# Random forest with hand-set hyperparameters; presumably the values chosen by
# the decision-tree search above -- TODO confirm against mejor_modelo2.
bosque = RandomForestRegressor(
    max_depth=7,
    max_features=3,
    min_samples_leaf=5,
    min_samples_split=15,
    random_state=23,
)
bosque.fit(X=x_train, y=y_train)

In [122]:
# Random-forest predictions for the test and train partitions (in that order).
y_pred_test_rf, y_pred_train_rf = map(bosque.predict, (x_test, x_train))

In [123]:
# Metrics for the hand-configured random forest on both partitions.
rf_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf,
    y_train_pred=y_pred_train_rf,
    tipo_modelo="Random Forest",
)
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,926.31573,1142203.0,1068.739103,0.446369,test,Random Forest
1,840.749462,939655.5,969.358284,0.624734,train,Random Forest


In [124]:
# Exhaustive search over the `param2` grid for a random forest, using 10-fold
# cross-validation and negative MSE as the selection score.
gs_rf = GridSearchCV(
    # Model type to tune. Seeded because a random forest is stochastic; an
    # unseeded estimator makes the selected model change from run to run.
    estimator=RandomForestRegressor(random_state=23),
    # Hyperparameter combinations to evaluate.
    param_grid=param2,
    # Number of cross-validation folds.
    cv=10,
    # Print no progress messages. 0 is the documented silent level; negative
    # values are rejected by scikit-learn releases that validate constructor
    # parameters.
    verbose=0,
    # Also record the scores obtained on the training folds.
    return_train_score=True,
    # Metric used to rank the candidates.
    scoring="neg_mean_squared_error",
)

In [125]:
# Run the random-forest grid search on the training partition only.
gs_rf.fit(X=x_train, y=y_train)

In [126]:
# Estimator selected by the random-forest grid search; shown as the cell output.
bosque2 = gs_rf.best_estimator_
bosque2

In [127]:
# Tuned-forest predictions for the test and train partitions (in that order).
y_pred_test_rf2, y_pred_train_rf2 = map(bosque2.predict, (x_test, x_train))

In [128]:
# Metrics for the tuned random forest on both partitions.
# NOTE(review): the `dt_` prefix is kept although this table describes a random forest.
dt_results_rf2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf2,
    y_train_pred=y_pred_train_rf2,
    tipo_modelo="Random Forest II",
)
dt_results_rf2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,922.100292,1126865.0,1061.539031,0.453803,test,Random Forest II
1,820.471304,895176.8,946.137835,0.642497,train,Random Forest II
