In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

  ### DECISION TREE

In [2]:
# Load the dataset used for modelling; `cliente_casual` is the column used as target further down.
# NOTE(review): read_pickle can execute arbitrary code while unpickling — only load files from a trusted source.
df_casual = pd.read_pickle('datos/bikes_casual_estand.pkl')
# Preview the first rows as the cell output.
df_casual.head()

Unnamed: 0,temperatura,velocidad_viento,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,cliente_casual
1,-1.636215,0.744965,1.0,1.0,1.0,1,1.5,1.5,3,131.0
2,-1.616347,-0.390073,1.0,1.0,1.5,3,1.0,1.0,3,120.0
3,-1.469045,-0.046981,1.0,1.0,3.5,3,1.0,1.0,4,108.0
4,-1.592588,-1.302236,2.0,1.0,3.0,3,1.5,1.5,4,82.0
5,-1.635352,-0.281358,2.0,1.0,3.5,3,1.0,1.0,4,88.0


In [3]:
# Separate predictors from the target, hold out 20% of the rows for testing,
# and fit an unconstrained decision tree as the baseline model.

y = df_casual["cliente_casual"]
X = df_casual.drop(columns="cliente_casual")

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
arbol = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

# Baseline predictions on both partitions.
y_pred_train_dt = arbol.predict(x_train)
y_pred_test_dt = arbol.predict(x_test)

In [4]:
# Square root of the number of predictor columns (3.0) and depth reached by
# the unconstrained baseline tree (22); both guide the ranges of the grid below.

max_features = np.sqrt(x_train.shape[1])

# One value per line, same output as two consecutive print() calls.
print(max_features, arbol.get_depth(), sep="\n")

3.0
22


In [5]:
# Hyperparameter grid explored by the grid searches in this notebook.
# Value order is kept as-is because it determines the order in which
# candidates are evaluated.
param = dict(
    max_depth=[4, 8, 6, 10, 12],
    max_features=[1, 2, 3],
    min_samples_split=[10, 30, 50, 100],
    min_samples_leaf=[10, 30, 50, 100],
)

In [6]:
# Exhaustive search over `param` for a decision tree, using 10-fold
# cross-validation and negative MSE as the selection score.
gs = GridSearchCV(
            # Seed the estimator: with max_features below the number of columns
            # the candidate features are drawn at random at each split, so an
            # unseeded tree makes the search (and the chosen model) non-reproducible.
            estimator=DecisionTreeRegressor(random_state=0),
            param_grid=param,
            cv=10,
            # verbose must be a non-negative integer; -1 is rejected by the
            # parameter validation of recent scikit-learn releases. 0 = silent.
            verbose=0,
            # 240 candidates x 10 folds = 2400 fits: spread them over all cores.
            n_jobs=-1,
            return_train_score=True,
            scoring="neg_mean_squared_error")

In [7]:
# Run the search on the training split (fit() returns the search object) and
# keep the estimator that was refit with the best parameter combination.
mejor_modelo = gs.fit(x_train, y_train).best_estimator_

# Show the selected estimator as the cell output.
mejor_modelo

KeyboardInterrupt: 

In [None]:
# Predictions of the tuned tree on the test and train partitions, in that order.
y_pred_test_dt2, y_pred_train_dt2 = (
    mejor_modelo.predict(particion) for particion in (x_test, x_train)
)

In [None]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics for one model.

    Parameters
    ----------
    y_test, y_train
        True target values of the test and train partitions.
    y_test_pred, y_train_pred
        Model predictions for the test and train partitions.
    tipo_modelo
        Label stored in the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``;
        the first row holds the test metrics and the second the train metrics.
    """
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    filas = []
    for nombre, reales, predichos in particiones:
        # The MSE is computed once and reused for the RMSE.
        mse = mean_squared_error(reales, predichos)
        filas.append({
            'MAE': mean_absolute_error(reales, predichos),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(reales, predichos),
            "set": nombre,
        })

    df = pd.DataFrame(filas)
    df["modelo"] = tipo_modelo
    return df

In [None]:
# Metrics of the tuned decision tree on both partitions; keyword arguments
# make the test/train pairing explicit.
dt_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt2,
    y_train_pred=y_pred_train_dt2,
    tipo_modelo="Decision tree",
)
dt_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,410.888148,355329.62737,596.095317,0.094906,test,Decision tree
1,363.158136,255451.412441,505.422014,0.474757,train,Decision tree


> El R² obtenido con el modelo Decision Tree (0,09 en test y 0,47 en train) queda muy por debajo de 0,7, por lo que pasamos a entrenar un modelo Random Forest para intentar mejorarlo.

  ### RANDOM FOREST

In [None]:
# Baseline random forest with default hyperparameters; fit() returns the
# estimator itself, so construction and training are chained.
forest = RandomForestRegressor(random_state=0).fit(x_train, y_train)

# Baseline predictions on both partitions.
y_pred_train_rf = forest.predict(x_train)
y_pred_test_rf = forest.predict(x_test)

In [None]:
# Metrics of the baseline random forest on both partitions; keyword arguments
# make the test/train pairing explicit.
rf_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf,
    y_train_pred=y_pred_train_rf,
    tipo_modelo="Random Forest",
)
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,373.025753,270097.957711,519.709494,0.312008,test,Random Forest
1,135.910464,37427.625672,193.462207,0.923044,train,Random Forest


> Con el modelo Random Forest obtenemos mejores métricas (R² de 0,31 en test); no obstante, siguen siendo bastante malas y la diferencia con train (R² de 0,92) indica sobreajuste, por lo que procedemos a ajustar los hiperparámetros.

In [None]:
# Exhaustive search over `param` for a random forest, using 10-fold
# cross-validation and negative MSE as the selection score.
# Note: this rebinds `gs`, replacing the decision-tree search object.
gs = GridSearchCV(
            # Seed the estimator: bootstrap sampling and per-split feature
            # selection are random, so an unseeded forest makes the search
            # (and the chosen model) non-reproducible.
            estimator=RandomForestRegressor(random_state=0),
            param_grid=param,
            cv=10,
            # verbose must be a non-negative integer; -1 is rejected by the
            # parameter validation of recent scikit-learn releases. 0 = silent.
            verbose=0,
            # 240 candidates x 10 folds = 2400 forest fits: spread them over all cores.
            n_jobs=-1,
            return_train_score=True,
            scoring="neg_mean_squared_error")

In [None]:
# Run the random forest grid search on the training split.
gs.fit(x_train, y_train)

In [None]:
# Keep the estimator selected by the search and show it as the cell output.
mejor_modelo_rf = gs.best_estimator_
mejor_modelo_rf