In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

  ### DECISION TREE

In [2]:
# Load the casual-rider dataset (presumably the standardised version, judging by the file name).
# NOTE(review): unpickling can execute arbitrary code — only load pickles produced by this project.
df_casual = pd.read_pickle('../datos/bikes_casual_estand2.pkl')
# Preview the first rows.
df_casual.head()

Unnamed: 0,temperatura,velocidad_viento,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,cliente_casual
1,-1.636215,0.744965,1,1,1.0,1,2.0,2,3,131.0
2,-1.616347,-0.390073,1,1,1.5,3,1.0,1,3,120.0
3,-1.469045,-0.046981,1,1,3.0,3,1.0,1,4,108.0
4,-1.592588,-1.302236,2,1,3.0,3,2.0,2,4,82.0
5,-1.635352,-0.281358,2,1,5.0,3,1.0,1,4,88.0


In [3]:
# Separate the target from the predictors and hold out 20 % of the rows for testing.

y = df_casual["cliente_casual"]
X = df_casual.drop(columns="cliente_casual")

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:


# Baseline regression tree: default hyperparameters, fixed seed.
arbol = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train_dt = arbol.predict(x_train)
y_pred_test_dt = arbol.predict(x_test)

In [4]:
# Reference values for the hyperparameter grid: square root of the number of
# predictors, and the depth reached by the fitted tree `arbol`.

max_features = np.sqrt(x_train.shape[1])
print(max_features)

print(arbol.tree_.max_depth)

3.0
24


In [4]:
# Hyperparameter grid: 5 x 5 x 4 x 4 = 400 candidate combinations.
param_rf = {
    "max_depth": list(range(6, 11)),
    "max_features": list(range(2, 7)),
    "min_samples_split": [10, 20, 30, 50],
    "min_samples_leaf": [10, 15, 20, 30],
}

In [5]:
# Alternative, coarser hyperparameter grid.
# NOTE(review): the search built right after this cell is given `param_rf`, not `param` — confirm which was intended.
param = dict(
    max_depth=[4, 8, 6, 10, 12],
    max_features=[1, 2, 3],
    min_samples_split=[10, 30, 50, 100],
    min_samples_leaf=[10, 30, 50, 100],
)

In [7]:
# 10-fold cross-validated grid search over `param_rf` for a single regression
# tree, scored with negative MSE (values closer to zero are better).
gs = GridSearchCV(
    # Fixed seed: the grid sets max_features below the number of predictors, so
    # the features tried at each split are sampled at random; without a seed
    # the selected model changes from run to run.
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=param_rf,
    cv=10,
    # `verbose` must be non-negative: recent scikit-learn releases validate it
    # and raise InvalidParameterError on fit() for -1.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [None]:
# Run the cross-validated grid search on the training partition only.
gs.fit(x_train, y_train)

In [None]:
# Keep the estimator with the best cross-validated score and display its hyperparameters.
mejor_modelo = gs.best_estimator_
mejor_modelo

In [None]:
# Predictions of the tuned tree on both partitions.
y_pred_train_dt2 = mejor_modelo.predict(x_train)
y_pred_test_dt2 = mejor_modelo.predict(x_test)

In [3]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Summarise regression metrics for the test and train partitions.

    Parameters
    ----------
    y_test, y_train : array-like
        True target values of the test and train partitions.
    y_test_pred, y_train_pred : array-like
        Model predictions for the corresponding partitions.
    tipo_modelo : str
        Label stored in the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Two rows (test first, then train) with the columns
        ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``.
    """
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    filas = []
    for nombre, reales, predichos in particiones:
        # RMSE is derived from the MSE, so the MSE is computed once per partition.
        mse = mean_squared_error(reales, predichos)
        filas.append(
            {
                "MAE": mean_absolute_error(reales, predichos),
                "MSE": mse,
                "RMSE": np.sqrt(mse),
                "R2": r2_score(reales, predichos),
                "set": nombre,
            }
        )

    df = pd.DataFrame(filas)
    df["modelo"] = tipo_modelo
    return df

In [None]:
# Metrics of the tuned decision tree on both partitions.
dt_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt2,
    y_train_pred=y_pred_train_dt2,
    tipo_modelo="Decision tree",
)
dt_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,315.809589,215244.709759,463.944727,0.559571,test,Decision tree
1,303.488477,188206.729484,433.827995,0.595844,train,Decision tree


> El R2 obtenido con el modelo de Decision Tree (≈ 0,56 en test y ≈ 0,60 en train) es muy inferior a 0,7, por lo que pasamos a realizar el modelo Random Forest para intentar mejorarlo.

  ### RANDOM FOREST

In [7]:
# Baseline random forest: default hyperparameters, fixed seed.
forest = RandomForestRegressor(random_state=42).fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train_rf = forest.predict(x_train)
y_pred_test_rf = forest.predict(x_test)

In [8]:
# Metrics of the baseline random forest on both partitions.
rf_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf,
    y_train_pred=y_pred_train_rf,
    tipo_modelo="Random Forest",
)
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,310.641986,207963.976276,456.030675,0.574469,test,Random Forest
1,129.355386,33350.290215,182.620618,0.928384,train,Random Forest


> Con el modelo Random Forest obtenemos métricas ligeramente mejores en test (R2 ≈ 0,57), pero con un claro sobreajuste (R2 ≈ 0,93 en train); siguen siendo bastante malas, por lo que procedemos a revisar los hiperparámetros.

- he probado con estos hiperparámetros:
param2 = {"max_depth": [4, 8, 6, 10],  
        "max_features": [1,2,3],      
        "min_samples_split": [10, 15, 20, 30],
        "min_samples_leaf": [10, 15, 20, 30]} 
Los voy a modificar para probar otra vez.
Las métricas eran:
R2 TEST > 0.43
R2 TRAIN > 0.57

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably a deliberate stop so that "Run All" does not reach the grid search below — confirm and remove.
break

In [19]:
# Random-forest grid: 5 x 5 x 4 x 4 = 400 combinations, all with the same seed.
param = dict(
    max_depth=[6, 7, 8, 9, 10],
    max_features=[2, 3, 4, 5, 6],
    min_samples_split=[10, 20, 30, 50],
    min_samples_leaf=[10, 15, 20, 30],
    random_state=[42],
)

In [20]:
# 10-fold cross-validated grid search over `param` for a random forest,
# scored with negative MSE (values closer to zero are better).
# The seed of the forest is supplied through the grid itself (`random_state`).
gs = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param,
    cv=10,
    # `verbose` must be non-negative: recent scikit-learn releases validate it
    # and raise InvalidParameterError on fit() for -1.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

- con los hiperparámetros de Cassia

In [21]:
# Run the cross-validated grid search on the training partition only.
gs.fit(x_train, y_train)

In [22]:
# Keep the forest with the best cross-validated score and display its hyperparameters.
mejor_modelo_rf = gs.best_estimator_
mejor_modelo_rf

In [23]:
# Predictions of the tuned forest on both partitions.
y_pred_train_rf2 = mejor_modelo_rf.predict(x_train)
y_pred_test_rf2 = mejor_modelo_rf.predict(x_test)

In [24]:
# Metrics of the tuned random forest on both partitions.
rf_results2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf2,
    y_train_pred=y_pred_train_rf2,
    tipo_modelo="Random Forest 2",
)
rf_results2

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,298.856024,187091.382892,432.540614,0.617178,test,Random Forest 2
1,297.939492,174350.125907,417.552543,0.6256,train,Random Forest 2


In [23]:
# Quick reminder of the available columns.
df_casual.head(1)

Unnamed: 0,temperatura,velocidad_viento,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,cliente_casual
1,-1.636215,0.744965,1,1,1.0,1,2.0,2,3,131.0


In [85]:
# Load an alternative ("prueba") version of the dataset for a separate experiment.
# NOTE(review): unpickling can execute arbitrary code — only load pickles produced by this project.
df_casual2 = pd.read_pickle('../datos/bikes_casual_estand_prueba.pkl')

In [86]:
# Predictors exclude the target and `año`; 80/20 split with a fixed seed.
y = df_casual2["cliente_casual"]
X = df_casual2.drop(columns=["cliente_casual", "año"])

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

(max_depth=8, max_features=5, min_samples_leaf=10,
                      min_samples_split=20)

In [87]:
# Hand-tuned forest. A fixed random_state makes the bootstrap samples and the
# per-split feature subsets, and therefore the reported metrics, reproducible.
forest8 = RandomForestRegressor(
    max_depth=9,
    max_features=7,
    min_samples_leaf=9,
    min_samples_split=20,
    random_state=42,
)
forest8.fit(x_train, y_train)

In [88]:
# Predictions of `forest8` on both partitions.
y_pred_train_rf8 = forest8.predict(x_train)
y_pred_test_rf8 = forest8.predict(x_test)

In [89]:
# Metrics of `forest8` on both partitions.
rf_results8 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf8,
    y_train_pred=y_pred_train_rf8,
    tipo_modelo="Random Forest 8",
)
rf_results8

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,335.526123,224407.528407,473.716717,0.540822,test,Random Forest 8
1,321.766074,202527.466206,450.030517,0.565092,train,Random Forest 8


In [24]:
# Remove `velocidad_viento` from the working frame. Reassigning with
# errors="ignore" keeps the cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
df_casual = df_casual.drop(columns=['velocidad_viento'], errors='ignore')

In [29]:
# Remove `estacion` from the working frame. Reassigning with errors="ignore"
# keeps the cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df_casual = df_casual.drop(columns=['estacion'], errors='ignore')

In [30]:
# Rebuild predictors/target from the reduced frame and redo the 80/20 split.
y = df_casual["cliente_casual"]
X = df_casual.drop(columns="cliente_casual")

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Forest with hand-picked hyperparameters. A fixed random_state makes the
# bootstrap samples and the per-split feature subsets, and therefore the
# reported metrics, reproducible.
forest7 = RandomForestRegressor(
    max_depth=8,
    max_features=5,
    min_samples_leaf=10,
    min_samples_split=20,
    random_state=42,
)
forest7.fit(x_train, y_train)


In [35]:
# Predictions of `forest7` on both partitions.
y_pred_train_rf7 = forest7.predict(x_train)
y_pred_test_rf7 = forest7.predict(x_test)

In [36]:
# Metrics of `forest7` on both partitions.
rf_results7 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf7,
    y_train_pred=y_pred_train_rf7,
    tipo_modelo="Random Forest 7",
)
rf_results7

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,333.917256,206718.992071,454.663603,0.473446,test,Random Forest 7
1,314.31449,195504.842981,442.159296,0.598016,train,Random Forest 7


> ### ¡Métricas malísimas de nuevo! ¡Revisar!


SACAMOS NUEVAS MÉTRICAS (DECISION TREE Y RANDOM FOREST) PERO ELIMINANDO DOS VARIABLES PREDICTORAS: `velocidad_viento` Y `estacion`

In [9]:
# Preview the frame used for the next experiment.
# NOTE(review): the recorded output has values that differ from the earlier previews and still shows
# columns dropped in earlier cells, so the frame was presumably reloaded in a cell that no longer exists — confirm.
df_casual.head(2)

Unnamed: 0,temperatura,velocidad_viento,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,cliente_casual
1,-0.42569,0.68755,1,1,1.0,1,2.0,2,3,331
2,-0.94997,0.68521,1,1,1.5,3,1.0,1,3,131


In [4]:
# Predictors for the reduced model: everything except the target and the two
# discarded features. errors="ignore" keeps the cell working when
# `velocidad_viento` / `estacion` were already removed from `df_casual` by the
# earlier cells that drop them (otherwise a top-to-bottom run raises KeyError).
X1 = df_casual.drop(
    columns=["cliente_casual", "velocidad_viento", "estacion"],
    errors="ignore",
)
y1 = df_casual["cliente_casual"]

DECISION TREE

In [6]:
# 80/20 split of the reduced feature set (same seed as the previous splits).
x_train, x_test, y_train, y_test = train_test_split(X1, y1, test_size = 0.2, random_state = 42)

In [11]:


# Baseline regression tree on the reduced feature set: default hyperparameters, fixed seed.
arbol = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train_dt = arbol.predict(x_train)
y_pred_test_dt = arbol.predict(x_test)

In [12]:
# Metrics of the baseline tree on the reduced feature set.
dt_results01 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt,
    y_train_pred=y_pred_train_dt,
    tipo_modelo="Decision tree",
)
dt_results01

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,440.239726,414148.565068,643.543755,0.152579,test,Decision tree
1,3.818182,1083.283019,32.913265,0.997674,train,Decision tree


In [37]:
# Decision-tree grid: 5 x 5 x 4 x 4 = 400 candidate combinations.
param_dt = {
    "max_depth": list(range(6, 11)),
    "max_features": list(range(2, 7)),
    "min_samples_split": [10, 20, 30, 50],
    "min_samples_leaf": [10, 15, 20, 30],
}

In [38]:
# 10-fold cross-validated grid search over `param_dt` for a single regression
# tree, scored with negative MSE (values closer to zero are better).
gs = GridSearchCV(
    # Fixed seed: the grid sets max_features below the number of predictors, so
    # the features tried at each split are sampled at random; without a seed
    # the selected model changes from run to run.
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=param_dt,
    cv=10,
    # `verbose` must be non-negative: recent scikit-learn releases validate it
    # and raise InvalidParameterError on fit() for -1.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [39]:
# Run the cross-validated grid search on the training partition only.
gs.fit(x_train, y_train)

In [40]:
# Keep the tree with the best cross-validated score and display its hyperparameters.
mejor_modelo = gs.best_estimator_
mejor_modelo

In [41]:
# Predictions of the tuned tree on both partitions.
y_pred_train_dt0 = mejor_modelo.predict(x_train)
y_pred_test_dt0 = mejor_modelo.predict(x_test)

In [45]:
# Metrics of the tuned tree on the reduced feature set.
dt_results0 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt0,
    y_train_pred=y_pred_train_dt0,
    tipo_modelo="Decision tree1",
)
dt_results0

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,296.13897,181935.939416,426.539493,0.627727,test,Decision tree1
1,290.635401,176425.460748,420.030309,0.621144,train,Decision tree1


In [54]:
# Reference values for the grid: square root of the number of predictors,
# and the depth reached by the fitted tree `arbol`.
max_features = np.sqrt(x_train.shape[1])
print(max_features)

print(arbol.tree_.max_depth)

2.6457513110645907
23


RANDOM FOREST

In [7]:
# Baseline random forest on the reduced feature set: default hyperparameters, fixed seed.
forest = RandomForestRegressor(random_state=65).fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train_rf = forest.predict(x_train)
y_pred_test_rf = forest.predict(x_test)

In [8]:
# Metrics of the baseline forest on the reduced feature set.
rf_results = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf,
    y_train_pred=y_pred_train_rf,
    tipo_modelo="Random Forest",
)
rf_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,348.462994,239029.39531,488.906326,0.391145,test,Random Forest
1,133.20859,38212.103147,195.479163,0.921431,train,Random Forest


- HE PROBADO A QUITAR UN PAR DE VARIABLES PREDICTORAS Y A PROBAR CON LOS HIPERPARÁMETROS DEL DECISION TREE PERO LAS MÉTRICAS EN EL RANDOM 2 SON PEORES: 0.62 - 0.59
- QUIZÁ MODIFICANDO LOS HIPERPARÁMETROS DE NUEVO PODAMOS MEJORARLAS O QUITANDO ALGUNA VARIABLE PREDICTORA MÁS PARA QUE EL PRIMER RANDOM MEJORE UN POCO. NO SÉ.

In [11]:
# Random-forest grid for the reduced feature set: 5 x 4 x 4 x 4 = 320 combinations, all with the same seed.
param = dict(
    max_depth=[6, 7, 8, 9, 10],
    max_features=[2, 3, 4, 5],
    min_samples_split=[10, 20, 30, 50],
    min_samples_leaf=[10, 12, 16, 20],
    random_state=[65],
)

In [12]:
# 10-fold cross-validated grid search over `param` for a random forest,
# scored with negative MSE (values closer to zero are better).
# The seed of the forest is supplied through the grid itself (`random_state`).
gs1 = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param,
    cv=10,
    # `verbose` must be non-negative: recent scikit-learn releases validate it
    # and raise InvalidParameterError on fit() for -1.
    verbose=0,
    return_train_score=True,
    scoring="neg_mean_squared_error",
)

In [13]:
# Run the cross-validated grid search on the training partition only.
gs1.fit(x_train, y_train)

In [14]:
# Keep the forest with the best cross-validated score and display its hyperparameters.
mejor_modelo_rf1 = gs1.best_estimator_
mejor_modelo_rf1

In [15]:
# Predictions of the tuned forest on both partitions.
# NOTE(review): these names overwrite the predictions of the earlier tuned forest (`mejor_modelo_rf`).
y_pred_train_rf2 = mejor_modelo_rf1.predict(x_train)
y_pred_test_rf2 = mejor_modelo_rf1.predict(x_test)

In [16]:
# Metrics of the tuned forest on the reduced feature set.
rf_results3 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf2,
    y_train_pred=y_pred_train_rf2,
    tipo_modelo="Random Forest 1",
)
rf_results3

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,335.740774,208492.346172,456.609621,0.468929,test,Random Forest 1
1,318.174196,199512.275685,446.66797,0.589776,train,Random Forest 1


PROBAMOS ELIMINANDO OTRA VARIABLE PREDICTORA

In [13]:
# Reload the dataset from disk so the experiment starts from the full set of columns.
# NOTE(review): unpickling can execute arbitrary code — only load pickles produced by this project.
df_casual = pd.read_pickle('../datos/bikes_casual_estand2.pkl')

In [8]:
# Preview the reloaded frame.
df_casual.head(2)

Unnamed: 0,temperatura,velocidad_viento,estacion,año,mes,festivo,dia_semana,dia_laboral,tiempo,cliente_casual
1,-1.636215,0.744965,1,1,1.0,1,2.0,2,3,131.0
2,-1.616347,-0.390073,1,1,1.5,3,1.0,1,3,120.0


In [14]:
# Target and predictors, excluding `velocidad_viento` and `estacion`.
y2 = df_casual["cliente_casual"]
X2 = df_casual.drop(columns=["cliente_casual", "velocidad_viento", "estacion"])

In [15]:
# 80/20 split of the reduced feature set (same seed as the previous splits).
x_train, x_test, y_train, y_test = train_test_split(X2, y2, test_size = 0.2, random_state = 42)

In [16]:


# Baseline regression tree on the reduced feature set: default hyperparameters, fixed seed.
arbol = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train_dt = arbol.predict(x_train)
y_pred_test_dt = arbol.predict(x_test)

In [17]:
# Metrics of the baseline tree on both partitions.
dt_results4 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_dt,
    y_train_pred=y_pred_train_dt,
    tipo_modelo="Decision tree4",
)
dt_results4

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,431.821918,405078.599315,636.457854,0.171137,test,Decision tree4
1,7.178388,2621.798456,51.2035,0.99437,train,Decision tree4


In [28]:
# Forest with hand-picked hyperparameters and a fixed seed.
forest = RandomForestRegressor(
    max_depth=10,
    max_features=5,
    min_samples_split=40,
    min_samples_leaf=10,
    random_state=65,
).fit(x_train, y_train)

# Predict on both partitions so train and test metrics can be compared.
y_pred_train_rf4 = forest.predict(x_train)
y_pred_test_rf4 = forest.predict(x_test)

In [27]:
# Metrics of the hand-tuned forest on both partitions.
rf_results4r = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_test_rf4,
    y_train_pred=y_pred_train_rf4,
    tipo_modelo="Random Forest4",
)
rf_results4r

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,345.902739,218987.926516,467.961458,0.442195,test,Random Forest4
1,342.354053,225459.069724,474.825304,0.536426,train,Random Forest4
