# Random Forest

In [1]:
# Tratamiento de datos
import numpy as np
import pandas as pd
import sidetable as stb

# Gráficos
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Barra de progreso de un proceso
from tqdm import tqdm

# Configuración warnings
import warnings
warnings.filterwarnings('once')

import sys
sys.path.append("../../")
from src import funciones as fun
from src import variables as var

En el pair programming de hoy debéis usar el csv que guardastéis cuando hicistéis el pairprgramming de codificicación (este csv debería tener las variables estadandarizas).

Objetivo:
- Ajustar el modelo a un Random Forest



In [2]:
df = pd.read_pickle('../archivos/country_dummie.pkl')
df.head()

Unnamed: 0,basic_boxcox,basic,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,...,country_United Kingdom,country_United States,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Yemen,country_Zambia,country_Zimbabwe
0,16.684542,182.13,-0.219373,0.601852,2.395833,1.046243,1.304348,0.747273,4.294342,3.245552,...,0,0,0,0,0,0,0,0,0,0
1,10.443987,66.0,-0.350427,0.625,3.520833,-0.289017,-0.014493,-0.292727,0.338771,0.241993,...,0,0,0,0,0,0,0,0,0,0
2,9.945543,59.65,-0.552707,0.421296,1.791667,-0.398844,-0.384058,-0.490909,0.14856,0.014235,...,0,0,0,0,0,0,0,0,0,0
3,8.515986,43.57,-0.923077,-0.069444,-0.625,-0.514451,-0.934783,-0.505455,-0.656716,-1.081851,...,0,0,0,0,0,0,0,0,0,0
4,9.8165,58.07,-0.746439,-0.398148,-0.666667,-0.50289,-0.884058,-0.483636,-0.690038,-0.967972,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Separamos el dataframe entre variables predictoras (X) y variable dependiente (y)
X = df.drop(["basic", "basic_boxcox"], axis = 1)
y = df["basic"]

In [4]:
# dividimos ambos datos en 'train' y 'test' en una proporción 80-20:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [5]:
# primero generamos un modelo sin especificar hiperparámetros para después ir ajustando
bosque = RandomForestRegressor(random_state =0)
bosque

In [6]:
# lo entrenamos
bosque.fit(x_train, y_train)

In [7]:
y_test_rf = bosque.predict(x_test)
y_train_rf = bosque.predict(x_train)

- Extraer las métricas

In [8]:
rf1_results = fun.metricas(y_test, y_train, y_test_rf, y_train_rf, "Random Forest 1")
rf1_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,35.154235,2739.692802,52.342075,0.590405,test,Random Forest 1
1,13.010523,425.279137,20.622297,0.942228,train,Random Forest 1


*Comprobamos que nuestro modelo tiene un problema de **underfitting**, dado que todos los valores de las métricas son más altas en el set de test que en el de train, excepto en **R2**, que es significativamente más baja, lo cual indica que no está realizando unas predicciones ajustadas a los datos reales.*

- Probamos a mejorar nuestro modelo cambiado los hiperparámetros

In [9]:
max_features = np.sqrt(len(x_train.columns))
max_features

15.165750888103101

In [10]:
param1 = {"max_depth": [10, 12, 14], 
        "max_features": [1,2,3,4],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [11]:
gs_rf = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param1, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [12]:
gs_rf.fit(x_train, y_train)

In [13]:
bosque2 = gs_rf.best_estimator_
bosque2

In [14]:
y_test2_rf = bosque2.predict(x_test)
y_train2_rf = bosque2.predict(x_train)

In [15]:
rf2_results = fun.metricas(y_test, y_train, y_test2_rf, y_train2_rf, "Random Forest 2")
rf2_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,52.057315,4681.80708,68.423732,0.30005,test,Random Forest 2
1,53.148239,5272.539354,72.612253,0.283748,train,Random Forest 2


*Han salido una smetricas peores, lo que nos lleva a pensar que hay que aumentar los parámetros*

In [16]:
param2 = {"max_depth": [6,8,10,12], 
        "max_features": [3,4,5,6],
        "min_samples_split": [50, 100, 150],
        "min_samples_leaf": [5,10,20]} 

In [17]:
gs_rf2 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param2, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [18]:
gs_rf2.fit(x_train, y_train)

In [19]:
bosque3 = gs_rf2.best_estimator_
bosque3

In [20]:
y_test3_rf = bosque3.predict(x_test)
y_train3_rf = bosque3.predict(x_train)

In [21]:
rf3_results = fun.metricas(y_test, y_train, y_test3_rf, y_train3_rf, "Random Forest 3")
rf3_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,44.596653,3712.493618,60.930236,0.444967,test,Random Forest 3
1,44.734821,4040.150811,63.562181,0.451163,train,Random Forest 3


In [22]:
param3 = {"max_depth": [10, 12, 14], 
        "max_features": [12,13,14,15],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [23]:
gs_rf3 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param3, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [24]:
gs_rf3.fit(x_train, y_train)

In [25]:
bosque4 = gs_rf3.best_estimator_
bosque4

In [26]:
y_test4_rf = bosque4.predict(x_test)
y_train4_rf = bosque4.predict(x_train)

In [27]:
rf4_results = fun.metricas(y_test, y_train, y_test4_rf, y_train4_rf, "Random Forest 4")
rf4_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,40.524382,3272.328519,57.20427,0.510773,test,Random Forest 4
1,38.779794,3296.420203,57.41446,0.552195,train,Random Forest 4


In [28]:
param4 = {"max_depth": [10, 12, 14], 
        "max_features": [15,16,17,18],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [29]:
gs_rf4 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param4, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [30]:
gs_rf4.fit(x_train, y_train)

In [31]:
bosque5 = gs_rf4.best_estimator_
bosque5

In [32]:
y_test5_rf = bosque5.predict(x_test)
y_train5_rf = bosque5.predict(x_train)

In [33]:
rf5_results = fun.metricas(y_test, y_train, y_test5_rf, y_train5_rf, "Random Forest 5")
rf5_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,39.613336,3165.753788,56.265032,0.526707,test,Random Forest 5
1,37.079189,3068.723447,55.39606,0.583127,train,Random Forest 5


In [34]:
param5 = {"max_depth": [14, 16, 18], 
        "max_features": [18,19,20,21],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [35]:
gs_rf5 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param5, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [36]:
gs_rf5.fit(x_train, y_train)

In [37]:
bosque6 = gs_rf5.best_estimator_
bosque6

In [38]:
y_test6_rf = bosque6.predict(x_test)
y_train6_rf = bosque6.predict(x_train)

In [39]:
rf6_results = fun.metricas(y_test, y_train, y_test6_rf, y_train6_rf, "Random Forest 6")
rf6_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,38.989532,3097.08738,55.651481,0.536973,test,Random Forest 6
1,35.020849,2847.012615,53.357404,0.613245,train,Random Forest 6


In [40]:
param6 = {"max_depth": [16, 18, 20], 
        "max_features": [24, 26, 28, 30],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [41]:
gs_rf6 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param6, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [42]:
gs_rf6.fit(x_train, y_train)

In [43]:
bosque7 = gs_rf6.best_estimator_
bosque7

In [44]:
y_test7_rf = bosque7.predict(x_test)
y_train7_rf = bosque7.predict(x_train)

In [45]:
rf7_results = fun.metricas(y_test, y_train, y_test7_rf, y_train7_rf, "Random Forest 7")
rf7_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,37.895435,2997.269563,54.747325,0.551896,test,Random Forest 7
1,32.35905,2542.833054,50.426511,0.654567,train,Random Forest 7


In [46]:
param7 = {"max_depth": [14, 16, 18], 
        "max_features": [30, 34, 38, 40],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [47]:
gs_rf7 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param7, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [48]:
gs_rf7.fit(x_train, y_train)

In [49]:
bosque8 = gs_rf7.best_estimator_
bosque8

In [50]:
y_test8_rf = bosque8.predict(x_test)
y_train8_rf = bosque8.predict(x_train)

In [51]:
rf8_results = fun.metricas(y_test, y_train, y_test8_rf, y_train8_rf, "Random Forest 8")
rf8_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,37.826357,2996.179839,54.737371,0.552059,test,Random Forest 8
1,31.464302,2432.334828,49.318707,0.669578,train,Random Forest 8


*Tras varias pruebas con los hiperparámetros nos percatamos de que por el momento el modelo se vuelve más eficinente al aumentar el numero máximo de 'features', mientras que la profundidad máxima se mantiene en torno a los 14-16*

In [52]:
param8 = {"max_depth": [12, 14, 16], 
        "max_features": [40, 50, 60, 70],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [53]:
gs_rf8 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param8, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [54]:
gs_rf8.fit(x_train, y_train)

In [55]:
bosque9 = gs_rf8.best_estimator_
bosque9

In [56]:
y_test9_rf = bosque9.predict(x_test)
y_train9_rf = bosque9.predict(x_train)

In [57]:
rf9_results = fun.metricas(y_test, y_train, y_test9_rf, y_train9_rf, "Random Forest 9")
rf9_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,37.000899,2897.52565,53.828669,0.566808,test,Random Forest 9
1,29.543556,2232.694874,47.251401,0.696698,train,Random Forest 9


In [58]:
rf_metrics = pd.concat([rf1_results, rf2_results, rf3_results, rf4_results, rf5_results, rf6_results, rf7_results, rf8_results, rf9_results], axis=0)

In [59]:
rf_metrics

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,35.154235,2739.692802,52.342075,0.590405,test,Random Forest 1
1,13.010523,425.279137,20.622297,0.942228,train,Random Forest 1
0,52.057315,4681.80708,68.423732,0.30005,test,Random Forest 2
1,53.148239,5272.539354,72.612253,0.283748,train,Random Forest 2
0,44.596653,3712.493618,60.930236,0.444967,test,Random Forest 3
1,44.734821,4040.150811,63.562181,0.451163,train,Random Forest 3
0,40.524382,3272.328519,57.20427,0.510773,test,Random Forest 4
1,38.779794,3296.420203,57.41446,0.552195,train,Random Forest 4
0,39.613336,3165.753788,56.265032,0.526707,test,Random Forest 5
1,37.079189,3068.723447,55.39606,0.583127,train,Random Forest 5


In [62]:
# calculamos importancia de cada una de las variables incluidas en los dos modelos que mejor han funcionado: bosque y bosque9
importancia_predictores1 = pd.DataFrame(
                            {'predictor': x_train.columns,
                             'importancia': bosque.feature_importances_,
                             'modelo':'modelo 1'}
                            )


# ordenamos de mayor a menor los resultados
importancia_predictores1.sort_values(by=["importancia"], ascending=False, inplace = True)

# printeamos los resultados
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores1

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia,modelo
12,public_transport_ticket,0.187680,modelo 1
11,cigarettes_marlboro,0.145572,modelo 1
14,gasoline,0.077433,modelo 1
17,cinema,0.056991,modelo 1
15,internet,0.041233,modelo 1
...,...,...,...
134,country_Malawi,0.000000,modelo 1
78,country_Eritrea,0.000000,modelo 1
49,country_British Virgin Islands,0.000000,modelo 1
99,country_Guinea-Bissau,0.000000,modelo 1


In [63]:
importancia_predictores9 = pd.DataFrame(
                            {'predictor': x_train.columns,
                             'importancia': bosque.feature_importances_,
                             'modelo':'modelo 9'}
                            )


# ordenamos de mayor a menor los resultados
importancia_predictores9.sort_values(by=["importancia"], ascending=False, inplace = True)

# printeamos los resultados
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores9

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia,modelo
12,public_transport_ticket,0.187680,modelo 9
11,cigarettes_marlboro,0.145572,modelo 9
14,gasoline,0.077433,modelo 9
17,cinema,0.056991,modelo 9
15,internet,0.041233,modelo 9
...,...,...,...
134,country_Malawi,0.000000,modelo 9
78,country_Eritrea,0.000000,modelo 9
49,country_British Virgin Islands,0.000000,modelo 9
99,country_Guinea-Bissau,0.000000,modelo 9


In [64]:
comp_import = pd.concat([importancia_predictores1, importancia_predictores9], axis = 0)

In [65]:
comp_import

Unnamed: 0,predictor,importancia,modelo
12,public_transport_ticket,0.187680,modelo 1
11,cigarettes_marlboro,0.145572,modelo 1
14,gasoline,0.077433,modelo 1
17,cinema,0.056991,modelo 1
15,internet,0.041233,modelo 1
...,...,...,...
134,country_Malawi,0.000000,modelo 9
78,country_Eritrea,0.000000,modelo 9
49,country_British Virgin Islands,0.000000,modelo 9
99,country_Guinea-Bissau,0.000000,modelo 9


*Comprobamos que la variable de pais no está aportando información al modelo predictivo, si bien ninguna de nuestras variables predictoras tiene mucho peso lo que podría ser parte de la explicación de que ninguno de nuestros modelos alcance un elevado nivel de eficiencia*

- Debatid entre vosotras que modelo es mejor y por qué (basándose en las métricas)

*Para tomar esta decisión primero creamos un dataframe con las métricas de los distintos modelos generados*

In [76]:
reg_lin = pd.read_csv('../archivos/metricas_reg_lineal.csv', index_col=0)
reg_lin.drop('type', axis=1, inplace=True)

In [77]:
dec_tree = pd.read_csv('../archivos/metricas_decision_tree.csv', index_col=0)
dec_tree

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,50.91228,6436.314,80.22664,0.034782,test,Decission Tree Label
1,6.476005000000001e-17,1.1261509999999999e-30,1.061203e-15,1.0,train,Decission Tree Label
0,49.47905,5654.155,75.19411,0.15468,test,Decission Tree Dummie 1
1,1.988088e-17,2.825243e-31,5.315302e-16,1.0,train,Decission Tree Dummie 1
0,58.03948,5461.671,73.90312,0.183458,test,Decission Tree 2 Dummie
1,59.50146,6052.654,77.7988,0.177773,train,Decission Tree 2 Dummie
0,47.99936,6235.455,78.96489,0.064904,test,Decission Tree Orden 1
1,2.7267390000000002e-17,3.874929e-31,6.224893e-16,1.0,train,Decission Tree Orden 1


In [73]:
total_metrics = pd.concat([reg_lin,dec_tree, rf1_results, rf9_results], axis=0)

In [78]:
total_metrics

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,2.20727,7.893886,2.809606,0.540264,test,Linear Regresion
1,2.284685,8.472299,2.910721,0.527266,train,Linear Regression
0,1.624632,4.736822,2.176424,0.72413,test,Linear Regresion
1,1.489372,4.064977,2.016179,0.773184,train,Linear Regression
0,50.91228,6436.314,80.22664,0.034782,test,Decission Tree Label
1,6.476005000000001e-17,1.1261509999999999e-30,1.061203e-15,1.0,train,Decission Tree Label
0,49.47905,5654.155,75.19411,0.15468,test,Decission Tree Dummie 1
1,1.988088e-17,2.825243e-31,5.315302e-16,1.0,train,Decission Tree Dummie 1
0,58.03948,5461.671,73.90312,0.183458,test,Decission Tree 2 Dummie
1,59.50146,6052.654,77.7988,0.177773,train,Decission Tree 2 Dummie


*Podemos concluir que las que los modelos con mayor capacidad predictiva son los realizados con el método Random Forest. No siendo tan clara la diferencia entre ellos, si bien observamos un amenor diferencia entre los valores R2 en el modelo Random Forest 9.*

