# Random Forest

In [1]:
# Tratamiento de datos
import numpy as np
import pandas as pd
import sidetable as stb

# Gráficos
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Barra de progreso de un proceso
from tqdm import tqdm

# Configuración warnings
import warnings
warnings.filterwarnings('once')

En el pair programming de hoy debéis usar el csv que guardastéis cuando hicistéis el pairprgramming de codificicación (este csv debería tener las variables estadandarizas).

Objetivo:
- Ajustar el modelo a un Random Forest



In [2]:
df = pd.read_csv('../archivos/country_dummie.csv', index_col=0)
df.head()

Unnamed: 0,basic,basic_boxcox,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,...,country_United Kingdom,country_United States,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Vietnam,country_Yemen,country_Zambia,country_Zimbabwe
0,182.13,16.271842,-0.210227,0.601852,2.395833,1.052023,1.311594,0.749091,4.259972,3.166667,...,0,0,0,0,0,0,0,0,0,0
1,66.0,10.244243,-0.340909,0.625,3.520833,-0.283237,-0.007246,-0.290909,0.341128,0.236111,...,0,0,0,0,0,0,0,0,0,0
2,59.65,9.760717,-0.542614,0.421296,1.791667,-0.393064,-0.376812,-0.489091,0.152682,0.013889,...,0,0,0,0,0,0,0,0,0,0
3,43.57,8.371859,-0.911932,-0.069444,-0.625,-0.508671,-0.927536,-0.503636,-0.645117,-1.055556,...,0,0,0,0,0,0,0,0,0,0
4,58.07,9.635477,-0.735795,-0.398148,-0.666667,-0.49711,-0.876812,-0.481818,-0.678129,-0.944444,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Eliminamos la columna de la variable dependiente normalizada
df.drop('basic_boxcox', axis=1, inplace=True)

In [4]:
# Separamos el dataframe entre variables predictoras (X) y variable dependiente (y)
X = df.drop("basic", axis = 1)
y = df["basic"]

In [5]:
# dividimos ambos datos en 'train' y 'test' en una proporción 80-20:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
# primero generamos un modelo sin especificar hiperparámetros para después ir ajustando
bosque = RandomForestRegressor(random_state =0)
bosque

In [7]:
# lo entrenamos
bosque.fit(x_train, y_train)

In [8]:
y_test_rf = bosque.predict(x_test)
y_train_rf = bosque.predict(x_train)

- Extraer las métricas

In [9]:
# Utilizamos una función para extraer las métricas
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [10]:
rf1_results = metricas(y_test, y_train, y_test_rf, y_train_rf, "Random Forest 1")
rf1_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,36.084168,2770.1459,52.632176,0.582184,test,Random Forest 1
1,13.673047,446.196117,21.123355,0.940229,train,Random Forest 1


*Comprobamos que nuestro modelo tiene un problema de **underfitting**, dado que todos los valores de las métricas son más altas en el set de test que en el de train, excepto en **R2**, que es significativamente más baja, lo cual indica que no está realizando unas predicciones ajustadas a los datos reales.*

- Probamos a mejorar nuestro modelo cambiado los hiperparámetros

In [11]:
max_features = np.sqrt(len(x_train.columns))
max_features

15.231546211727817

In [12]:
param1 = {"max_depth": [10, 12, 14], 
        "max_features": [1,2,3,4],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [13]:
gs_rf = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param1, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [14]:
gs_rf.fit(x_train, y_train)

In [15]:
bosque2 = gs_rf.best_estimator_
bosque2

In [16]:
y_test2_rf = bosque2.predict(x_test)
y_train2_rf = bosque2.predict(x_train)

In [17]:
rf2_results = metricas(y_test, y_train, y_test2_rf, y_train2_rf, "Random Forest 2")
rf2_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,54.160856,4897.922589,69.98516,0.261255,test,Random Forest 2
1,54.513363,5528.329885,74.35274,0.259445,train,Random Forest 2


*Han salido una smetricas peores, lo que nos lleva a pensar que hay que aumentar los parámetros*

In [18]:
param2 = {"max_depth": [12, 14, 16], 
        "max_features": [3,4,5,6],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [19]:
gs_rf2 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param2, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [20]:
gs_rf2.fit(x_train, y_train)

In [21]:
bosque3 = gs_rf2.best_estimator_
bosque3

In [23]:
y_test3_rf = bosque3.predict(x_test)
y_train3_rf = bosque3.predict(x_train)

In [24]:
rf3_results = metricas(y_test, y_train, y_test3_rf, y_train3_rf, "Random Forest 3")
rf3_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,49.546004,4207.727371,64.866998,0.365356,test,Random Forest 3
1,48.959933,4717.1514,68.681522,0.368107,train,Random Forest 3


In [25]:
param3 = {"max_depth": [10, 12, 14], 
        "max_features": [12,13,14,15],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [26]:
gs_rf3 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param3, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [27]:
gs_rf3.fit(x_train, y_train)

In [28]:
bosque4 = gs_rf3.best_estimator_
bosque4

In [29]:
y_test4_rf = bosque4.predict(x_test)
y_train4_rf = bosque4.predict(x_train)

In [30]:
rf4_results = metricas(y_test, y_train, y_test4_rf, y_train4_rf, "Random Forest 4")
rf4_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,42.37505,3286.727883,57.329991,0.504269,test,Random Forest 4
1,39.144521,3360.913303,57.973384,0.549784,train,Random Forest 4


In [31]:
param4 = {"max_depth": [10, 12, 14], 
        "max_features": [15,16,17,18],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [32]:
gs_rf4 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param4, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [33]:
gs_rf4.fit(x_train, y_train)

In [35]:
bosque5 = gs_rf4.best_estimator_
bosque5

In [36]:
y_test5_rf = bosque5.predict(x_test)
y_train5_rf = bosque5.predict(x_train)

In [37]:
rf5_results = metricas(y_test, y_train, y_test5_rf, y_train5_rf, "Random Forest 5")
rf5_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,41.12095,3180.259298,56.393788,0.520327,test,Random Forest 5
1,37.121055,3122.473469,55.879097,0.581725,train,Random Forest 5


In [41]:
param5 = {"max_depth": [14, 16, 18], 
        "max_features": [18,19,20,21],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [42]:
gs_rf5 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param5, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [43]:
gs_rf5.fit(x_train, y_train)

In [44]:
bosque6 = gs_rf5.best_estimator_
bosque6

In [45]:
y_test6_rf = bosque6.predict(x_test)
y_train6_rf = bosque6.predict(x_train)

In [46]:
rf6_results = metricas(y_test, y_train, y_test6_rf, y_train6_rf, "Random Forest 6")
rf6_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,40.553805,3125.400583,55.905282,0.528601,test,Random Forest 6
1,35.766069,2936.42218,54.188764,0.606647,train,Random Forest 6


In [47]:
param6 = {"max_depth": [16, 18, 20], 
        "max_features": [24, 26, 28, 30],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [48]:
gs_rf6 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param6, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [49]:
gs_rf6.fit(x_train, y_train)

In [50]:
bosque7 = gs_rf6.best_estimator_
bosque7

In [51]:
y_test7_rf = bosque7.predict(x_test)
y_train7_rf = bosque7.predict(x_train)

In [52]:
rf7_results = metricas(y_test, y_train, y_test7_rf, y_train7_rf, "Random Forest 7")
rf7_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,39.022565,2978.951862,54.579775,0.55069,test,Random Forest 7
1,32.66633,2565.964844,50.655354,0.656272,train,Random Forest 7


In [53]:
param7 = {"max_depth": [14, 16, 18], 
        "max_features": [30, 34, 38, 40],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [54]:
gs_rf7 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param7, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [55]:
gs_rf7.fit(x_train, y_train)

In [56]:
bosque8 = gs_rf7.best_estimator_
bosque8

In [57]:
y_test8_rf = bosque8.predict(x_test)
y_train8_rf = bosque8.predict(x_train)

In [58]:
rf8_results = metricas(y_test, y_train, y_test8_rf, y_train8_rf, "Random Forest 8")
rf8_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,38.107237,2869.48252,53.567551,0.567201,test,Random Forest 8
1,31.359198,2440.855771,49.405018,0.673032,train,Random Forest 8


*Tras varias pruebas con los hiperparámetros nos percatamos de que por el momento el modelo se vuelve más eficinente al aumentar el numero máximo de 'features', mientras que la profundidad máxima se mantiene en torno a los 14-16*

In [59]:
param8 = {"max_depth": [12, 14, 16], 
        "max_features": [40, 50, 60, 70],
        "min_samples_split": [10, 50, 100],
        "min_samples_leaf": [10,50,100]} 

In [60]:
gs_rf8 = GridSearchCV(
            estimator=RandomForestRegressor(),
            param_grid= param8, 
            cv=10, 
            verbose=-1, 
            return_train_score = True, 
            scoring="neg_mean_squared_error") 

In [61]:
gs_rf8.fit(x_train, y_train)

In [62]:
bosque9 = gs_rf8.best_estimator_
bosque9

In [63]:
y_test9_rf = bosque9.predict(x_test)
y_train9_rf = bosque9.predict(x_train)

In [64]:
rf9_results = metricas(y_test, y_train, y_test9_rf, y_train9_rf, "Random Forest 9")
rf9_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,37.481146,2833.94386,53.234799,0.572561,test,Random Forest 9
1,29.530587,2201.613631,46.921356,0.70508,train,Random Forest 9


In [66]:
rf_metrics = pd.concat([rf1_results, rf2_results, rf3_results, rf4_results, rf5_results, rf6_results, rf7_results, rf8_results, rf9_results], axis=0)

In [67]:
rf_metrics

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,36.084168,2770.1459,52.632176,0.582184,test,Random Forest 1
1,13.673047,446.196117,21.123355,0.940229,train,Random Forest 1
0,54.160856,4897.922589,69.98516,0.261255,test,Random Forest 2
1,54.513363,5528.329885,74.35274,0.259445,train,Random Forest 2
0,49.546004,4207.727371,64.866998,0.365356,test,Random Forest 3
1,48.959933,4717.1514,68.681522,0.368107,train,Random Forest 3
0,42.37505,3286.727883,57.329991,0.504269,test,Random Forest 4
1,39.144521,3360.913303,57.973384,0.549784,train,Random Forest 4
0,41.12095,3180.259298,56.393788,0.520327,test,Random Forest 5
1,37.121055,3122.473469,55.879097,0.581725,train,Random Forest 5


In [69]:
# calculamos importancia de cada una de las variables incluidas en los dos modelos que mejor han funcionado: bosque y bosque9
importancia_predictores1 = pd.DataFrame(
                            {'predictor': x_train.columns,
                             'importancia': bosque.feature_importances_,
                             'modelo':'modelo 1'}
                            )


# ordenamos de mayor a menor los resultados
importancia_predictores.sort_values(by=["importancia"], ascending=False, inplace = True)

# printeamos los resultados
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores1

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia,modelo
0,mcdonalds,1.449925e-02,modelo 1
1,cappuccino,2.039755e-02,modelo 1
2,milk,2.100868e-02,modelo 1
3,rice,2.070311e-02,modelo 1
4,eggs,3.201835e-02,modelo 1
...,...,...,...
227,country_Venezuela,1.406186e-05,modelo 1
228,country_Vietnam,5.136072e-06,modelo 1
229,country_Yemen,1.479876e-05,modelo 1
230,country_Zambia,6.816855e-07,modelo 1


In [70]:
importancia_predictores9 = pd.DataFrame(
                            {'predictor': x_train.columns,
                             'importancia': bosque.feature_importances_,
                             'modelo':'modelo 9'}
                            )


# ordenamos de mayor a menor los resultados
importancia_predictores.sort_values(by=["importancia"], ascending=False, inplace = True)

# printeamos los resultados
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores9

Importancia de los predictores en el modelo
-------------------------------------------


Unnamed: 0,predictor,importancia,modelo
0,mcdonalds,1.449925e-02,modelo 9
1,cappuccino,2.039755e-02,modelo 9
2,milk,2.100868e-02,modelo 9
3,rice,2.070311e-02,modelo 9
4,eggs,3.201835e-02,modelo 9
...,...,...,...
227,country_Venezuela,1.406186e-05,modelo 9
228,country_Vietnam,5.136072e-06,modelo 9
229,country_Yemen,1.479876e-05,modelo 9
230,country_Zambia,6.816855e-07,modelo 9


In [71]:
comp_import = pd.concat([importancia_predictores1, importancia_predictores9], axis = 0)

In [73]:
comp_import

Unnamed: 0,predictor,importancia,modelo
0,mcdonalds,1.449925e-02,modelo 1
1,cappuccino,2.039755e-02,modelo 1
2,milk,2.100868e-02,modelo 1
3,rice,2.070311e-02,modelo 1
4,eggs,3.201835e-02,modelo 1
...,...,...,...
227,country_Venezuela,1.406186e-05,modelo 9
228,country_Vietnam,5.136072e-06,modelo 9
229,country_Yemen,1.479876e-05,modelo 9
230,country_Zambia,6.816855e-07,modelo 9


*Comprobamos que la variable de pais no está aportando información al modelo predictivo, si bien ninguna de nuestras variables predictoras tiene mucho peso lo que podría ser parte de la explicación de que ninguno de nuestros modelos alcance un elevado nivel de eficiencia*

- Debatid entre vosotras que modelo es mejor y por qué (basándose en las métricas)

*Para tomar esta decisión primero creamos un dataframe con las métricas de los distintos modelos generados*

In [87]:
reg_lin = pd.read_csv('../archivos/metricas_reg_lineal.csv', index_col=0)
reg_lin.drop('type', axis=1, inplace=True)

In [88]:
dec_tree = pd.read_csv('../archivos/metrics_decis_tree.csv', index_col=0)
dec_tree

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,51.2442,7109.425,84.31741,-0.05046899,test,Decission Tree 1 Label
1,2.9854740000000004e-17,8.485227e-31,9.211529e-16,1.0,train,Decission Tree 1 Label
0,47.51383,5215.515,72.21852,0.2133529,test,Decission Tree 2 Dummie
1,6.530724e-18,4.640358e-32,2.154149e-16,1.0,train,Decission Tree 2 Dummie
0,64.14522,6633.48,81.44618,-0.0005163437,test,Decission Tree 2 Dummie
1,66.35504,7465.113,86.40088,-2.220446e-16,train,Decission Tree 2 Dummie


In [89]:
total_metrics = pd.concat([reg_lin,dec_tree, rf1_results, rf9_results], axis=0)

In [90]:
total_metrics

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,27879990.0,1.009673e+17,317753500.0,-6119675000000000.0,test,Linear Regresion
1,0.6545078,0.9089533,0.9533904,0.9456784,train,Linear Regression
0,51.2442,7109.425,84.31741,-0.05046899,test,Decission Tree 1 Label
1,2.9854740000000004e-17,8.485227e-31,9.211529e-16,1.0,train,Decission Tree 1 Label
0,47.51383,5215.515,72.21852,0.2133529,test,Decission Tree 2 Dummie
1,6.530724e-18,4.640358e-32,2.154149e-16,1.0,train,Decission Tree 2 Dummie
0,64.14522,6633.48,81.44618,-0.0005163437,test,Decission Tree 2 Dummie
1,66.35504,7465.113,86.40088,-2.220446e-16,train,Decission Tree 2 Dummie
0,36.08417,2770.146,52.63218,0.5821837,test,Random Forest 1
1,13.67305,446.1961,21.12335,0.9402292,train,Random Forest 1


*Podemos concluir que las que los modelos con mayor capacidad predictiva son los realizados con el método Random Forest. No siendo tan clara la diferencia entre ellos, si bien observamos un amenor diferencia entre los valores R2 en el modelo Random Forest 9.*

