In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Modelado y evaluación
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Barra de progreso de un proceso
# ------------------------------------------------------------------------------
from tqdm import tqdm

# Configuración warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings('once')

In [2]:
df = pd.read_csv("../ficheros/diamons_enconding.csv", index_col = 0)
df.reset_index(drop= True, inplace= True) 
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_map,color_map,clarity_map
0,0.23,61.5,55.0,326,3.95,3.98,2.43,4,5,1
1,0.21,59.8,61.0,326,3.89,3.84,2.31,3,5,2
2,0.23,56.9,65.0,327,4.05,4.07,2.31,1,5,4
3,0.29,62.4,58.0,334,4.2,4.23,2.63,3,1,3
4,0.31,63.3,58.0,335,4.34,4.35,2.75,1,0,1


In [3]:
# Separamos nuestro dataframe en X e y

X = df.drop("price", axis = 1)
y = df["price"]

In [4]:
# y dividimos nuestros datos en train y test para poder evaluar la bondad de nuestro modelo

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [12]:
# Definicimos los hiperparámetro con los que queremos probar para el GridSearch
param = {"max_depth": [28,32,34], 
        "max_features": [3,4],
        "min_samples_split": [100, 500],
        "min_samples_leaf": [50,100]} 

In [13]:
gs_rf = GridSearchCV(
            estimator=RandomForestRegressor(), # tipo de modelo que queremos hacer
            param_grid= param, # que hiperparámetros queremos que testee
            cv=10, # crossvalidation que aprendimos en la lección de regresión lineal intro. 
            verbose=-1, # para que no nos printee ningún mensaje en pantalla
            return_train_score = True, # para que nos devuelva el valor de las métricas de set de datos de entrenamiento
            scoring="neg_mean_squared_error") # la métrica que queremos que nos devuelva

In [14]:
# ajustamos el modelo que acabamos de definir en el GridSearch

gs_rf.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [28, 32, 34], 'max_features': [3, 4],
                         'min_samples_leaf': [50, 100],
                         'min_samples_split': [100, 500]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=-1)

In [15]:
# al igual que el decision tree podemos sacar cual es nuestro mejor bosque
# En este caso,  nuestro mejor bosque es aquel que esta formado por arboles de 6 de profundidad, usa 4 variables y que tiene  un min_samples_leaf y un min_samples_split de 10. 

bosque = gs_rf.best_estimator_
bosque

RandomForestRegressor(max_depth=32, max_features=4, min_samples_leaf=50,
                      min_samples_split=100)

In [16]:
# Calculamos sus métricas
y_pred_test_rf = bosque.predict(x_test)
y_pred_train_rf = bosque.predict(x_train)

In [17]:
# Con esta función calculamos las métricas y las convertimos en dataframe
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    
    resultados = {'MAE': [mean_absolute_error(y_test, y_test_pred), mean_absolute_error(y_train, y_train_pred)],
                'MSE': [mean_squared_error(y_test, y_test_pred), mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(mean_squared_error(y_test, y_test_pred)), np.sqrt(mean_squared_error(y_train, y_train_pred))],
                'R2':  [r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [18]:
# sacamos las métricas para ver si hay overfitting o unerfitting, para modificar la profundidad en función de estos resultados
df_results = metricas(y_test, y_train, y_pred_test_rf, y_pred_train_rf, "Random Forest")
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,261.489592,210245.905054,458.525795,0.972326,test,Random Forest
1,252.056071,192547.883626,438.802784,0.974827,train,Random Forest


In [19]:
df_resultados_DT = pd.read_csv("../ficheros/resultados_diamons_DT.csv", index_col= 0)
df_resultados_DT

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,590.914063,713235.49913,844.532711,0.90612,test,Linear Regresion
1,592.658127,722504.482639,850.002637,0.905544,train,LinearRegression
0,267.159319,241786.78973,491.718202,0.968175,test,Decission Tree I
1,0.372776,42.506991,6.519739,0.999994,train,Decission Tree I
0,344.228556,370500.891199,608.687844,0.951233,test,Decision tree II
1,338.589162,353677.770194,594.708139,0.953762,train,Decision tree II
0,289.775443,252865.323427,502.85716,0.966717,test,Decision tree III
1,282.196022,235744.622123,485.535397,0.96918,train,Decision tree III
0,305.922207,281393.41124,530.465278,0.962962,test,Decision tree IV
1,289.919713,248256.639846,498.25359,0.967544,train,Decision tree IV


In [20]:
df_todos_resultados = pd.concat([df_results, df_resultados_DT], axis = 0)
df_todos_resultados

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,261.489592,210245.905054,458.525795,0.972326,test,Random Forest
1,252.056071,192547.883626,438.802784,0.974827,train,Random Forest
0,590.914063,713235.49913,844.532711,0.90612,test,Linear Regresion
1,592.658127,722504.482639,850.002637,0.905544,train,LinearRegression
0,267.159319,241786.78973,491.718202,0.968175,test,Decission Tree I
1,0.372776,42.506991,6.519739,0.999994,train,Decission Tree I
0,344.228556,370500.891199,608.687844,0.951233,test,Decision tree II
1,338.589162,353677.770194,594.708139,0.953762,train,Decision tree II
0,289.775443,252865.323427,502.85716,0.966717,test,Decision tree III
1,282.196022,235744.622123,485.535397,0.96918,train,Decision tree III


Lo que mejor nos parece son los resultados que hemos obtenido en el random forest. Tenemos en los dos casos un R2 alto y no hay mucha diferencia entre los RMSE.