# Desafío - Random Forest

### Ejercicio 1: Preparación del ambiente de trabajo

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
df = pd.read_csv('ames_housing.csv').drop('Unnamed: 0', axis=1)

In [3]:
df

Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,Utilities,Lot_Config,...,Fence,Misc_Feature,Misc_Val,Mo_Sold,Year_Sold,Sale_Type,Sale_Condition,Sale_Price,Longitude,Latitude
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141,31770,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,,0,5,2010,WD,Normal,215000,-93.619754,42.054035
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80,11622,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,6,2010,WD,Normal,105000,-93.619756,42.053014
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81,14267,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,Gar2,12500,6,2010,WD,Normal,172000,-93.619387,42.052659
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93,11160,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Fence,,0,4,2010,WD,Normal,244000,-93.617320,42.051245
4,Two_Story_1946_and_Newer,Residential_Low_Density,74,13830,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,3,2010,WD,Normal,189900,-93.638933,42.060899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,Split_or_Multilevel,Residential_Low_Density,37,7937,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,CulDSac,...,Good_Privacy,,0,3,2006,WD,Normal,142500,-93.604776,41.988964
2926,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,0,8885,Pave,No_Alley_Access,Slightly_Irregular,Low,AllPub,Inside,...,Minimum_Privacy,,0,6,2006,WD,Normal,131000,-93.602680,41.988314
2927,Split_Foyer,Residential_Low_Density,62,10441,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,Minimum_Privacy,Shed,700,7,2006,WD,Normal,132000,-93.606847,41.986510
2928,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,77,10010,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Fence,,0,4,2006,WD,Normal,170000,-93.600190,41.990921


### Ejercicio 2: Importación de archivos serializados

##### Cargo el modelo del desafio anterior

In [4]:
modelo_cargado = joblib.load('modelo_arboles_regresion.pkl')

In [5]:
modelo_cargado

DecisionTreeRegressor(max_depth=10, max_features=10)

In [6]:
df_analisis_anterior = df.loc[:,['Garage_Cars', 'Gr_Liv_Area', 'Year_Built', 'Total_Bsmt_SF',
       'First_Flr_SF', 'Fireplaces', 'Latitude', 'Lot_Area', 'Year_Remod_Add',
       'Bsmt_Unf_SF', 'Sale_Price']]

In [7]:
df_analisis_anterior

Unnamed: 0,Garage_Cars,Gr_Liv_Area,Year_Built,Total_Bsmt_SF,First_Flr_SF,Fireplaces,Latitude,Lot_Area,Year_Remod_Add,Bsmt_Unf_SF,Sale_Price
0,2,1656,1960,1080,1656,2,42.054035,31770,1960,441,215000
1,1,896,1961,882,896,0,42.053014,11622,1961,270,105000
2,1,1329,1958,1329,1329,0,42.052659,14267,1958,406,172000
3,2,2110,1968,2110,2110,2,42.051245,11160,1968,1045,244000
4,2,1629,1997,928,928,1,42.060899,13830,1998,137,189900
...,...,...,...,...,...,...,...,...,...,...,...
2925,2,1003,1984,1003,1003,0,41.988964,7937,1984,184,142500
2926,2,902,1983,864,902,0,41.988314,8885,1983,239,131000
2927,0,970,1992,912,970,0,41.986510,10441,1992,575,132000
2928,2,1389,1974,1389,1389,1,41.990921,10010,1975,195,170000


In [8]:
y=df_analisis_anterior['Sale_Price']
X=df_analisis_anterior.loc[:,'Garage_Cars':'Bsmt_Unf_SF']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=1545)

In [9]:
def metricas(y_test, x_test ,model):
    y_predict = model.predict(x_test)
    
    print(f'Test MSE: {mean_squared_error(y_test, y_predict).round(5)}')
    print(f'Test MAE: {median_absolute_error(y_test, y_predict).round(5)}')
    print(f'Test R2:  {r2_score(y_test, y_predict).round(5)}')

##### Ejecuto el modelo anterior para obtener sus metricas

In [10]:
model=modelo_cargado.fit(X_train, y_train)

In [11]:
metricas(y_test, X_test, model)

Test MSE: 1534120054.93426
Test MAE: 14833.33333
Test R2:  0.77008


### Ejercicio 3: Evaluación Random Forest

######  Modelo Random Forest sin modificar hiperparametros.

In [12]:
model_random_forest = RandomForestClassifier(oob_score=True ,random_state=45564).fit(X_train, y_train)

In [13]:
metricas(y_test, X_test, model_random_forest)

Test MSE: 1735118676.7787
Test MAE: 14000.0
Test R2:  0.73996


- Se observa una disminucion del valor MAE, lo que es positivo.
- El valor de R2 disminuye tambien, lo que indica que el modelo del desafio anterior se ajusta más a los datos observados.
- El valor de MSE aumenta, pero en este caso no es tan relevante esta metrica ya que se ve influenciada por el alto valor de la desviacion estandar.

 ### Ejercicio 4: Reporte las métricas de desempeño

#### El siguiente codigo no funciono, luego de probar varias posibilidades creo que puede ser por la cantidad de estimadores, al ser tan grande el pc se demora cada vez mas en ejecutar el modelo nuevamente, llegando al punto de no ejecutarse más. 

tmp_oob_none, tmp_oob_sqrt, tmp_oob_log2 = [], [], []
tmp_test_acc_none, tmp_test_acc_sqrt, tmp_test_acc_log = [], [], []
n_estimators = range( 20 , 1000 ,  25 )
for i in n_estimators:
#### Implementamos una variante con todos los atributos
    voting_rf_none = RandomForestClassifier(n_estimators=i,max_features= None ,
    oob_score=True ,
    random_state= 123).fit(X_train,y_train)
#### Implementamos una variante donde los atributos se escogen con sqrt
    voting_rf_sqrt = RandomForestClassifier(n_estimators= i,
    max_features= "sqrt" ,
    warm_start= True ,
    oob_score= True ,
    random_state= 123 ).fit(X_train, y_train)
#### Implementamos una variante donde los atributos se escogen con log
    voting_rf_log = RandomForestClassifier(n_estimators= i,
    max_features="log2" ,
    warm_start= True ,
    oob_score=True ,
    random_state= 123 ).fit(X_train, y_train)
#### Estimamos el error en OOB
    tmp_oob_none.append( 1 - voting_rf_none.oob_score_)
    tmp_oob_sqrt.append( 1 - voting_rf_sqrt.oob_score_)
    tmp_oob_log2.append( 1m - voting_rf_log.oob_score_)
#### Estimamos el error en la r2_score
    tmp_test_acc_none.append(r2_score(y_test,voting_rf_none.predict(X_test)))
    tmp_test_acc_sqrt.append(r2_score(y_test,voting_rf_sqrt.predict(X_test)))
    tmp_test_acc_log.append(r2_score(y_test,voting_rf_log.predict(X_test)))

### Ejercicio 5: Refactoriza el modelo

In [23]:
params = {'n_estimators':list(range(20,1000,80)), 'max_features': [None, 'sqrt','log2'], 'max_depth':[1,2,5,10]}

In [24]:
cv_rfc = GridSearchCV(RandomForestClassifier(oob_score=True, random_state=123), param_grid=params, cv=5, 
                      n_jobs=-1).fit(X_train, y_train)



KeyboardInterrupt: 

No logre hacerlo funcionar...