In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import utils.featurizer as ft
import utils.encodeador as my_encoder

In [2]:
train = pd.read_csv("data/train.csv")

In [3]:
test = pd.read_csv("data/test.csv")

In [4]:
df_test = pd.read_csv("data/test.csv")

In [5]:
train = ft.featurizer(train)
test = ft.featurizer(test)

In [6]:
drop_cols = ['titulo','fecha', 'idzona','descripcion', 'direccion', 'lat', 'lng', 'id']

In [7]:
X = train.drop(['precio'] + drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
y = train['precio']

In [8]:
X_oh, test = my_encoder.oneHotEncoder(X,test)

In [9]:
X_oh.shape

(240000, 603)

In [10]:
test.shape

(60000, 603)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_oh, y, test_size=0.25, random_state=42)

In [12]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10,100, 200, 300, 1000]
}

In [13]:
randomForest = RandomForestRegressor(**param_grid)

In [14]:
%%time
grid_search = GridSearchCV(estimator = randomForest, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error')

CPU times: user 269 µs, sys: 61 µs, total: 330 µs
Wall time: 88.9 µs


In [15]:
%%time
grid_search = GridSearchCV(estimator = randomForest, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring='neg_mean_absolute_error')

CPU times: user 116 µs, sys: 27 µs, total: 143 µs
Wall time: 39.3 µs


In [16]:
%%time
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 48.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 85.1min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 93.7min finished


CPU times: user 19.1 s, sys: 3.53 s, total: 22.6 s
Wall time: 1h 33min 46s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=[True], criterion='mse',
                                             max_depth=[80, 90, 100, 110],
                                             max_features=[2, 3],
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=[3, 4, 5],
                                             min_samples_split=[8, 10, 12],
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=[10, 100, 200, 300,
                                                           1000],
                                             n_j...lse,
                                             random_state=None, verbose=0,
                    

In [17]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 12,
 'n_estimators': 10}

In [18]:
predOh = grid_search.predict(X_test)

In [19]:
mean_absolute_error(y_test,predOh)

1384430.0572451013

In [20]:
pred = grid_search.predict(test)

In [21]:
res = pd.DataFrame(pred, index=df_test.id, columns=['target'])
display(res.head())
res.to_csv("data/workshop-randomforest-v4_5.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,2444812.0
51775,2342307.0
115253,2176481.0
299321,2009820.0
173570,1882253.0


In [22]:
##Con label encoding

In [23]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import utils.featurizer as ft
import utils.encodeador as my_encoder

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
df_test = pd.read_csv("data/test.csv")

In [25]:
drop_cols = ['titulo','fecha','ciudad','idzona','descripcion', 'direccion', 'lat', 'lng', 'id']

In [26]:
train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio'],
      dtype='object')

In [27]:
train = ft.featurizer(train)
train = my_encoder.label_encoder(train)
test = ft.featurizer(test)
test = my_encoder.label_encoder(test)

In [28]:
some_param = {'bootstrap': True,
 'max_depth': 110,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 1000}

In [29]:
X = train.drop(['precio'] + drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
y = train['precio']

In [30]:
X.columns

Index(['tipodepropiedad', 'provincia', 'antiguedad', 'habitaciones', 'garages',
       'banos', 'metroscubiertos', 'metrostotales', 'gimnasio',
       'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'year', 'mes', 'dia_del_mes',
       'dia_del_anio', 'quarter', 'aire_libre',
       'metro_promedio_por_cuadricula', 'is_DF'],
      dtype='object')

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [32]:
randomForest = RandomForestRegressor(**some_param)

In [33]:
%%time
randomForest.fit(X_train,y_train)

CPU times: user 3min 14s, sys: 921 ms, total: 3min 15s
Wall time: 3min 15s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=110,
                      max_features=3, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=3, min_samples_split=8,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [34]:
predMe = randomForest.predict(X_test)

In [35]:
mean_absolute_error(y_test,predMe)

690535.8103836384

In [36]:
pred2 = randomForest.predict(test)

In [37]:
res2 = pd.DataFrame(pred2, index=df_test.id, columns=['target'])
display(res2.head())
res.to_csv("data/workshop-randomforest-v4_6.csv", header=True)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
4941,5056882.0
51775,1192559.0
115253,2540687.0
299321,1572990.0
173570,645219.8


In [38]:
feature_importances = pd.DataFrame(randomForest.feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
metroscubiertos,0.216994
metrostotales,0.131809
banos,0.103623
metro_promedio_por_cuadricula,0.101638
provincia,0.071292
is_DF,0.067656
tipodepropiedad,0.05805
aire_libre,0.040359
habitaciones,0.035267
garages,0.03436
