In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the training table from a CSV path relative to the notebook's working directory.
data = pd.read_csv('input/train.csv')
# Disabled experiment: casting DistrictId to str. It is commented out, so the
# column keeps whatever dtype read_csv inferred.
#data['DistrictId'] = data['DistrictId'].astype(str)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Remove the two columns that are excluded from modelling.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so on a
# second execution the columns are already gone and a plain drop would raise
# KeyError.
# NOTE(review): presumably these columns are dropped because of missing values — confirm.
data = data.drop(['LifeSquare', 'Healthcare_1'], axis=1, errors='ignore')

# One-hot encode whatever non-numeric columns remain (pandas default behaviour
# of get_dummies when no `columns` argument is given).
data = pd.get_dummies(data)

# 70/30 train/validation split with a fixed seed so the split is repeatable.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

# Baseline forest fitted on every column except the target 'Price'.
rfr = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42, max_features=3)
rfr.fit(train.drop(['Price'], axis=1), train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=None, oob_score=False,
           random_state=42, verbose=0, warm_start=False)

In [13]:
# Predict on both splits with the fitted forest, feeding every column except
# the target. The generator keeps the loop variable out of the notebook namespace.
pred_train, pred_valid = (
    rfr.predict(split.drop('Price', axis=1)) for split in (train, valid)
)

In [14]:
from sklearn.metrics import r2_score as r2, mean_squared_error as mse
# Training-split fit quality, displayed as (R^2, MSE).
tuple(metric(train['Price'], pred_train) for metric in (r2, mse))

(0.8800278239645121, 1035813932.6234682)

In [15]:
# Validation-split fit quality, displayed as (R^2, MSE).
tuple(metric(valid['Price'], pred_valid) for metric in (r2, mse))

(0.7169612862784742, 2434629671.077948)

In [16]:
# Raw importance score per input column of the fitted forest, shown as a bare
# array (no column names attached here).
rfr.feature_importances_

array([0.02373087, 0.05711513, 0.15806727, 0.27193162, 0.06269558,
       0.02969031, 0.03431065, 0.04848477, 0.03922218, 0.06632896,
       0.07163036, 0.06403865, 0.03036719, 0.03571006, 0.00046588,
       0.0004694 , 0.00138896, 0.00146402, 0.00142388, 0.00146426])

In [18]:
# Pair each model input column (all of `data` except the target) with its
# importance score, ordered from least to most important.
feat_imp = pd.DataFrame(
    {
        'feature': data.columns.drop('Price'),
        'importance': rfr.feature_importances_,
    }
).sort_values(by='importance')

# Names of the ten highest-scoring columns (the last ten rows after the ascending sort).
feats = feat_imp['feature'].tail(10).values
feats

array(['Shops_1', 'Ecology_1', 'HouseYear', 'DistrictId', 'KitchenSquare',
       'Social_3', 'Social_1', 'Social_2', 'Rooms', 'Square'],
      dtype=object)

In [28]:
# Removed leftover interactive help lookup (`?rfr`): it only opened the IPython
# pager and produced no value that any other cell uses.

In [19]:
# Second forest: deeper trees, fewer candidate features per split, trained only
# on the columns listed in `feats`. Rebinding `rfr` replaces the earlier model.
rfr = RandomForestRegressor(
    max_features=2,
    max_depth=20,
    n_estimators=1000,
    random_state=100,
)
rfr.fit(X=train[feats], y=train['Price'])


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=None, oob_score=False,
           random_state=100, verbose=0, warm_start=False)

In [20]:
# Predict on both splits using only the selected `feats` columns.
pred_train, pred_valid = (rfr.predict(split[feats]) for split in (train, valid))

In [21]:
# Training-split fit quality for the reduced-feature forest, displayed as (R^2, MSE).
tuple(metric(train['Price'], pred_train) for metric in (r2, mse))

(0.9605947634041861, 340216327.09399676)

In [22]:
# Validation-split fit quality for the reduced-feature forest, displayed as (R^2, MSE).
tuple(metric(valid['Price'], pred_valid) for metric in (r2, mse))

(0.7256267915170943, 2360091118.0601344)

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Exhaustive search over forest size, features per split and tree depth
# (3 * 4 * 3 = 36 combinations), scored by R^2 with 3-fold cross-validation
# on the training split only.
parameters = [
    dict(
        n_estimators=[900, 1000, 1100],
        max_features=[1, 2, 3, 4],
        max_depth=[18, 19, 20],
    )
]
GSCV = GridSearchCV(
    RandomForestRegressor(random_state=100),
    parameters,
    cv=3,
    scoring='r2',
    n_jobs=3,
    verbose=1,
)
GSCV.fit(train[feats], train['Price'])

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:  3.6min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=100, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid=[{'n_estimators': [900, 1000, 1100], 'max_features': [1, 2, 3, 4], 'max_depth': [18, 19, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=1)

In [25]:
# Parameter combination selected by the grid search.
GSCV.best_params_

{'max_depth': 18, 'max_features': 3, 'n_estimators': 1100}

In [42]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded unless some other cell imported it first.
import sklearn.metrics

# List the scorer names accepted by `scoring=`. Newer scikit-learn releases
# expose get_scorer_names() and no longer provide the SCORERS dict; older
# releases only have SCORERS, so fall back to it when the function is missing.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
scorer_names

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [22]:
# NOTE(review): duplicate of the earlier best_params_ cell. Its execution count
# (22) is lower than that of the grid-search cell (24) and the recorded output
# differs from the earlier one, so this output appears to come from a previous
# run — consider deleting this cell.
GSCV.best_params_

{'max_depth': 19, 'max_features': 2, 'n_estimators': 1000}

In [26]:
# R^2 of the grid search's predictions on the training split.
# NOTE: despite its name, `y_pred_proba` holds regression predictions, not probabilities.
y_pred_proba = GSCV.predict(X=train[feats])
r2(y_true=train['Price'], y_pred=y_pred_proba)

0.9558889222541738

In [27]:
# R^2 of the grid search's predictions on the held-out validation split.
# NOTE: despite its name, `y_pred_proba` holds regression predictions, not probabilities.
y_pred_proba = GSCV.predict(X=valid[feats])
r2(y_true=valid['Price'], y_pred=y_pred_proba)

0.7260852630892394