In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import linregress

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [2]:
os.chdir('/Users/MatthewBarnette/data_science/house_prices/data/')
train = pd.read_csv('train.csv')

In [3]:
train.LotFrontage = pd.to_numeric(train.LotFrontage)

In [4]:
train[train.select_dtypes(include='object').columns.values] = train[train.select_dtypes(include='object').columns.values].fillna('None')
train[train.select_dtypes(exclude='object').columns.values] = train[train.select_dtypes(exclude='object').columns.values].fillna(0)

In [5]:
class MultiLabelEncoder:
    
    def __init__(self,columns=None,nafill=None):
        self.columns = columns
        self.nafill = nafill
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        '''
        Transforms columns specified. If none specified returns all columns
        '''
        if self.columns is not None:
            for column in self.columns:
                X[column] = LabelEncoder().fit_transform(X[column])
        else:
            for colname,column in output.iteritems():
                X[colname] = LabelEncoder().fit_transform(X[column])
        return X
    
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [6]:
train_n = train.select_dtypes(exclude='object')

In [7]:
train_on = MultiLabelEncoder(columns=train.select_dtypes(include='object').columns.values).fit_transform(train)

In [8]:
def corr_columns(df):
    df = df.corr(method='pearson')
    df = df[(df.SalePrice >= .5)]
    df = df[df.index]
    return df, df.SalePrice[0:len(df.index)-1]

In [9]:
train_matrix_on, sale_corr_on = corr_columns(train_on)
train_matrix_n, sale_corr_n = corr_columns(train_n)

In [10]:
sale_corr_on

OverallQual     0.790982
YearBuilt       0.522897
YearRemodAdd    0.507101
TotalBsmtSF     0.613581
1stFlrSF        0.605852
GrLivArea       0.708624
FullBath        0.560664
TotRmsAbvGrd    0.533723
GarageCars      0.640409
GarageArea      0.623431
Name: SalePrice, dtype: float64

In [11]:
sale_corr_n

OverallQual     0.790982
YearBuilt       0.522897
YearRemodAdd    0.507101
TotalBsmtSF     0.613581
1stFlrSF        0.605852
GrLivArea       0.708624
FullBath        0.560664
TotRmsAbvGrd    0.533723
GarageCars      0.640409
GarageArea      0.623431
Name: SalePrice, dtype: float64

In [12]:
train_on_columns = train_on.columns.values
train_n_columns = train_n.columns.values

x_train_on, x_test_on, y_train_on, y_test_on = train_test_split(train_on[train_on_columns[1:-1]],train_on['SalePrice'])
x_train_n, x_test_n, y_train_n, y_test_n = train_test_split(train_n[train_n_columns[1:-1]],train_n['SalePrice'])

In [13]:
feat_select_on = SequentialFeatureSelector(RandomForestRegressor(),k_features=10,forward=True,verbose=2,cv=5,n_jobs=-1,scoring='r2')
feat_select_n = SequentialFeatureSelector(RandomForestRegressor(),k_features=10,forward=True,verbose=2,cv=5,n_jobs=-1,scoring='r2')

In [14]:
feat_select_on.fit(x_train_on,y_train_on)

[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  72 out of  79 | elapsed:    3.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  79 out of  79 | elapsed:    3.8s finished

[2018-08-05 19:49:17] Features: 1/10 -- score: 0.6697574571365201[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  78 out of  78 | elapsed:    4.1s finished

[2018-08-05 19:49:22] Features: 2/10 -- score: 0.7171424203130872[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  70 out of  77 | elapsed:    3.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  77 out of  77 | elapsed:    3.5s finished

[2018-08-05 19:49:25] Features: 3/10 -- score: 0.772702677873023[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  76 out of  76 | elapsed:    4.6s finished

[2018-08-05 19:49:30] Features: 4/10 -- score: 0.8233710649877957[Parallel(n_jobs=-1)]: Done  33 tasks      

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [15]:
print(feat_select_on.subsets_[10])

x_train_on = x_train_on[list(feat_select_on.k_feature_names_)]
x_test_on = x_test_on[list(feat_select_on.k_feature_names_)]

{'feature_idx': (6, 11, 16, 19, 33, 43, 45, 71, 72, 74), 'cv_scores': array([0.87390522, 0.83740673, 0.86616183, 0.86789995, 0.87222995]), 'avg_score': 0.8635207370347053, 'feature_names': ('LotShape', 'Neighborhood', 'OverallQual', 'YearRemodAdd', 'BsmtFinSF1', '2ndFlrSF', 'GrLivArea', 'PoolQC', 'Fence', 'MiscVal')}


In [16]:
feat_select_n.fit(x_train_n,y_train_n)

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    1.5s finished

[2018-08-05 19:50:05] Features: 1/10 -- score: 0.6610293184560352[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed:    1.4s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    1.5s finished

[2018-08-05 19:50:07] Features: 2/10 -- score: 0.7046347933945161[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:    1.7s finished

[2018-08-05 19:50:09] Features: 3/10 -- score: 0.7352628366172123[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    1.9s finished

[2018-08-05 19:50:11] Features: 4/10 -- score: 0.7741578515565857[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    2.1s finished

[2018-08-05 19:50:13] Features: 5/10 -- score: 0.7982018452691719[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    2.1s finished

[2018-08-05 19:50:15] Features: 6/10 -- score: 0.8150492472149449[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.1s finished

[2018-08-05 19:50:17] Fea

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [17]:
print(feat_select_n.subsets_[10])

x_train_n = x_train_n[list(feat_select_n.k_feature_names_)]
x_test_n = x_test_n[list(feat_select_n.k_feature_names_)]

{'feature_idx': (1, 2, 3, 4, 5, 6, 12, 13, 16, 20), 'cv_scores': array([0.84637608, 0.83869889, 0.83876872, 0.80919379, 0.81532291]), 'avg_score': 0.8296720784846316, 'feature_names': ('LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BedroomAbvGr')}


In [18]:
parameters = {'n_estimators':[10,25,50,100],'max_depth':[None,1,2,3],'min_samples_leaf':[1,2,3,4]}

In [37]:
gs_on = GridSearchCV(estimator=RandomForestRegressor(),param_grid=parameters)

In [38]:
gs_on.fit(x_train_on,y_train_on)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 25, 50, 100], 'max_depth': [None, 1, 2, 3], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
gs_on.best_params_

{'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 50}

In [40]:
gs_on.fit(x_train_on,y_train_on)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 25, 50, 100], 'max_depth': [None, 1, 2, 3], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [41]:
gs_on.best_params_

{'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 100}

In [42]:
gs_on_rf = gs_on.best_estimator_

In [43]:
gs_on_rf.fit(x_train_on,y_train_on)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [44]:
predictions_on = gs_on.predict(x_test_on)

In [45]:
np.sqrt(mean_squared_error(y_true=np.log(y_test_on),y_pred=np.log(predictions_on)))

0.17121223599488164

In [46]:
gs_n = GridSearchCV(estimator=RandomForestRegressor(),param_grid=parameters)

In [47]:
gs_n.fit(x_train_n,y_train_n)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 25, 50, 100], 'max_depth': [None, 1, 2, 3], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [48]:
gs_n.best_params_

{'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 50}

In [49]:
gs_n_rf = gs_n.best_estimator_

In [50]:
gs_n_rf.fit(x_train_n,y_train_n)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
predictions_n = gs_n_rf.predict(x_test_n)

In [52]:
np.sqrt(mean_squared_error(y_true=np.log(y_test_n),y_pred=np.log(predictions_n)))

0.15591493035317877

pipe = Pipeline([('feature_selection', feat_select),
                 ('rfr',RandomForestRegressor())])