In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import linregress

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [2]:
os.chdir('/Users/MatthewBarnette/data_science/house_prices/data/')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
def get_numeric(df,*args):
    for arg in args:
        df[arg] = pd.to_numeric(df[arg])
    df = df.select_dtypes(exclude='object')
    df.fillna(0,inplace=True)
    return df

In [4]:
train = get_numeric(train,'LotFrontage')
test = get_numeric(test,'LotFrontage')
test_id = test.pop('Id')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [5]:
class MultiLabelEncoder:
    
    def __init__(self,columns=None,nafill=None):
        self.columns = columns
        self.nafill = nafill
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        '''
        Transforms columns specified. If none specified returns all columns
        '''
        if self.columns is not None:
            for column in self.columns:
                X[column] = LabelEncoder().fit_transform(X[column])
        else:
            for colname,column in output.iteritems():
                X[colname] = LabelEncoder().fit_transform(X[column])
        return X
    
    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [6]:
def corr_columns(df):
    df = df.corr(method='pearson')
    df = df[(df.SalePrice >= .5)]
    df = df[df.index]
    return df, df.SalePrice[0:len(df.index)-1]

In [7]:
train_columns = train.columns.values

x_train, x_validate, y_train, y_validate = train_test_split(train[train_columns[1:-1]],train['SalePrice'])

In [8]:
feat_select = SequentialFeatureSelector(RandomForestRegressor(),k_features=10,forward=True,verbose=2,cv=5,n_jobs=-1,scoring='r2')

In [9]:
feat_select.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    1.4s finished

[2018-08-11 18:06:40] Features: 1/10 -- score: 0.6835086978750173[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed:    1.4s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    1.7s finished

[2018-08-11 18:06:42] Features: 2/10 -- score: 0.7344913358854023[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:    2.6s finished

[2018-08-11 18:06:45] Features: 3/10 -- score: 0.7988703178399053[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    2.3s finished

[2018-08-11 18:06:47] Features: 4/10 -- score: 0.8339092836234435[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    2.9s finished

[2018-08-11 18:06:50] Features: 5/10 -- score: 0.847814782717818[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    2.8s finished

[2018-08-11 18:06:53] Features: 6/10 -- score: 0.8567479446453096[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.7s finished

[2018-08-11 18:06:56] Feat

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [10]:
print(feat_select.subsets_[10])

x_train = x_train[list(feat_select.k_feature_names_)]
x_validate = x_validate[list(feat_select.k_feature_names_)]
test = test[list(feat_select.k_feature_names_)]

{'feature_idx': (3, 6, 8, 9, 12, 15, 18, 19, 23, 24), 'cv_scores': array([0.90381151, 0.76320381, 0.87252372, 0.90773779, 0.85916123]), 'avg_score': 0.8612876116299748, 'feature_names': ('OverallQual', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', 'GrLivArea', 'FullBath', 'HalfBath', 'Fireplaces', 'GarageYrBlt')}


In [11]:
parameters = {'n_estimators':[10,25,50,100],'max_depth':[None,1,2,3],'min_samples_leaf':[1,2,3,4]}

In [12]:
gs = GridSearchCV(estimator=RandomForestRegressor(),param_grid=parameters)

In [13]:
gs.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 25, 50, 100], 'max_depth': [None, 1, 2, 3], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
gs.best_params_

{'max_depth': None, 'min_samples_leaf': 2, 'n_estimators': 50}

In [15]:
gs_rf = gs.best_estimator_

In [16]:
gs_rf.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
predictions = gs_rf.predict(x_validate)

In [18]:
np.sqrt(mean_squared_error(y_true=np.log(y_validate),y_pred=np.log(predictions)))

0.1740172082047193

In [19]:
test['SalePrice'] = gs_rf.predict(test)
test['Id'] = test_id

In [20]:
test = test[['Id','SalePrice']]

In [23]:
os.chdir('/Users/MatthewBarnette/data_science/house_prices/predictions/')
test.to_csv('randomforest.csv',index=False)