I used the following kernel as a starting point for how to process the features:

https://www.kaggle.com/klyusba/house-prices-advanced-regression-techniques/lasso-model-for-regression-problem

In [199]:
# import libraries

import pandas as pd
import numpy as np
import math

In [200]:
# Load the training and test sets from the CSV files in the working directory.

train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [201]:
# Stack the feature columns (MSSubClass through SaleCondition) of train and
# test into one frame so both get exactly the same preprocessing.

all_data = pd.concat(
    [frame.loc[:, 'MSSubClass':'SaleCondition'] for frame in (train, test)],
    ignore_index=True,
)

Handle missing values

In [202]:
# lot frontage - if absent, assume it is the sqrt of the lot area
# (i.e. treat the lot as a square)

all_data['LotFrontage'] = all_data['LotFrontage'].fillna(
    np.sqrt(all_data['LotArea'])
)

For many features, NaN can be interpreted as the absence of that feature.

For other features, filling with the median (or the most common value) is more appropriate.

In [203]:
# Fill missing values column by column. For most categorical columns a NaN
# is replaced by an explicit "absent" label; a handful of columns get the
# label hard-coded below as their typical value, or a median/mean.

# alley
all_data.loc[all_data.Alley.isnull(), 'Alley'] = 'NoAlley'

# masonry veneer: a missing type is labelled 'NoVnr' and its area set to 0
all_data.loc[all_data.MasVnrType.isnull(), 'MasVnrType'] = 'NoVnr'
all_data.loc[all_data.MasVnrType=='NoVnr', 'MasVnrArea'] = 0

# basement quality, condition, exposure, finish etc.
for bsmt_col in ['BsmtQual', 'BsmtCond', 'BsmtExposure',
                 'BsmtFinType1', 'BsmtFinType2']:
    all_data.loc[all_data[bsmt_col].isnull(), bsmt_col] = 'NoBsmt'
# basement areas: 0 where the matching label says there is no basement,
# median for any value that is still missing afterwards
all_data.loc[all_data.BsmtFinType1=='NoBsmt', 'BsmtFinSF1'] = 0
all_data.loc[all_data.BsmtFinType2=='NoBsmt', 'BsmtFinSF2'] = 0
all_data.loc[all_data.BsmtFinSF1.isnull(), 'BsmtFinSF1'] = all_data.BsmtFinSF1.median()
all_data.loc[all_data.BsmtQual=='NoBsmt', 'BsmtUnfSF'] = 0
all_data.loc[all_data.BsmtUnfSF.isnull(), 'BsmtUnfSF'] = all_data.BsmtUnfSF.median()
all_data.loc[all_data.BsmtQual=='NoBsmt', 'TotalBsmtSF'] = 0

# fireplace
all_data.loc[all_data.FireplaceQu.isnull(), 'FireplaceQu'] = 'NoFireplace'

# Garage type, finish, quality and condition
for garage_col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    all_data.loc[all_data[garage_col].isnull(), garage_col] = 'NoGarage'
# also one entry has garage Area and Cars null - use the typical value of
# detached garages (mean area, median car capacity)
all_data.loc[all_data['GarageArea'].isnull(), 'GarageArea'] = all_data.loc[all_data['GarageType']=='Detchd', 'GarageArea'].mean()
all_data.loc[all_data['GarageCars'].isnull(), 'GarageCars'] = all_data.loc[all_data['GarageType']=='Detchd', 'GarageCars'].median()

# bathrooms
all_data.loc[all_data.BsmtFullBath.isnull(), 'BsmtFullBath'] = 0
all_data.loc[all_data.BsmtHalfBath.isnull(), 'BsmtHalfBath'] = 0

# Zoning. Assume if null, is most common type
all_data.loc[all_data.MSZoning.isnull(), 'MSZoning'] = 'RL'

# Utilities
all_data.loc[all_data.Utilities.isnull(), 'Utilities'] = 'AllPub'
all_data.loc[all_data['Electrical'].isnull(), 'Electrical'] = 'SBrkr'

# exterior
all_data.loc[all_data.Exterior1st.isnull(), 'Exterior1st'] = 'VinylSd'
all_data.loc[all_data.Exterior2nd.isnull(), 'Exterior2nd'] = 'VinylSd'
all_data.loc[all_data.Functional.isnull(), 'Functional'] = 'Typ'

# sale condition and sale type
all_data.loc[all_data.SaleCondition.isnull(), 'SaleCondition'] = 'Normal'
# The mask must test SaleType itself: SaleCondition has just been filled, so
# masking on it would never select a row and SaleType would stay NaN.
all_data.loc[all_data.SaleType.isnull(), 'SaleType'] = 'WD'

# pool quality
all_data.loc[all_data['PoolQC'].isnull(), 'PoolQC'] = 'NoPool'

# fence
all_data.loc[all_data['Fence'].isnull(), 'Fence'] = 'NoFence'

# other features
all_data.loc[all_data['MiscFeature'].isnull(), 'MiscFeature'] = 'None'


Convert quality-based features, and other features with a clear ordering, to numeric values.

In [204]:
# Encode ordered categorical columns as integers (higher = better, except
# 'Functional', where higher = more severe deductions).

# Shared Ex/Gd/TA/Fa/Po quality scale used by several columns.
QUALITY_SCALE = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}

all_data = all_data.replace({
    'Utilities': {
        'AllPub': 1,
        'NoSeWa': 0,
        'NoSewr': 0,
        'ELO': 0
    },
    'Street': {
        'Pave': 1,
        'Grvl': 0
    },
    # Quality scale plus an explicit 0 for the "feature absent" label that
    # the missing-value step assigned.
    'FireplaceQu': dict(QUALITY_SCALE, NoFireplace=0),
    'Fence': {
        'GdPrv': 2,
        'GdWo': 2,
        'MnPrv': 1,
        'MnWW': 1,
        'NoFence': 0
    },
    'ExterQual': QUALITY_SCALE,
    'ExterCond': QUALITY_SCALE,
    'BsmtQual': dict(QUALITY_SCALE, NoBsmt=0),
    # BsmtExposure is NOT graded on the Ex..Po scale: its levels are
    # Gd / Av / Mn / No (good, average, minimum, no exposure). Mapping it
    # with the quality scale left Av/Mn/No as strings in the column.
    'BsmtExposure': {
        'Gd': 4,
        'Av': 3,
        'Mn': 2,
        'No': 1,
        'NoBsmt': 0
    },
    'BsmtCond': dict(QUALITY_SCALE, NoBsmt=0),
    'GarageQual': dict(QUALITY_SCALE, NoGarage=0),
    'KitchenQual': QUALITY_SCALE,
    'Functional': {
        'Typ': 0,
        'Min2': 1,
        'Min1': 1,
        'Mod': 2,
        'Maj1': 3,
        'Maj2': 4,
        'Sev': 5,
        'Sal': 6
    }
})

In [205]:
# Heating QC: map the quality grades Po < Fa < TA < Gd < Ex onto 1..5

all_data = all_data.replace(
    {'HeatingQC': dict(zip(['Po', 'Fa', 'TA', 'Gd', 'Ex'], range(1, 6)))}
)

In [206]:
# Binary flags, encoded in a single column-scoped replace:
#   central air - 'Y' -> 1, 'N' -> 0
#   paved drive - 'Y' -> 1, 'P' and 'N' -> 0
all_data = all_data.replace({
    'CentralAir': {'Y': 1, 'N': 0},
    'PavedDrive': {'Y': 1, 'P': 0, 'N': 0},
})

In [207]:
# Print dtypes and non-null counts per column to check the fills and
# encodings applied so far.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 79 columns):
MSSubClass       2919 non-null int64
MSZoning         2919 non-null object
LotFrontage      2919 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null int64
Alley            2919 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2919 non-null int64
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2919 non-null object
Exterior2nd      2919 non-

In [208]:
# Collapse Neighborhood to a binary flag: 1 for the "good" neighborhoods,
# 0 for all the others.
# good neighborhoods: 'NridgHt', 'Crawfor', 'StoneBr',
# 'Somerst', 'NoRidge'

all_data = all_data.replace({'Neighborhood': {
           'Blmngtn':0, 'Blueste':0, 'BrDale':0,
            'BrkSide':0, 'ClearCr':0, 'CollgCr':0,
            'Crawfor':1, 'Edwards':0, 'Gilbert':0,
            'IDOTRR': 0, 'MeadowV':0, 'Mitchel':0,
            # The dataset code is 'NAmes' (North Ames); the previous key
            # 'Names' never matched, so those rows stayed as strings.
            'NAmes':0, 'NoRidge':1, 'NPkVill':0,
            'NridgHt':1, 'NWAmes':0, 'OldTown':0,
            'SWISU':0, 'Sawyer':0, 'SawyerW':0,
            'Somerst':1, 'StoneBr':1, 'Timber':0,
            'Veenker':0 
        }})

In [209]:
# Collapse MasVnrType to a binary flag: 1 if the house has any masonry
# veneer, 0 otherwise.
all_data = all_data.replace({'MasVnrType': {
            'BrkCmn': 1,
            'BrkFace': 1,
            'CBlock': 1,
            'Stone': 1,
            'None': 0,
            # 'NoVnr' is the label the missing-value step writes for NaN
            # veneer types; without this key those rows stayed as strings.
            'NoVnr': 0
        }})

In [210]:
# Drop:
# MSZoning, Alley, LandContour, LandSlope, Condition1,
# Condition2, BldgType, HouseStyle, RoofStyle, 
# RoofMatl, Exterior1st, Exterior2nd, Foundation,
# BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, 
# Electrical, GarageType, GarageFinish, GarageCond,
# PoolQC, SaleType, SaleCondition


Reshape the data and prepare it for the model.

In [211]:
# Columns excluded from the model matrix (in addition to MSZoning).
to_drop = [
    'Alley', 'LandContour', 'LandSlope',
    'Condition1', 'Condition2', 'BldgType',
    'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'Foundation',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'Electrical', 'GarageType',
    'GarageFinish', 'GarageCond', 'PoolQC',
    'SaleType', 'SaleCondition', 'GarageYrBlt',
    'LotShape', 'LotConfig', 'Neighborhood',
    'MasVnrType', 'Fence', 'MiscFeature', 'MasVnrArea',
]

# Build X in one non-mutating step: drop MSZoning and everything in to_drop.
X = all_data.drop(['MSZoning'] + to_drop, axis=1)


In [212]:
# The first len(train) rows of X came from the training file, the rest from
# the test file (train was listed first when all_data was concatenated).
X_train, X_test = X.iloc[:len(train)], X.iloc[len(train):]

In [213]:
# Regression target: the sale price column of the training set.
y = train['SalePrice']

In [214]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score

In [215]:
# Summary statistics of the training features, as a sanity check before
# fitting.
X_train.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Utilities,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,...,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,76.644196,10516.828082,0.99589,0.999315,6.099315,5.575342,1971.267808,1984.865753,3.39589,...,0.917808,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,42.300571,31.27467,9981.264932,0.063996,0.026171,1.382997,1.112799,30.202904,20.645407,0.57428,...,0.274751,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,20.0,21.0,1300.0,0.0,0.0,1.0,1.0,1872.0,1950.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,60.0,7553.5,1.0,1.0,5.0,5.0,1954.0,1967.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,73.0,9478.5,1.0,1.0,6.0,5.0,1973.0,1994.0,3.0,...,1.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,90.0,11601.5,1.0,1.0,7.0,6.0,2000.0,2004.0,4.0,...,1.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,405.783193,215245.0,1.0,1.0,10.0,9.0,2010.0,2010.0,5.0,...,1.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [216]:
# Tune a random-forest regressor with an exhaustive grid search scored by
# 5-fold cross-validation on the training data.

# Fixed seed so the forest, and therefore the ranking of the grid
# candidates, is reproducible from one run to the next.
RANDOM_STATE = 42

# max_features = number of columns: every split considers all features.
forest = RandomForestRegressor(max_features=len(X_train.columns),
                               random_state=RANDOM_STATE)

parameter_grid = {
    'max_depth' : [None,1,3,9],
    # min_samples_split=1 was dropped: a node needs at least two samples to
    # be split, so it only duplicated the =2 candidates, and later
    # scikit-learn releases reject the value outright.
    'min_samples_split' : [2,3],
    'n_estimators' : [3,10,30,100,200,400]
}

# 5 folds over the training rows (legacy sklearn.cross_validation signature).
cross_validation = KFold(len(y),n_folds=5)

grid_search = GridSearchCV(
    forest,
    param_grid=parameter_grid,
    cv=cross_validation
)

grid_search.fit(X_train, y)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# NOTE: this is the score on the data the model was fitted on, so it is
# optimistic compared with the cross-validated best score above.
print(grid_search.score(X_train, y))

Best score: 0.859079083405
Best parameters: {'min_samples_split': 3, 'n_estimators': 200, 'max_depth': None}
0.979665606918


In [224]:
# Fill any KitchenQual values still missing in the test rows with the median
# of the (numeric-encoded) column over train + test.
# X_test is a slice of X, so setting a column on it triggers pandas'
# SettingWithCopyWarning; assign() builds a new frame instead.
X_test = X_test.assign(
    KitchenQual=X_test.KitchenQual.fillna(all_data.KitchenQual.median())
)

In [225]:
# Predict sale prices for the test rows with the refitted best model and
# write the submission file (Id, SalePrice).
output = grid_search.predict(X_test)
df_output = pd.DataFrame(
    {'Id': test['Id'], 'SalePrice': output},
    columns=['Id', 'SalePrice'],
)
df_output[['Id', 'SalePrice']].to_csv('results-v3.csv', index=False)