# Project plan
In order to achieve good results, the first thing should be a thorough investigation of features etc. My main goal however is to **familiarise myself with Kaggle submissions and practice using Scikit-Learn**. Thus, I'll just plow ahead with building the model with minimal time spent on data exploration and feature engineering.

## Data cleaning and preparation
* Calculate the NaN-rate for each feature
* Drop features with high NaN-rate from the training set
* Impute values as needed for remaining data
* Turn factor variables into something Python can deal with
* Split into Train/Test sets

## Model building
* Create dummy model
* Train initial model to compare against dummy model
* Test some other regression models quickly
* Perform a gridsearch
* Train model based on best parameters
* Predict on test set and make initial Kaggle submission

## Improvements
In order to have any chance of actually building a competitive model, there would at least be some further steps I would need to look into. Since this falls outside of the actual scope I'm aiming for, I'll just note these down.

* Proper data exploration, feature engineering and selection
* Looking at Kernels submitted by other people for inspiration

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training data, using the 'Id' column as the row index.
df_org = pd.read_csv('data/train.csv',  index_col='Id')

In [3]:
# Split off the target: `pop` returns the 'SalePrice' column and removes it
# from df_org in place, so `data` (the very same object) holds features only.
predictions = df_org.pop('SalePrice')
data = df_org

## Data cleaning and preparation

In [4]:
def get_nan_features(training_data, min_coverage=0.8):
    '''
    Get features with high NaN-rate from the training set

    A feature is flagged when its coverage (the fraction of rows holding a
    non-null value) is strictly below `min_coverage`. With the default this
    drops everything with less than 80% data coverage, to ensure imputing can
    make reasonable guesses. Call with the training set and reuse the result
    for the test set, so the two cannot end up dropping different features
    when their NaN patterns differ.

    @note: This step would make more sense after exploding out categorical
        features so they are not affected

    @param training_data: DataFrame with one column per feature
    @param min_coverage: minimum fraction (0..1) of non-null rows a feature
        needs in order to be kept
    @return: pandas Index with the labels of the features to drop
    '''
    # notnull().sum() counts the populated rows of every column.
    coverage = training_data.notnull().sum() / training_data.shape[0]

    return coverage[coverage < min_coverage].index

In [5]:
def transform_categorical_features(data):
    '''
    One-hot encode the factor variables: each category of each listed
    feature becomes its own binary column, named <feature>_<category>.

    Besides the obviously categorical features, the data description file
    shows two numerically coded features that are really factors and are
    therefore encoded as well:
    - MSSubClass
    - MoSold: Month Sold (MM)

    In hindsight; dropping NAs for factor variables might've been a mistake,
    as NA can be dropped (or kept) when changing to binary features.

    @param data: DataFrame containing every feature listed below
    @return: new DataFrame with the listed features replaced by their
        binary columns; all other columns are passed through
    '''
    # Whitespace-separated for readability; split() yields the same list of
    # names, in the same order, as spelling them out one by one.
    factor_variables = '''
        MSSubClass MSZoning Street
        LotShape LandContour
        Utilities LotConfig LandSlope
        Neighborhood Condition1 Condition2
        BldgType HouseStyle RoofStyle
        RoofMatl Exterior1st Exterior2nd
        MasVnrType ExterQual ExterCond
        Foundation BsmtQual BsmtCond
        BsmtExposure BsmtFinType1 BsmtFinType2
        Heating HeatingQC CentralAir
        Electrical KitchenQual Functional
        GarageType GarageFinish GarageQual
        GarageCond PavedDrive MoSold
        SaleType SaleCondition
    '''.split()

    encoded = pd.get_dummies(data, columns=factor_variables)
    return encoded

In [6]:
from sklearn.preprocessing import Imputer

def impute_missing_values(data):
    '''
    Impute values as needed for remaining data

    Every missing value is replaced with the median of its own column,
    calculated from the non-missing entries of `data` itself. This is done
    with pandas only, so the function does not depend on
    `sklearn.preprocessing.Imputer`, which is no longer available in current
    scikit-learn releases.

    @param data: DataFrame whose columns can all be converted to float
    @return: new DataFrame of floats with the same index and columns as
        `data`; `data` itself is not modified. A column without a single
        observed value has no median and therefore stays NaN.
    @raise ValueError: if a column holds values that cannot be converted
        to float
    '''
    # Cast first so the result is uniformly float, like an imputer's output.
    numeric = data.astype(float)

    # fillna with a Series fills each column with the value under its label.
    return numeric.fillna(numeric.median())

In [7]:
# pandas' answer to dplyr pipes is DataFrame.pipe: find the sparse features
# on the raw data, drop them, one-hot encode the factors, then impute.
nan_features = get_nan_features(data)

data = data.drop(nan_features, axis=1).pipe(transform_categorical_features)
imputed_df = data.pipe(impute_missing_values)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(imputed_df, predictions, test_size=0.3, random_state=1337)

## Model building

In [9]:
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
import matplotlib.pyplot as plt

# First real model: a linear regression with scikit-learn's default settings.
lr = linear_model.LinearRegression()

In [10]:
# Fit on the training split only; the held-out rows stay unseen.
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Score on the held-out split (for scikit-learn regressors score() is R^2).
lr.score(X_test, y_test)

0.86355355339558681

In [12]:
# Predicted (truncated to whole numbers) next to actual prices for the
# held-out rows, both on a fresh 0..n-1 index.
lr_result = pd.DataFrame({'predictions': [int(price) for price in lr.predict(X_test)]})
lr_result['sell_price'] = y_test.reset_index(drop=True)

In [13]:
# Absolute error per row, and that error as a percentage of the real price
# (stored as a two-decimal string, i.e. for display only).
lr_result['difference'] = (lr_result['predictions'] - lr_result['sell_price']).abs()
lr_result['difference_rel'] = (
    lr_result['difference']
    .mul(100)
    .div(lr_result['sell_price'])
    .map('{:.2f}'.format)
)
lr_result.head()

Unnamed: 0,predictions,sell_price,difference,difference_rel
0,260303,259500,803,0.31
1,333908,372500,38592,10.36
2,111911,129500,17589,13.58
3,68943,91000,22057,24.24
4,226516,171000,55516,32.47


In [14]:
# Decent predictions overall, but a few of them are clearly way off!

In [15]:
from sklearn.tree import DecisionTreeRegressor

# Define model; seeded like the other models in this notebook so the fitted
# tree, and with it the score below, is the same on every run
tree_model = DecisionTreeRegressor(random_state=1337)

# Fit model on the training split
tree_model.fit(X_train, y_train)

# Score on the held-out split
tree_model.score(X_test, y_test)

0.78179895476400629

In [16]:
# Same comparison table as for the linear model: predicted vs. actual price,
# the absolute error and the error in percent of the actual price (the
# percentage is stored as a two-decimal string, i.e. for display only).
tree_model_result = pd.DataFrame(
    {'predictions': [int(price) for price in tree_model.predict(X_test)]}
)
tree_model_result['sell_price'] = y_test.reset_index(drop=True)

tree_model_result['difference'] = (
    tree_model_result['predictions'] - tree_model_result['sell_price']
).abs()
tree_model_result['difference_rel'] = (
    tree_model_result['difference']
    .mul(100)
    .div(tree_model_result['sell_price'])
    .map('{:.2f}'.format)
)
tree_model_result.head()

Unnamed: 0,predictions,sell_price,difference,difference_rel
0,190000,259500,69500,26.78
1,369900,372500,2600,0.7
2,110000,129500,19500,15.06
3,85000,91000,6000,6.59
4,135000,171000,36000,21.05


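A very simple model thus already reaches an R² score of roughly 0.75–0.8 on the held-out data, i.e. it explains about three quarters of the variance in sale price.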

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Seeded random forest: build and fit in one go (fit hands back the
# estimator itself), then score on the held-out split.
regr_rf = RandomForestRegressor(random_state=1337).fit(X_train, y_train)
regr_rf.score(X_test, y_test)

0.87894955535998132

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

# Seeded gradient boosting (scikit-learn's implementation): build and fit in
# one go (fit hands back the estimator itself), then score on the held-out
# split.
regr_xgb = GradientBoostingRegressor(random_state=1337).fit(X_train, y_train)
regr_xgb.score(X_test, y_test)

0.88727931480523126

### Grid search optimisation
Random forest and Gradient Boosting seem to perform about equally well, and both already improve on the earlier models. Let's run a grid search for Gradient Boosting to tune its hyperparameters.

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Single-step pipeline; its step name is why every tuned parameter is
# addressed as 'predict__<parameter>' below.
pipe = Pipeline([('predict', GradientBoostingRegressor(random_state=1337))])

# 2 * 3 * 3 * 3 * 3 = 162 parameter combinations to try.
param_grid = dict(
    predict__loss=['ls', 'huber'],
    predict__learning_rate=[0.1, 0.05, 0.01],
    predict__n_estimators=[50, 100, 200],
    predict__max_depth=[3, 5, 7],
    predict__min_samples_split=[2, 3, 5],
)

# Search with 3-fold cross-validation on the training split, then score the
# search result on the held-out split.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)

0.90076513816581461

No simple wins here, it seems, when it comes to improving the results, although there is a small improvement. Let's take the settings of the best model so far, retrain a model with them on the full training data set, and then use that model to create predictions for the test set.

In [20]:
# Show the winning pipeline. Its parameter values are what the next cell
# passes to a fresh GradientBoostingRegressor that is fitted on the full
# training data (imputed_df, predictions).

grid.best_estimator_

Pipeline(steps=[('predict', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=1337, subsample=1.0, verbose=0,
             warm_start=False))])

In [21]:
# Gradient boosting with hand-picked settings, fitted on *all* labelled rows
# (the training and the held-out split together).
regr_xgb_tuned = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=3,
    random_state=1337,
)
regr_xgb_tuned.fit(imputed_df, predictions)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=1337, subsample=1.0, verbose=0,
             warm_start=False)

## Generate predictions for test set
As some categories differ between the training and test set, some columns are added/dropped to ensure the dimensions match

In [22]:
# Load the Kaggle test set and push it through the same preparation steps as
# the training data; the features to drop are the ones picked on the
# training set (nan_features).
test_df_org = pd.read_csv('data/test.csv', index_col='Id')

test_df = (
    test_df_org
    .drop(nan_features, axis=1)
    .pipe(transform_categorical_features)
    .pipe(impute_missing_values)
)

In [23]:
# Give the test frame exactly the training columns, in the training order:
# - binary columns for categories that never occur in the test data are
#   added and filled with 0
# - columns the training data does not have are dropped, as the model
#   cannot use them
# The order matters as much as the names: columns appended one by one end up
# at the far right, so the model would be handed features in a different
# order than the one it was fitted on.
test_df = test_df.reindex(columns=imputed_df.columns, fill_value=0)

In [24]:
# One whole-number 'SalePrice' per test row, indexed by the ids of the
# original test file.
test_prediction_result = pd.DataFrame(
    {'SalePrice': [int(price) for price in regr_xgb_tuned.predict(test_df)]},
    index=test_df_org.index,
)

In [25]:
# Write the submission file; the index (the ids taken from the test file)
# goes out as the first column, followed by 'SalePrice'.
test_prediction_result.to_csv('xgb_tuned_predictions.csv')