# Kaggle house price prediction problem

Description of the data:
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

1. Load data_combined_cleaned.csv 
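2. Filter out all records where the target column is NaN (the column is named `SalesPrice` in the cleaned file; the original Kaggle data calls it `SalePrice`)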
3. Perform one hot encoding on full data set
4. Remove Id column from dataset
5. Divide the data into training and test datasets, use test size = 0.3 and random state = 1
6. Create pipeline to scale the data and fit model
7. Find r2 score based on training data and testing data

The cleaned dataset is available at the link below:
https://github.com/abulbasar/data/tree/master/kaggle-houseprice


In [3]:
import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt

In [17]:
# Load the cleaned, combined house-price data.
# NOTE: this is an absolute local path; adjust it to wherever the CSV lives.
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv")

# Keep only the rows that have a target value, then discard the Id column.
df = df[df.SalesPrice.notnull()]
df = df.drop("Id", axis=1)

# Separate the target (y) from the predictors (X).
y = df.SalesPrice
X = df.drop("SalesPrice", axis=1)

# One-hot encode the predictors; drop_first=True leaves out the first level
# of every encoded column.
X_dummy = pd.get_dummies(X, drop_first=True)
X_dummy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Columns: 258 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(11), int64(25), uint8(222)
memory usage: 738.6 KB


In [20]:
%%time 

# Hold out 30% of the rows for testing.
# NOTE(review): the exercise text at the top of this notebook asks for
# random_state = 1, whereas this cell uses 1230 -- confirm which is intended.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1230
)

# Pipeline: polynomial expansion (degree 1, no bias column) -> standard
# scaling -> Lasso regression.
pipeline_steps = [
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Lasso(alpha=450, tol=0.0001)),
]
pipe = pipeline.Pipeline(pipeline_steps)
pipe.fit(X_train, y_train)

# Report the pipeline's score on the training split and on the held-out split.
train_r2 = pipe.score(X_train, y_train)
test_r2 = pipe.score(X_test, y_test)
print("train R2", train_r2, "test R2:", test_r2)


train R2 0.918867817322 test R2: 0.838123384039
CPU times: user 74.3 ms, sys: 5.24 ms, total: 79.5 ms
Wall time: 78.4 ms


In [23]:
# 5-fold cross-validation of the Lasso pipeline on the full encoded data set.
scores = model_selection.cross_val_score(
    estimator=pipe, X=X_dummy, y=y, cv=5, verbose=True
)

# Average score over the five folds.
scores.mean()

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s finished


0.82456069007898147

In [29]:
def _scaled_pipeline(estimator):
    """Return a poly(degree=1, no bias) -> StandardScaler -> `estimator` pipeline."""
    return pipeline.Pipeline([
        ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
        ("scaler", preprocessing.StandardScaler()),
        ("est", estimator),
    ])


# Two candidate models that differ only in the final estimator.
pipe1 = _scaled_pipeline(linear_model.Lasso(alpha=450, tol=0.0001))
pipe2 = _scaled_pipeline(linear_model.Ridge(alpha=40, tol=0.0001))

# 5-fold cross-validation for each candidate, Lasso first and then Ridge.
scores1, scores2 = [
    model_selection.cross_val_score(candidate, X_dummy, y, cv=5, verbose=True)
    for candidate in (pipe1, pipe2)
]

# Mean cross-validated score of each candidate.
np.mean(scores1), np.mean(scores2)

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


(0.82456069007898147, 0.8179883345147877)

In [42]:
%%time
# Search grid for the Lasso step ("est") of the pipeline:
# 5 alpha values x 10 tolerances spaced evenly in log10 between 1e-2 and 1e-3.
param_grid = {
    "est__alpha": [500, 700, 850, 800, 970],
    "est__tol": np.power(10.0, np.linspace(-2, -3, 10)),
}

# Exhaustive 5-fold cross-validated search over the grid, using the full
# encoded data set.
gs = model_selection.GridSearchCV(
    estimator=pipe1, param_grid=param_grid, cv=5, verbose=True
)
gs.fit(X_dummy, y)
print("Best param", gs.best_params_, "best score: ", gs.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best param {'est__alpha': 970, 'est__tol': 0.0016681005372000592} best score:  0.832396031775
CPU times: user 8.9 s, sys: 391 ms, total: 9.29 s
Wall time: 9.29 s


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    9.2s finished


In [43]:
# Display the best pipeline selected by the grid search.
gs.best_estimator_

Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', Lasso(alpha=970, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0016681005372000592, warm_start=False))])

In [34]:
# Display the best parameter combination found by the grid search.
# NOTE(review): this cell's saved execution count (34) precedes the grid-search
# cell (42), and its saved output lists est__alpha = 1000, a value that is not
# in the current param_grid -- the output is stale; re-run after the search.
gs.best_params_

{'est__alpha': 1000}

In [35]:
# Display the best mean cross-validated score found by the grid search.
# NOTE(review): this cell's saved execution count (35) precedes the grid-search
# cell (42), and its saved value differs from the score printed by that cell --
# the output is stale; re-run after the search.
gs.best_score_

0.83233698641556486

In [44]:
import pickle

In [47]:
# Persist the best pipeline from the grid search to disk with pickle.
with open("house_price_v1.obj", mode="wb") as model_file:
    pickle.dump(gs.best_estimator_, model_file)

In [58]:
!ls -lh

total 1384
-rw-r--r--  1 abulbasar  staff   166K Jan 30 11:35 D1 - 01 pandas dataframe.ipynb
-rw-r--r--  1 abulbasar  staff    33K Jan 30 12:06 D1 - 02 top 10 movies (dataframe ops).ipynb
-rw-r--r--  1 abulbasar  staff   185K Jan 30 13:04 D1 - 03 visualization.ipynb
-rw-r--r--  1 abulbasar  staff    50K Jan 30 15:44 D1 - 04 Regression (Insurance).ipynb
-rw-r--r--  1 abulbasar  staff    46K Jan 30 17:26 D1 - 05 Exercise 2.ipynb
-rw-r--r--  1 abulbasar  staff   166K Jan 31 12:56 Day 2 - 01 Exercise - Kaggle house price prediction .ipynb
-rw-r--r--  1 abulbasar  staff    14K Jan 31 15:04 Day 2 - 02 Cross validation and hyper parameter tuning.ipynb
-rw-r--r--  1 abulbasar  staff     8B Jan 30 11:21 README.md
-rw-r--r--  1 abulbasar  staff   9.3K Jan 31 14:54 house_price_v1.obj


In [59]:
# Read the pickled pipeline back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open("house_price_v1.obj", mode="rb") as model_file:
    tuned_model = pickle.load(model_file)

In [60]:
# Display the pipeline that was loaded back from the pickle file.
tuned_model

Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', Lasso(alpha=970, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0016681005372000592, warm_start=False))])

In [61]:
# Score the reloaded pipeline on rows 4..9 (positional).
# These rows were part of the data passed to gs.fit, so this is a sanity check
# of the reloaded model rather than a held-out evaluation.
tuned_model.score(X_dummy.iloc[4:10], y.iloc[4:10])

0.77548860256080387

In [62]:
# Predict with the reloaded pipeline for rows 4..9 (positional).
tuned_model.predict(X_dummy.iloc[4:10])

array([ 306033.81826596,  155502.64014781,  265794.52714223,
        217437.95058527,  158230.0905231 ,  101514.50249275])

In [63]:
# Show the encoded feature row at position 4 as a one-row DataFrame.
X_dummy.iloc[[4]]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,0,1,0,0,0,1,0


In [70]:
# List the (name, transformer/estimator) pairs that make up the pipeline.
tuned_model.steps

[('poly',
  PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)),
 ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('est', Lasso(alpha=970, copy_X=True, fit_intercept=True, max_iter=1000,
     normalize=False, positive=False, precompute=False, random_state=None,
     selection='cyclic', tol=0.0016681005372000592, warm_start=False))]

In [69]:
# Fetch the final estimator by its step name rather than by position, then
# show its fitted intercept and coefficient vector.
est = tuned_model.named_steps["est"]
(est.intercept_, est.coef_)

(180921.19589041092,
 array([ -5859.95986016,     -0.        ,   2875.08512974,  16086.12635721,
          4722.3814408 ,   6883.23672425,   2643.36406149,   3062.12986409,
          3632.27127573,      0.        ,     -0.        ,   3183.98606473,
             0.        ,      0.        ,   -660.69927526,  24095.94070835,
          2485.42855571,     -0.        ,   1634.35865695,      0.        ,
          -414.52443814,  -1759.68138661,   2285.04168091,   2344.37761672,
             0.        ,   6569.51411885,    292.07859036,   2029.34263759,
            53.59227509,     -0.        ,      0.        ,   1423.09355048,
          4421.09213111,     -0.        ,     -0.        ,      0.        ,
             0.        ,     -0.        ,      0.        ,  -1269.27466836,
           479.68799885,      0.        ,     -0.        ,    647.79665395,
         -1072.09048489,     -0.        ,    385.66179679,     -0.        ,
           182.55103405,   2164.62706698,     -0.        ,     -0. 