Exercise: Build a prediction model for EP based on all features. Find R2 and MSE score on test data and training data. 

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, preprocessing, metrics, pipeline
%matplotlib inline

In [6]:
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv")
del df["Id"]
df = df[~np.isnan(df["SalesPrice"])]
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalesPrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,,,,0,2,2008,WD,Normal,208500.0
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,,,,0,5,2007,WD,Normal,181500.0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,,,,0,9,2008,WD,Normal,223500.0
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,,,,0,12,2008,WD,Normal,250000.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 79 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1460 no

In [8]:
target = "SalesPrice"
X = df.copy()
del X[target]
y = np.log(df[target])
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                    test_size = 0.3, random_state = 123) 

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Lasso(alpha=0.01))
])

pipe.fit(X_train, y_train)
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("training mse:", metrics.mean_squared_error(y_train, y_train_pred), 
      "\ntesting mse:", metrics.mean_squared_error(y_test, y_test_pred), 
      "\ntraining r2: ", pipe.score(X_train, y_train),
      "\ntesting r2: ", pipe.score(X_test, y_test))

training mse: 0.018098827677061335 
testing mse: 0.015542696024803503 
training r2:  0.8906252307539919 
testing r2:  0.892702031118294


In [12]:
scores = model_selection.cross_val_score(pipe, X_train, y_train, cv = 5)
print(scores, "cv score: ", np.mean(scores))

[0.83668219 0.62674451 0.88160527 0.89229946 0.69965229] cv score:  0.7873967419668055


In [21]:
pipe_lasso = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Lasso(max_iter=5000))
])

pipe_ridge = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Ridge(max_iter=5000))
])

# When we compare two models, we should compare the CV scores, 
# because that is more reliable than invidual scores.
scores1 = model_selection.cross_val_score(pipe_lasso, X_train, y_train, cv = 5)
scores2 = model_selection.cross_val_score(pipe_ridge, X_train, y_train, cv = 5)
print("Lasso:", np.mean(scores), "Ridge: ", np.mean(scores2))

Lasso: 0.7873967419668055 Ridge:  0.7218628926956134


In [31]:
param_grid = {
    "est__alpha": np.linspace(0.019, 0.021, 10)
}

gs_lasso = model_selection.GridSearchCV(pipe_lasso, param_grid=param_grid, 
                                    cv=5, verbose = True)
gs_lasso.fit(X_train, y_train)
print("Best score", gs_lasso.best_score_, "best params: ", gs_lasso.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best score 0.7949303491571532 best params:  {'est__alpha': 0.019444444444444445}


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.3s finished


In [27]:
gs_lasso.best_params_

{'est__alpha': 0.021544346900318832}

In [24]:
gs_lasso.best_score_

0.7947708147981087

In [25]:
gs_lasso.best_estimator_

Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', Lasso(alpha=0.021544346900318832, copy_X=True, fit_intercept=True,
   max_iter=5000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False))])

In [16]:
np.linspace(-3, -1, 10)

array([-3.        , -2.77777778, -2.55555556, -2.33333333, -2.11111111,
       -1.88888889, -1.66666667, -1.44444444, -1.22222222, -1.        ])

In [17]:
10 ** np.linspace(-3, -1, 10)

array([0.001     , 0.0016681 , 0.00278256, 0.00464159, 0.00774264,
       0.0129155 , 0.02154435, 0.03593814, 0.05994843, 0.1       ])

In [38]:
param_grid = {
    "est__alpha": np.linspace(900, 950, 20)
}

gs_ridge = model_selection.GridSearchCV(pipe_ridge, param_grid=param_grid, 
                                    cv=5, verbose = True)
gs_ridge.fit(X_train, y_train)
print("Best score", gs_lasso.best_score_, "best params: ", gs_ridge.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best score 0.7949303491571532 best params:  {'est__alpha': 928.9473684210526}


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.5s finished
