In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn import * 

In [16]:
# Read the house-price CSV, keep only the rows whose SalesPrice is present,
# and drop the "Id" column before printing a summary of what is left.
DATA_PATH = "/data/kaggle/house-prices/data_combined_cleaned.csv"

df = (
    pd.read_csv(DATA_PATH)
    .loc[lambda frame: ~frame.SalesPrice.isna()]
    .drop(columns=["Id"])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 79 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1460 no

In [9]:
# Baseline: ordinary least squares on degree-2 polynomial features of the
# standardised, dummy-encoded predictors. The target is log(SalesPrice).
target = "SalesPrice"
X = df.drop(columns=[target])

X_dummy = pd.get_dummies(X)
y = np.log(df[target])

# 70/30 split with a fixed seed so the reported numbers are repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1
)

pipe = pipeline.Pipeline(steps=[
    ("scaler", preprocessing.StandardScaler()),
    ("poly", preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ("est", linear_model.LinearRegression()),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Report RMSE and R^2 for the training split first, then the test split.
for rmse_label, r2_label, actual, predicted in [
    ("Train rmse", "training r2_score", y_train, y_train_pred),
    ("Test rmse", "test r2_score", y_test, y_test_pred),
]:
    rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    r2_score = metrics.r2_score(actual, predicted)
    print(rmse_label, rmse, r2_label, r2_score)

Train rmse 4.052469544908295e-15 training r2_score 1.0
Test rmse 0.21473830430950336 test r2_score 0.7560047449535685


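Observation: training and test performance are significantly different — the model fits the training data perfectly (R² = 1.0) but scores only about 0.76 on the test data, i.e. it is overfitting. I want to regularize the model to find the optimum performance: training and test performance should be close.  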

In [19]:
# Regularised model: same preprocessing as before, but with a Lasso estimator
# (alpha=0.1). The polynomial step stays in the pipeline at degree 1 so that
# the later grid search can vary poly__degree.
target = "SalesPrice"
X = df.drop(columns=[target])

X_dummy = pd.get_dummies(X)
y = np.log(df[target])

# 70/30 split with a fixed seed so the reported numbers are repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1
)

pipe = pipeline.Pipeline(steps=[
    ("scaler", preprocessing.StandardScaler()),
    ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
    ("est", linear_model.Lasso(alpha=0.1)),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Report RMSE and R^2 for the training split first, then the test split.
for rmse_label, r2_label, actual, predicted in [
    ("Train rmse", "training r2_score", y_train, y_train_pred),
    ("Test rmse", "test r2_score", y_test, y_test_pred),
]:
    rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    r2_score = metrics.r2_score(actual, predicted)
    print(rmse_label, rmse, r2_label, r2_score)

Train rmse 0.22021427332560542 training r2_score 0.6690890368374893
Test rmse 0.2489567011639626 test r2_score 0.6720482268615199


In [23]:
%%time
param_grid = {
    "poly__degree": [1, 2], 
    "est__alpha": 10 ** np.linspace(-2, 2, 10)
}

grid = model_selection.GridSearchCV(pipe, param_grid, cv = 5, n_jobs = 4,                           
        verbose = True, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: user 4min 17s, sys: 33.2 s, total: 4min 50s
Wall time: 4min 49s


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  4.8min finished


In [24]:
# Show the pipeline that the grid search selected as best.
grid.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)), ('est', Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [28]:
# The search was scored with "neg_mean_squared_error", so the best score is a
# negated MSE: flip the sign and take the square root to read it as an RMSE.
np.sqrt(-grid.best_score_)

0.15497457122505764

In [26]:
# Hyper-parameter combination (alpha and polynomial degree) chosen by the search.
grid.best_params_

{'est__alpha': 0.01, 'poly__degree': 1}

In [27]:
# Exponents behind the alpha grid searched above: each alpha is 10 ** exponent.
np.linspace(-2, 2, 10)

array([-2.        , -1.55555556, -1.11111111, -0.66666667, -0.22222222,
        0.22222222,  0.66666667,  1.11111111,  1.55555556,  2.        ])

In [29]:
# Hold-out check: RMSE of the selected pipeline on the test split.
best_test_pred = grid.best_estimator_.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, best_test_pred))
print("RMSE of best model against testing dataset", rmse)

RMSE of best model against testing dataset 0.1385713724165416


In [32]:
%%time
param_grid = {
    "poly__degree": [1], 
    "est__alpha": 10 ** np.linspace(-4, -1, 5)
}

grid = model_selection.GridSearchCV(pipe, param_grid, cv = 5,                           
        verbose = True, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
rmse = np.sqrt(metrics.mean_squared_error(y_test, 
                grid.best_estimator_.predict(X_test)))
print("RMSE of best model against testing dataset", rmse)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
RMSE of best model against testing dataset 0.1303049969091853
CPU times: user 1.65 s, sys: 50.8 ms, total: 1.7 s
Wall time: 1.71 s


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    1.7s finished


In [33]:
import pickle

In [34]:
# Persist the best pipeline found by the grid search so it can be reloaded
# later without refitting. pickle.dump writes straight into the open file
# instead of first building the whole payload in memory with pickle.dumps.
with open("lasso.model", "wb") as f:
    pickle.dump(grid.best_estimator_, f)

In [36]:
# List the working directory to confirm that lasso.model was written.
!ls -l

total 1928
-rw-r--r--  1 abulbasar  staff    7520 Feb 19 11:30 Day 1 - 01 python basic.ipynb
-rw-r--r--  1 abulbasar  staff   87140 Feb 19 12:39 Day 1 - 02 Pandas dataframe.ipynb
-rw-r--r--  1 abulbasar  staff   15578 Feb 19 18:48 Day 1 - 03 Exercise 1 top 10 movies.ipynb
-rw-r--r--  1 abulbasar  staff  151484 Feb 19 15:09 Day 1 - 04 Visualization (insurance).ipynb
-rw-r--r--  1 abulbasar  staff  165850 Feb 20 12:47 Day 1 - 05 Regression Insurance price.ipynb
-rw-r--r--  1 abulbasar  staff  521240 Feb 20 14:53 Day 2 - 01 Regression Power Plant.ipynb
-rw-r--r--  1 abulbasar  staff   13053 Feb 20 17:09 Day 2 - Regression (kaggle house price).ipynb
-rw-r--r--  1 abulbasar  staff   10834 Feb 20 17:10 lasso.model


In [37]:
# Reload the pipeline that was pickled to lasso.model above.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files from a trusted source (this one was written by this notebook).
with open("lasso.model", "rb") as f:
    est = pickle.load(f)

In [38]:
# Display the reloaded pipeline to check its steps and parameters.
est

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=False)), ('est', Lasso(alpha=0.0031622776601683794, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False))])

In [39]:
# Sanity check: score the reloaded pipeline on the test split so the RMSE can
# be compared with the one printed before the model was saved.
reloaded_test_pred = est.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, reloaded_test_pred))
print("RMSE of best model against testing dataset", rmse)

RMSE of best model against testing dataset 0.1303049969091853


In [41]:
# Score every dummy-encoded feature against the log target. The two returned
# arrays are kept as F statistics and p-values, one entry per column of
# X_dummy (see scikit-learn's f_regression documentation for the exact test).
f_stats, p_vals = feature_selection.f_regression(X_dummy, y)

In [43]:
# One row per dummy-encoded feature, ordered by ascending p-value.
pvalue_table = pd.DataFrame({"feature": X_dummy.columns, "pvalue": p_vals})
pvalue_table.sort_values(by="pvalue")

Unnamed: 0,feature,pvalue
3,OverallQual,0.000000e+00
15,GrLivArea,3.060209e-216
25,GarageCars,3.093756e-199
26,GarageArea,1.106255e-176
11,TotalBsmtSF,7.534551e-151
12,1stFlrSF,1.074698e-141
169,ExterQual_TA,4.749723e-141
18,FullBath,2.118958e-140
5,YearBuilt,1.103567e-135
24,GarageYrBlt,7.405437e-126


The closer a p-value is to 0, the better (the more significant the feature). You can exclude features that have a p-value > 0.05.