In [18]:
import numpy as np
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV ,Lasso
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
df1 = pd.read_pickle('df1.pkl')
df1.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,...,Stone,Wood,Attchd,Basment,BuiltIn,CarPort,Detchd,NoGarage,P,Y
0,65.0,8450,7,5,2003,196.0,4,3,4,3,...,0,0,1,0,0,0,0,0,0,1
1,80.0,9600,6,8,1976,0.0,3,3,4,3,...,0,0,1,0,0,0,0,0,0,1
2,68.0,11250,7,5,2001,162.0,4,3,4,3,...,0,0,1,0,0,0,0,0,0,1
3,60.0,9550,7,5,1915,0.0,3,3,3,4,...,0,0,0,0,0,0,1,0,0,1
4,84.0,14260,8,5,2000,350.0,4,3,4,3,...,0,0,1,0,0,0,0,0,0,1


In [3]:
# Defining training and test set
train = df1[df1['YrSold'] < 2010]
test = df1[df1['YrSold'] == 2010]

In [4]:
predictors = list(df1.columns)
predictors.remove('SalePrice')

X_train = train[predictors]
X_test = test[predictors]

y_train = train['SalePrice'].values
y_test = test['SalePrice'].values

In [5]:
baseline = df1['SalePrice'].mean()
baseline

180879.17484450588

In [6]:
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

(1275, 150) (1275,)
(172, 150) (172,)


In [10]:
# Using KBest to determine best features
from sklearn.feature_selection import SelectKBest, chi2, f_regression

# Build the selector — we'll build one with each score type.
skb_f = SelectKBest(f_regression, k=20)
skb_chi2 = SelectKBest(chi2, k=20)

# Train the selector on the data.
skb_f.fit(X_train, y_train)
skb_chi2.fit(X_train, y_train)

# Examine the results.
kbest = pd.DataFrame([predictors, list(skb_f.scores_), list(skb_chi2.scores_)], 
                     index=['feature','f_classif','chi2 score']).T.sort_values('f_classif', ascending=False)
kbest.head(10)

Unnamed: 0,feature,f_classif,chi2 score
2,OverallQual,2206.81,324.257
19,GrLivArea,1163.58,165602.0
6,ExterQual,1119.24,92.5591
25,GarageCars,941.435,284.72
26,GarageArea,886.652,86560.2
34,TotalBath,816.875,262.12
14,TotalBsmtSF,710.098,152982.0
8,BsmtQual,696.737,205.044
16,1stFlrSF,683.199,106890.0
22,TotRmsAbvGrd,492.848,317.92


In [11]:
ss = StandardScaler()
Xs = ss.fit_transform(X_train)
Xst = ss.fit_transform(X_test)

Based on Kbest, the following features 
**['Other','OverallQual','Neighbourhood', 'ExterQual', 'LotArea', 'LandContour','GrLivArea','BsmtQual','TotalBath']**

In [12]:
optimal_lasso = LassoCV(n_alphas=500, cv=10, verbose=1)
optimal_lasso.fit(Xs, y_train)

lasso = Lasso(alpha=optimal_lasso.alpha_)
lasso_scores = cross_val_score(lasso, Xs, y_train, cv=10)

print lasso_scores
print np.mean(lasso_scores)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

[0.89317874 0.8847524  0.88253239 0.78965891 0.87640269 0.86225131
 0.86427754 0.83002839 0.55949146 0.87833944]
0.8320913266581389


In [13]:
lasso.fit(Xs, y_train)
lasso_coefs = pd.DataFrame({'variable':X_train.columns,
                            'coef':lasso.coef_,
                            'abs_coef':np.abs(lasso.coef_)})

lasso_coefs.sort_values('abs_coef', inplace=True, ascending=False)

lasso_coefs.head(10)

Unnamed: 0,abs_coef,coef,variable
19,17291.535031,17291.535031,GrLivArea
2,15609.037282,15609.037282,OverallQual
108,11399.906081,11399.906081,NridgHt
25,7304.02483,7304.02483,GarageCars
34,7196.134316,7196.134316,TotalBath
114,6700.226254,6700.226254,StoneBr
107,6535.605067,6535.605067,NoRidge
6,6263.608148,6263.608148,ExterQual
22,5932.855615,5932.855615,TotRmsAbvGrd
10,5925.678606,5925.678606,BsmtExposure


In [14]:
# Experiment with Features
cols = ['OverallQual',
        'TotalBath',
        'TotRmsAbvGrd',
        'YearBuilt',
        'ExterQual', 
        'LotArea', 
        'GrLivArea',
        'BsmtQual', 
        'GarageCars',
        'CollgCr','OldTown','NAmes','SawyerW','Sawyer','Edwards','Somerst','Gilbert','NridgHt','NWAmes','BrkSide',
        'Crawfor','Mitchel','NoRidge','Timber','IDOTRR','ClearCr','SWISU','StoneBr','MeadowV','BrDale',
        'Veenker','NPkVill','Blueste',]

X_train = train[cols]
X_test = test[cols]

In [15]:
ss = StandardScaler()
Xs = ss.fit_transform(X_train)
Xst = ss.fit_transform(X_test)

In [16]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

lr = linear_model.LinearRegression()
model = lr.fit(Xs, y_train)

print('Training accuracy:', lr.score(Xs, y_train))
print('Test accuracy:', lr.score(Xst, y_test))

('Training accuracy:', 0.8242055440648433)
('Test accuracy:', 0.8257919378567656)


The above features explain approximately 83% of the variance in our target variable