In [79]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
import pprint as pprint
from scipy import stats
import pickle as pkl
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import cross_val_score
%matplotlib inline

In [80]:
# Zip codes used for the (potential) filter down to the NW side.
zips_nw = [
    60611, 60610, 60654, 60642,
    60622, 60647, 60614, 60657,
    60639, 60641, 60630, 60618,
    60613, 60640, 60625, 60660,
    60626, 60659, 60645,
]

# Load the pickled listings table from the project's data directory.
with open("../data/iterate/luther_model_data_full.pkl", 'rb') as pkl_file:
    sale = pkl.load(pkl_file)

# Keep only the listings whose zipcode is in the NW-side list.
in_nw_zip = sale['zipcode'].isin(zips_nw)
sale = sale[in_nw_zip]

In [89]:
"""
build/filter/transform target and features
"""

model_params = ['price','bedrooms','bathrooms','area','median_income','duration_float','lot_size']#,'year_built']

sale = sale.dropna(subset = model_params)

# filter down to correlation parameters
model = sale[model_params]

#filter out outliers
model = model[(np.abs(stats.zscore(model)) < 3).all(axis=1)]

model['price']=model['price'].apply(np.log10)
model['area']=model['area'].apply(np.log10)

"""
set up train test split
"""
# make data for linear regression
y = model.pop('price').values
X = StandardScaler().fit_transform(model)

# first split out 20% of the data as a validation set
X_training, X_holdout, y_training, y_holdout = train_test_split(X, y, test_size=0.2)

# now split out another 20% for cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3333333)

# Build the initial regression model and score it with 10-fold cross
# validation on the training portion.

# Set up as a polynomial pipeline but with degree 1, so switching to a
# higher-order model later only requires changing `degree`.
degree = 1
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
lr = LinearRegression(fit_intercept=True)

# Default scoring for a regressor is the coefficient of determination (R^2).
scores_R = cross_val_score(est, X_training, y_training, cv=10)

# NOTE: despite its name this holds the *negated mean squared error* per fold
# (scikit-learn negates errors so that larger is always better). It is left
# unchanged so that anything reading it keeps working.
scores_RMSE = cross_val_score(est, X_training, y_training,
                              cv=10, scoring='neg_mean_squared_error')

# Actual per-fold RMSE, in the units of the target (log10 price).
rmse_per_fold = np.sqrt(-scores_RMSE)

print(model.shape)
print(np.mean(scores_R))
print(np.mean(rmse_per_fold))

(1195, 6)
0.7534914602398249


In [88]:
"""
reduce parameters with lasso
"""

# make model
lasso = Lasso()
alphas = np.logspace(-5,1,num=6)
params = {'alpha': alphas, 'fit_intercept': [True,False]}
grid = GridSearchCV(lasso,params, cv=10, scoring='neg_mean_absolute_error', n_jobs=1)
reduce_fit = make_pipeline(PolynomialFeatures(degree), grid)
reduce_fit.fit(X_training, y_training)
print(reduce_fit.named_steps['gridsearchcv'].best_params_)
print(reduce_fit.named_steps['gridsearchcv'].best_score_)
print(reduce_fit.named_steps['gridsearchcv'].best_estimator_.coef_[0:])

{'alpha': 0.0025118864315095794, 'fit_intercept': True}
-0.12160260764841631
[ 0.          0.06400664  0.12634851  0.08070712  0.06842368 -0.10670424
  0.01057356  0.0033557 ]


array([ 0.        ,  0.07436429,  0.13447344,  0.04799825,  0.07816168,
        0.00573184, -0.11188666,  0.01619059])

In [None]:
# Refit the baseline pipeline on the whole training portion, then show its
# score on the holdout set as the cell output. `fit` returns the pipeline
# itself, so `est_fit` and `est` refer to the same fitted object.
est_fit = est.fit(X_training, y_training)
est_fit.score(X_holdout, y_holdout)