In [35]:
# initially adopted from blog post by D. Ziganto (dziganto.github.io)

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
from scipy import stats
import pickle as pkl
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import cross_val_score
%matplotlib inline

In [36]:
with open("../data/iterate/luther_model_data_1.pkl", 'rb') as picklefile:
    sale = pkl.load(picklefile)
    
# potential zipcode filter to NW side
zips_nw = [60611, 60610, 60654, 60642,
           60622, 60647, 60614, 60657,
           60639, 60641, 60630, 60618,
           60613, 60640, 60625, 60660,
           60626, 60659, 60645]

sale_nw = sale[sale['zipcode'].isin(zips_nw)]

In [42]:
sale.columns

Index(['url', 'address', 'city_state', 'price', 'address', 'bedrooms',
       'bathrooms', 'area', 'year_built', 'duration_float', 'mean_income',
       'median_income', 'zipcode'],
      dtype='object')

In [43]:
"""
build/filter/transform target and features
"""

model_params = ['price','bedrooms','bathrooms','area','median_income','year_built'] #'duration_float',

# filter down to correlation parameters
model = sale_nw[model_params]

#filter out outliers
model = model[(np.abs(stats.zscore(model)) < 2).all(axis=1)]

model['price']=model['price'].apply(np.log10)
model['area']=model['area'].apply(np.log10)

# make data for linear regression
y = model.pop('price').values
X = StandardScaler().fit_transform(model)

In [44]:
# intermediate/test split (gives us test set)
X_intermediate, X_test, y_intermediate, y_test = train_test_split(X, 
                                                                  y,
                                                                  test_size=0.2, 
                                                                  random_state=15)

# train/validation split (gives us train and validation sets)
X_train, X_validation, y_train, y_validation = train_test_split(X_intermediate,
                                                                y_intermediate,
                                                                test_size=0.25,
                                                                random_state=2018)

In [45]:
def calc_train_error(X_train, y_train, model):
    '''returns in-sample error for already fit model.'''
    predictions = model.predict(X_train)
    mse = mean_squared_error(y_train, predictions)
    rmse = np.sqrt(mse)
    return mse
    
def calc_validation_error(X_test, y_test, model):
    '''returns out-of-sample error for already fit model.'''
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    return mse
    
def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''fits model and returns the RMSE for in-sample error and out-of-sample error'''
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

In [46]:
lr = LinearRegression(fit_intercept=True)

train_error, test_error = calc_metrics(X_train, y_train, X_validation, y_validation, lr)
train_error, test_error = round(train_error, 3), round(test_error, 3)

print('train error: {} | test error: {}'.format(train_error, test_error))
print('train/test: {}'.format(round(test_error/train_error, 1)))

train error: 0.027 | test error: 0.03
train/test: 1.1


In [47]:
# delete intermediate variables
del X_intermediate, y_intermediate

# print proportions
print('train: {}% | validation: {}% | test {}%'.format(round(len(y_train)/len(y),2),
                                                       round(len(y_validation)/len(y),2),
                                                       round(len(y_test)/len(y),2)))

train: 0.6% | validation: 0.2% | test 0.2%
