In [195]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
sns.set(style="whitegrid")
import pickle
from sklearn.feature_selection import RFECV

## Step 1: Read in hold out data, scalers, and best model

In [196]:
#holdout = pd.read_csv('resources/movies_holdout_features.csv', index_col=0)
holdout = pd.read_csv('kc_house_data_test_features.csv', index_col=0)

In [197]:
# final_scaler = read_pickle(filename)
# final_model = read_pickle(filename)
file = open('model.pickle', 'rb')
final_model = pickle.load(file)

# final_scaler = open('scaler.pickle')

In [198]:
holdout['bedrooms']=holdout['bedrooms'].replace([33],3)
holdout.loc[holdout['bedrooms'] >= 10]
holdout.loc[holdout['bathrooms'] == 0] 
holdout['bathrooms'] = holdout['bathrooms'].replace([0],.25)

## Step 2: Feature Engineering for holdout set

In [199]:
features = ['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']

holdout_features = holdout[features]

In [200]:
holdout = holdout.drop(['sqft_living',], axis = 1)

In [201]:
holdout['multilevel'] = np.where(holdout['floors']> 1, 1, 0)
holdout['multilevel'].value_counts()
holdout['basement'] = np.where(holdout['sqft_basement']> 1, 1, 0)

In [202]:
holdout.replace({'sqft_lot': {1105: 1200}}, inplace=True)

In [203]:
holdout['outdoor_space'] = holdout['sqft_lot'] - (holdout['sqft_above']/holdout['floors'])
holdout['modern'] = np.where((holdout['yr_built']>= 2004)|(holdout['yr_renovated']>=2001), 1, 0)
holdout = holdout.drop(['id', 'date','sqft_lot', 'floors', 'sqft_basement', 'grade', 'sqft_lot15', 'sqft_living15'], axis=1)
holdout['outdoor_space^2'] = holdout['outdoor_space']*holdout['outdoor_space']
holdout['sqft_view'] = holdout['sqft_above']*holdout['view']
holdout['sqft_multilevel'] = holdout['sqft_above']*holdout['multilevel']
holdout['outdoor_modern'] = holdout['outdoor_space']*holdout['modern']
holdout['outdoorspace_waterfront'] = holdout['outdoor_space']*holdout['waterfront']
holdout = holdout.drop(['sqft_view','sqft_multilevel','outdoor_space^2'], axis=1)
holdout_dum = pd.get_dummies(holdout['zipcode'])
holdout = pd.concat([holdout, holdout_dum], axis=1)
holdout = holdout.drop(['outdoor_space', 'outdoor_modern', 'outdoorspace_waterfront'], axis=1)

In [204]:
holdout.shape

(4323, 84)

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [205]:
# transformed_holdout = final_scaler(holdout)
scaler = StandardScaler()
sc_features = pd.DataFrame(data=scaler.fit_transform(holdout), columns=holdout.columns)

## Step 3: Predict the holdout set

In [208]:
# final_answers = final_model.predict(transformed_holdout)
final_answers = final_model.predict(sc_features)
final_answers

array([ 2.81167599e+15,  2.81167599e+15, -7.96304685e+15, ...,
        2.45433837e+16, -1.56542271e+16,  2.45433837e+16])

## Step 4: Export your predictions

In [209]:
# final_answer.to_csv('housing_preds_your_name.csv')
pd.DataFrame(final_answers).to_csv('housing_preds_alanratliff.csv')