In [1]:
import pandas as pd
import numpy as np 
import pickle
from sklearn.preprocessing import PolynomialFeatures
# Let pandas display up to 300 columns before truncating wide frames.
pd.options.display.max_columns = 300

## Step 1: Read in holdout data, scalers, and best model

In [2]:
# Load the hold-out features; the first CSV column becomes the frame's index.
holdout_csv = './csvs/kc_house_data_test_features.csv'
df = pd.read_csv(holdout_csv, index_col=0)

In [3]:
# Load the fitted model from disk (no separate scaler is loaded in this notebook).
# NOTE(review): unpickling can execute arbitrary code, so model.pickle must come
# from a trusted source.
# The `with` block closes the file even if unpickling raises.
with open("model.pickle", 'rb') as infile:
    model = pickle.load(infile)

## Step 2: Feature Engineering for holdout set

Remember that we must apply the same transformations to the holdout data (feature engineering, handling of extreme values, and scaling) that we applied to the original data.

In [4]:
# Recreate the engineered features on the hold-out frame.
# NOTE(review): `age` is measured from 2021 while `renovation_age` is measured
# from 2020. Both constants are kept exactly as written because they presumably
# mirror the training notebook -- confirm before changing either one.
df['age'] = 2021 - df['yr_built']
df['renovated'] = np.where(df['yr_renovated'] > 0, 1, 0)
df['basement'] = np.where(df['sqft_basement'] > 0, 1, 0)

# Years since renovation; rows whose yr_renovated is not positive get 0.
df["renovation_age"] = np.where(df["yr_renovated"] > 0, 2020 - df["yr_renovated"], 0)

# `date` is sliced as text: characters 0-3 are read as the year, 4-5 as the month.
df["sale_year"] = df["date"].str[:4].astype(int)
sale_month = df["date"].str[4:6].astype(int)
# Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4: the same values the previous
# `// 3.1 + 1` expression produced, now computed with exact integer arithmetic.
df["sale_quarter"] = (sale_month - 1) // 3 + 1

In [5]:
def cap_ba_bd(row):
    """Return a copy of ``row`` with extreme bedroom/bathroom counts replaced.

    Rules (applied to the incoming values):
      * bedrooms > 11      -> 3   (presumably treated as a data-entry error -- confirm)
      * bedrooms 10 or 11  -> 10
      * bathrooms < 1      -> 1

    Parameters
    ----------
    row : mapping-like with ``'bedrooms'`` and ``'bathrooms'`` keys and a
        ``.copy()`` method (e.g. a pandas Series handed over by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    A copy of ``row`` with the adjustments applied; the argument itself is
    left untouched, because mutating the object passed in by
    ``DataFrame.apply`` is not supported by pandas.
    """
    capped = row.copy()

    bedrooms = capped['bedrooms']
    if bedrooms > 11:
        capped['bedrooms'] = 3
    elif bedrooms > 9:
        capped['bedrooms'] = 10

    if capped['bathrooms'] < 1:
        capped['bathrooms'] = 1

    return capped

In [6]:
# Run the bedroom/bathroom capping once per row (axis='columns' is the same as axis=1).
df = df.apply(cap_ba_bd, axis='columns')

In [7]:
# One indicator column per distinct zipcode found in this frame, joined back on the index.
# NOTE(review): the set of dummy columns depends on which zip codes appear in this
# file; presumably it has to line up with the columns the model was trained on -- confirm.
ziplist = df["zipcode"]
zip_dummies = pd.get_dummies(ziplist)
df = df.merge(zip_dummies, left_index=True, right_index=True)

In [8]:
# Remove the columns that are not passed on to the polynomial expansion.
df_features = df.drop(columns=['id', 'date', 'lat', 'long'])

In [9]:
# Expand the features into all polynomial terms up to degree 2, without a bias column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(df_features)
poly_data = poly.transform(df_features)
# The frame is built without column names, so its columns get default integer labels.
df_poly = pd.DataFrame(poly_data)

## Step 3: Predict the holdout set

In [11]:
# Score the engineered hold-out features with the loaded model.
final_answers = model.predict(df_poly)
# Display the number of predictions together with the predictions themselves.
(len(final_answers), final_answers)

(4322,
 array([488900.00071776, 508272.57873976, 388910.15165251, ...,
        340447.20614046, 456736.57102066, 306834.07781786]))

In [12]:
# Wrap the predictions in a DataFrame so they can be written out as CSV.
df_final = pd.DataFrame(data=final_answers)

## Step 4: Export your predictions

In [13]:
# Write the predictions to disk; the frame's index is written as well (pandas default).
df_final.to_csv(path_or_buf='housing_preds_David_Shirley.csv')