In [1]:
import pandas as pd
pd.set_option('display.max_columns', 300)
import pickle
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

## Step 1: Read in hold out data, scalers, and best model

In [2]:
## Load the holdout feature set; the first CSV column is used as the index
df = pd.read_csv(filepath_or_buffer='kc_house_data_test_features.csv', index_col=0)

In [3]:
## Load the fitted final model from disk.
## The context manager closes the file handle even if unpickling raises.
## NOTE(review): pickle.load can execute arbitrary code -- only unpickle
## files that come from a trusted source.
with open("final_model.pickle", 'rb') as infile:
    lr_final = pickle.load(infile)

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [4]:
## Extract the sale year from the "date" column: take the first four
## characters of each value and convert them to an integer in one pass
df['year_sold'] = df['date'].map(lambda sale_date: int(sale_date[:4]))

In [5]:
## Age of each house when it sold: sale year minus the year it was built
df['y_old_sold'] = df['year_sold'].sub(df['yr_built'])
df['y_old_sold'].describe()

## A minimum age of -1 comes from a house that sold before construction was finished

count    4322.000000
mean       28.757982
std        30.555707
min        -1.000000
25%         5.000000
50%        12.000000
75%        50.000000
max       115.000000
Name: y_old_sold, dtype: float64

In [6]:
## Flag each home as renovated (1) or not (0) based on whether a renovation year is recorded
reno_y_n = np.where(df['yr_renovated'].gt(0), 1, 0)
df = df.assign(was_renovated=reno_y_n)

In [7]:
## Restrict to renovated homes (via "was_renovated") and compute how many
## years passed between the renovation and the sale
reno = df.loc[df['was_renovated'].eq(1)]

difference = reno['year_sold'].sub(reno['yr_renovated'])

difference

16       9
17      50
31       1
38      22
51       6
        ..
2332    55
2767     8
3154     6
3157     1
3672     8
Length: 139, dtype: int64

In [8]:
## Attach the years-since-renovation values. `difference` only has entries for
## renovated homes, so every other row comes through as NaN after alignment.
df = df.assign(yrs_since_reno = difference)

## Replace those NaNs with zeroes. The result is assigned back instead of
## calling fillna(..., inplace=True) on the selected column: that form is a
## chained assignment and does not update df under pandas copy-on-write.
df['yrs_since_reno'] = df['yrs_since_reno'].fillna(0)

## Ensuring there are no null values left in the new feature
assert df['yrs_since_reno'].isnull().sum() == 0

df['yrs_since_reno'].describe()

## A minimum of -1 means the sale year is one year earlier than the recorded renovation year

count    4322.000000
mean        0.567099
std         4.146223
min        -1.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        68.000000
Name: yrs_since_reno, dtype: float64

In [9]:
## Select the predictor columns for the holdout set (as an independent copy of df)
X = df.loc[:, [
    'was_renovated', 'waterfront', 'zipcode', 'y_old_sold',
    'yrs_since_reno', 'bedrooms', 'bathrooms', 'condition', 'grade',
]].copy()

In [10]:
## Display the selected feature frame
X

Unnamed: 0,was_renovated,waterfront,zipcode,y_old_sold,yrs_since_reno,bedrooms,bathrooms,condition,grade
0,0,0,98034,47,0.0,4,2.50,3,8
1,0,0,98034,48,0.0,4,2.50,3,8
2,0,0,98029,9,0.0,3,2.50,3,8
3,0,0,98077,38,0.0,3,1.75,3,8
4,0,0,98059,10,0.0,4,2.75,3,8
...,...,...,...,...,...,...,...,...,...
4318,0,0,98103,5,0.0,3,2.50,3,8
4319,0,0,98146,1,0.0,4,2.50,3,8
4320,0,0,98144,5,0.0,2,0.75,3,7
4321,0,0,98027,11,0.0,3,2.50,3,8


In [11]:
## Create list of categorical variables
cat_feats = ['condition','grade', 'zipcode']

## Create the OHE without "drop='first'" as it would throw an error in this case.
## scikit-learn 1.2 renamed `sparse` to `sparse_output` (and 1.4 removed `sparse`),
## so try the new keyword first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown = 'ignore', sparse=False)

## One-hot encode the categorical holdout columns.
## NOTE(review): this fits a brand-new encoder on the holdout features, so the
## dummy columns reflect only the categories present in this file -- confirm
## they line up with the columns the pickled model was trained on.
final_ohe = ohe.fit_transform(X[cat_feats])

In [12]:
## Getting feature names from our list of categories
feat_col_name = ohe.get_feature_names(cat_feats)

In [13]:
## Creating DF of final results
final_ohe_df1 = pd.DataFrame(final_ohe, columns=feat_col_name, index=X.index)

In [14]:
## Swap the raw categorical columns for their one-hot encoded counterparts
final_ohe_df = pd.concat([X.drop(columns=cat_feats), final_ohe_df1], axis='columns')

## Step 3: Predict the holdout set

In [15]:
## Rebuild the model input frame: non-categorical columns of X plus the dummy columns.
## NOTE(review): the cell just before the "Step 3" header builds the same frame,
## so this cell is redundant (but harmless to re-run).
final_ohe_df = pd.concat(objs=[X.drop(labels=cat_feats, axis=1), final_ohe_df1], axis=1)

In [16]:
## Collect the raw zipcode values from the original frame as a plain list
new_zips = [zip_code for zip_code in df['zipcode'].values]

In [17]:
## Add the raw zipcode back as its own column alongside the zipcode dummies
final_ohe_df = final_ohe_df.assign(zipcode=new_zips)

In [18]:
## Grab the column labels of the assembled feature frame
model_columns = final_ohe_df.columns

In [19]:
## Replace the column Index with its underlying array of labels
model_columns = model_columns.values

In [20]:
# model_columns.reshape(1, -1)

In [21]:
## Select every column named in model_columns from the feature frame
final_col = final_ohe_df.loc[:, model_columns]

In [22]:
## Reduce the selected frame back to just its column labels
final_col = final_col.columns.values

In [23]:
## Predict on the holdout features.
## The dummy columns above were produced by an encoder fit on the holdout data,
## so they can differ (in set or order) from the columns the model was trained
## on. When the model recorded its training column names, align to them first:
## dummy columns the holdout never produced are filled with 0, and columns the
## model never saw are dropped.
expected_cols = getattr(lr_final, 'feature_names_in_', None)
if expected_cols is None:
    ## Model carries no column names -- use the frame exactly as assembled
    holdout_features = final_ohe_df[final_col]
else:
    ## Only one-hot columns may safely be zero-filled; a missing engineered
    ## feature means an earlier step was skipped, so fail loudly instead.
    dummy_prefixes = tuple(feat + '_' for feat in cat_feats)
    missing_non_dummy = [
        col for col in expected_cols
        if col not in final_ohe_df.columns
        and not str(col).startswith(dummy_prefixes)
    ]
    if missing_non_dummy:
        raise KeyError(
            'Holdout frame is missing model features: {}'.format(missing_non_dummy)
        )
    holdout_features = final_ohe_df.reindex(columns=expected_cols, fill_value=0)

final_pred = lr_final.predict(holdout_features)

## Step 4: Export your predictions

In [24]:
## Wrap the predictions in a DataFrame and write them to the submission CSV
predictions_frame = pd.DataFrame(data=final_pred)
predictions_frame.to_csv(path_or_buf='housing_preds_ben_mccarty.csv')