In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import norm 
import math
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
import statsmodels.api as sm
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
import fiona
from shapely.geometry import Point, Polygon
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import seaborn as sns
plt.style.use('seaborn')
sns.set(style="white")
pd.set_option('display.max_columns', 300)

## Step 1: Read in hold out data, scalers, and best model

In [2]:
import pickle

# scaler_file = open('scaler.pickle','rb')
# final_scaler = pickle.load(scaler_file)
model_file = open('model.pickle','rb')
final_model = pickle.load(model_file)
# scaler_file.close()
model_file.close()

In [3]:
ht= pd.read_csv('kc_house_data_test_features.csv', index_col=0)

In [4]:
zip_grade = pd.read_csv('Niche.csv')
ht = zip_grade.set_index('zipcode').join(ht.set_index('zipcode'))

In [5]:
list(ht)

['zip_rank',
 'niche_grade',
 'school_grade',
 'population',
 'id',
 'date',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [6]:
ht = ht.reset_index()

In [7]:
ht = ht.dropna(subset = ['id'])


In [8]:
list(ht)

['zipcode',
 'zip_rank',
 'niche_grade',
 'school_grade',
 'population',
 'id',
 'date',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [9]:
def cap_sqft(row):
    if row['sqft_lot'] > 77776:
        row['sqft_lot'] = 13126.23
    if row['sqft_living'] > 4669:
        row['sqft_living'] = 2069.47
    if row['sqft_above'] > 4157:
        row['sqft_above'] = 1780
    if row['sqft_basement'] > 1572:
        row['sqft_basement'] = 288 
    return row

In [10]:
ht = ht.apply(cap_sqft, axis = 1)

In [11]:
def zero_val_bed_bath(row):
    """
    Checking extreame number of rooms in the house
    """
    if row['bedrooms'] == 0:
        row['bedrooms'] = row['floors']
    if row['bathrooms'] < 1:
        row['bathrooms'] = 1
    if row['bedrooms'] > 10 :
        row['bedrooms'] = 10
    return row

In [12]:
ht = ht.apply(zero_val_bed_bath, axis = 1)

In [13]:
ht.drop(columns = ['zip_rank', 'id', 'view', 'sqft_living15', 'sqft_lot15'], inplace = True)

In [14]:
def define_niche_grade(row):
    
    if row['niche_grade'] == 'A+':
        row['niche_grade'] = 1
    if row['niche_grade'] == 'A+ ':
        row['niche_grade'] = 1
    if row['niche_grade'] == 'A':
        row['niche_grade'] = 2
    if row['niche_grade'] == 'A-':
        row['niche_grade'] = 3
    if row['niche_grade'] == 'B+':
        row['niche_grade'] = 4
    if row['niche_grade'] == 'B':
        row['niche_grade'] = 5
    if row['niche_grade'] == 'B-':
        row['niche_grade'] = 6
    
    return row 

In [15]:
ht = ht.apply(define_niche_grade, axis = 1)

In [16]:
def define_school_grade(row):
    
    if row['school_grade'] == 'A+':
        row['school_grade'] = 1
    if row['school_grade'] == 'A+ ':
        row['school_grade'] = 1
    if row['school_grade'] == 'A ':
        row['school_grade'] = 2
    if row['school_grade'] == 'A':
        row['school_grade'] = 2
    if row['school_grade'] == 'A-':
        row['school_grade'] = 3
    if row['school_grade'] == 'A- ':
        row['school_grade'] = 3
    if row['school_grade'] == 'B+':
        row['school_grade'] = 4
    if row['school_grade'] == 'B':
        row['school_grade'] = 5
    if row['school_grade'] == 'B-':
        row['school_grade'] = 6
    if row['school_grade'] == 'C+':
        row['school_grade'] = 7
    
    return row 

In [17]:
ht = ht.apply(define_school_grade, axis = 1)

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [18]:
ht['yr_updated'] = np.nan

In [19]:
def yr_update(row):
    
    if row['yr_renovated'] == 0:
        row['yr_updated'] = 2021 - row['yr_built']
    if row['yr_renovated'] != 0:
        row['yr_updated'] = 2021 - row['yr_built']
        
    return row 

In [20]:
ht = ht.apply(yr_update, axis = 1)

In [21]:
ht['percent_bedbath'] = np.nan
ht['has_golden_ratio'] = np.nan

In [22]:
def cal_ratio_range(row):
    golden_ratio = (2/3)
    golden_ratio_plus = golden_ratio + (golden_ratio * .10)
    golden_ratio_minus = golden_ratio - (golden_ratio * .10)
    
    if row['percent_bedbath'] <= golden_ratio_plus and row['percent_bedbath'] >= golden_ratio_minus:
            row['has_golden_ratio'] = 1
    else:
        row['has_golden_ratio'] = 0
    
    return row

In [23]:
ht = ht.apply(cal_ratio_range, axis = 1)

In [24]:
# Based off of bathrooms pros and housetipster 

def ratio_bed_bath(row):
    
    ratio_bed_bath = row['bathrooms'] / row['bedrooms']
    golden_ratio = (2/3)
    row['percent_bedbath'] = abs(golden_ratio - ratio_bed_bath) 
    
    return row

In [25]:
ht = ht.apply(ratio_bed_bath, axis = 1)

In [26]:
ht['ratio_liv_lot'] = np.nan

In [27]:
def ratio_living_lot(row):
    
    row['ratio_liv_lot'] = row['sqft_lot'] / row['sqft_living']
    return row

In [28]:
ht = ht.apply(ratio_living_lot, axis = 1)

In [29]:
ht = pd.concat([ht, pd.get_dummies(ht['zipcode'])], 1)

In [30]:
ht = ht.drop(columns = 'zipcode')

In [31]:
ht = pd.concat([ht, pd.get_dummies(ht['grade'])], 1)

In [32]:
ht = ht.drop(columns = 'grade')

In [33]:
# ht.columns = ht.columns.astype(str)

In [34]:
ht

Unnamed: 0,niche_grade,school_grade,population,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,yr_updated,percent_bedbath,has_golden_ratio,ratio_liv_lot,98001,98002,98003,98004,98005,98006,98007,98008,98010,98011,98014,98019,98022,98023,98024,98027,98028,98029,98030,98031,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199,1,4,5,6,7,8,9,10,11,12,13
0,5,6,32625,20140513T000000,3.0,2.50,1950.0,8251.0,2.0,0,3,1950,0,1990,0,47.3430,-122.280,31,0.166667,0,4.231282,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,5,6,32625,20141201T000000,3.0,2.00,1790.0,7879.0,1.5,0,3,1790,0,1998,0,47.2634,-122.289,23,0.000000,0,4.401676,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,5,6,32625,20140610T000000,3.0,1.00,920.0,9812.0,1.0,0,4,920,0,1962,0,47.2958,-122.284,59,0.333333,0,10.665217,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,5,6,32625,20140626T000000,3.0,1.50,1660.0,15600.0,2.0,0,3,1660,0,1981,0,47.2589,-122.279,40,0.166667,0,9.397590,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,5,6,32625,20140924T000000,3.0,1.75,1230.0,12000.0,1.0,0,3,1230,0,1970,0,47.2878,-122.251,51,0.083333,0,9.756098,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4318,2,3,21954,20150402T000000,3.0,3.25,1550.0,1280.0,2.0,0,3,1220,330,2013,0,47.6493,-122.384,8,0.416667,0,0.825806,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4319,2,3,21954,20141015T000000,4.0,3.50,3660.0,4760.0,2.0,0,3,2840,820,2014,0,47.6482,-122.409,7,0.208333,0,1.300546,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4320,2,3,21954,20141112T000000,4.0,3.25,3610.0,4000.0,2.0,0,3,2640,970,2007,0,47.6580,-122.396,14,0.145833,0,1.108033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4321,2,3,21954,20140801T000000,3.0,2.50,1510.0,1618.0,2.5,0,3,1330,180,2011,0,47.6515,-122.384,10,0.166667,0,1.071523,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


In [35]:
ht = ht.rename(columns={1: 'lowest_g', 4:'dnmc', 5:'Poor', 6:'bare_min', 
                   7:'average', 8:'above_avg', 9:'good', 10:'high_qua', 11:'higher_qua',
                   12:'excellent_qua', 13:'mansion' })

In [36]:
ht

Unnamed: 0,niche_grade,school_grade,population,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,yr_updated,percent_bedbath,has_golden_ratio,ratio_liv_lot,98001,98002,98003,98004,98005,98006,98007,98008,98010,98011,98014,98019,98022,98023,98024,98027,98028,98029,98030,98031,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199,lowest_g,dnmc,Poor,bare_min,average,above_avg,good,high_qua,higher_qua,excellent_qua,mansion
0,5,6,32625,20140513T000000,3.0,2.50,1950.0,8251.0,2.0,0,3,1950,0,1990,0,47.3430,-122.280,31,0.166667,0,4.231282,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,5,6,32625,20141201T000000,3.0,2.00,1790.0,7879.0,1.5,0,3,1790,0,1998,0,47.2634,-122.289,23,0.000000,0,4.401676,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,5,6,32625,20140610T000000,3.0,1.00,920.0,9812.0,1.0,0,4,920,0,1962,0,47.2958,-122.284,59,0.333333,0,10.665217,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,5,6,32625,20140626T000000,3.0,1.50,1660.0,15600.0,2.0,0,3,1660,0,1981,0,47.2589,-122.279,40,0.166667,0,9.397590,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,5,6,32625,20140924T000000,3.0,1.75,1230.0,12000.0,1.0,0,3,1230,0,1970,0,47.2878,-122.251,51,0.083333,0,9.756098,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4318,2,3,21954,20150402T000000,3.0,3.25,1550.0,1280.0,2.0,0,3,1220,330,2013,0,47.6493,-122.384,8,0.416667,0,0.825806,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4319,2,3,21954,20141015T000000,4.0,3.50,3660.0,4760.0,2.0,0,3,2840,820,2014,0,47.6482,-122.409,7,0.208333,0,1.300546,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4320,2,3,21954,20141112T000000,4.0,3.25,3610.0,4000.0,2.0,0,3,2640,970,2007,0,47.6580,-122.396,14,0.145833,0,1.108033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4321,2,3,21954,20140801T000000,3.0,2.50,1510.0,1618.0,2.5,0,3,1330,180,2011,0,47.6515,-122.384,10,0.166667,0,1.071523,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0


In [37]:
ht['niche_grade^2'] = ht['niche_grade'] * ht['niche_grade']

In [38]:
ht['niche_grade_yr_updated'] = ht['niche_grade'] * ht['yr_updated']

In [39]:
ht['niche_grade_ratio_liv_lot'] =  ht['niche_grade'] * ht['ratio_liv_lot']

In [40]:
ht['yr_updated_ratio_liv_lot'] = ht['yr_updated'] * ht['ratio_liv_lot']

In [41]:
ht

Unnamed: 0,niche_grade,school_grade,population,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,yr_updated,percent_bedbath,has_golden_ratio,ratio_liv_lot,98001,98002,98003,98004,98005,98006,98007,98008,98010,98011,98014,98019,98022,98023,98024,98027,98028,98029,98030,98031,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199,lowest_g,dnmc,Poor,bare_min,average,above_avg,good,high_qua,higher_qua,excellent_qua,mansion,niche_grade^2,niche_grade_yr_updated,niche_grade_ratio_liv_lot,yr_updated_ratio_liv_lot
0,5,6,32625,20140513T000000,3.0,2.50,1950.0,8251.0,2.0,0,3,1950,0,1990,0,47.3430,-122.280,31,0.166667,0,4.231282,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,25,155,21.156410,131.169744
1,5,6,32625,20141201T000000,3.0,2.00,1790.0,7879.0,1.5,0,3,1790,0,1998,0,47.2634,-122.289,23,0.000000,0,4.401676,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,25,115,22.008380,101.238547
2,5,6,32625,20140610T000000,3.0,1.00,920.0,9812.0,1.0,0,4,920,0,1962,0,47.2958,-122.284,59,0.333333,0,10.665217,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,25,295,53.326087,629.247826
3,5,6,32625,20140626T000000,3.0,1.50,1660.0,15600.0,2.0,0,3,1660,0,1981,0,47.2589,-122.279,40,0.166667,0,9.397590,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,25,200,46.987952,375.903614
4,5,6,32625,20140924T000000,3.0,1.75,1230.0,12000.0,1.0,0,3,1230,0,1970,0,47.2878,-122.251,51,0.083333,0,9.756098,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,25,255,48.780488,497.560976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4318,2,3,21954,20150402T000000,3.0,3.25,1550.0,1280.0,2.0,0,3,1220,330,2013,0,47.6493,-122.384,8,0.416667,0,0.825806,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,4,16,1.651613,6.606452
4319,2,3,21954,20141015T000000,4.0,3.50,3660.0,4760.0,2.0,0,3,2840,820,2014,0,47.6482,-122.409,7,0.208333,0,1.300546,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,4,14,2.601093,9.103825
4320,2,3,21954,20141112T000000,4.0,3.25,3610.0,4000.0,2.0,0,3,2640,970,2007,0,47.6580,-122.396,14,0.145833,0,1.108033,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,4,28,2.216066,15.512465
4321,2,3,21954,20140801T000000,3.0,2.50,1510.0,1618.0,2.5,0,3,1330,180,2011,0,47.6515,-122.384,10,0.166667,0,1.071523,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,4,20,2.143046,10.715232


In [42]:
ht = pd.concat([ht, pd.get_dummies(ht['waterfront'])], 1)

In [43]:
ht = ht.rename(columns={0: "No_Waterfront", 1: "Waterfront"})

In [44]:
ht['water_sqft_lot'] = np.nan

In [45]:
def water_lot(row):
    if row['waterfront'] == 1:
        row['water_sqft_lot'] = row['Waterfront'] * row['sqft_lot'] 
    if row['waterfront'] == 0:
        row['water_sqft_lot'] = 0
    return row

In [46]:
ht = ht.apply(water_lot, axis=1)

In [47]:
ht = ht.drop(columns='waterfront')

In [48]:

ht = ht.drop(columns= 'date')
ht = ht.drop(columns= 'lat')
ht = ht.drop(columns= 'long')

In [49]:
effect_feat = ['school_grade', 'population', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'yr_built',
       'yr_renovated', 'yr_updated', 'has_golden_ratio', 'Poor', 'bare_min',
       'average', 'good', 'high_qua', 'higher_qua', 'excellent_qua', 'mansion',
       'No_Waterfront', 'Waterfront', 'water_sqft_lot', '98001', '98002',
       '98003', '98004', '98005', '98006', '98022', '98023', '98030', '98031',
       '98032', '98033', '98038', '98039', '98040', '98042', '98053', '98055',
       '98058', '98074', '98075', '98077', '98092', '98102', '98105', '98106',
       '98108', '98109', '98112', '98118', '98119', '98133', '98146', '98155',
       '98168', '98178', '98188', '98198', '98199', 'niche_grade',
       'yr_updated', 'ratio_liv_lot', 'niche_grade^2',
       'niche_grade_yr_updated', 'niche_grade_ratio_liv_lot',
       'yr_updated_ratio_liv_lot']

In [52]:
ht.columns = ht.columns.astype(str)

In [53]:
list(ht)

['niche_grade',
 'school_grade',
 'population',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'condition',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'yr_updated',
 'percent_bedbath',
 'has_golden_ratio',
 'ratio_liv_lot',
 '98001',
 '98002',
 '98003',
 '98004',
 '98005',
 '98006',
 '98007',
 '98008',
 '98010',
 '98011',
 '98014',
 '98019',
 '98022',
 '98023',
 '98024',
 '98027',
 '98028',
 '98029',
 '98030',
 '98031',
 '98032',
 '98033',
 '98034',
 '98038',
 '98039',
 '98040',
 '98042',
 '98045',
 '98052',
 '98053',
 '98055',
 '98056',
 '98058',
 '98059',
 '98065',
 '98070',
 '98072',
 '98074',
 '98075',
 '98077',
 '98092',
 '98102',
 '98103',
 '98105',
 '98106',
 '98107',
 '98108',
 '98109',
 '98112',
 '98115',
 '98116',
 '98117',
 '98118',
 '98119',
 '98122',
 '98125',
 '98126',
 '98133',
 '98136',
 '98144',
 '98146',
 '98148',
 '98155',
 '98166',
 '98168',
 '98177',
 '98178',
 '98188',
 '98198',
 '98199',
 'lowest_g',
 'dnmc',
 'Poor',
 'b

In [54]:
ht[effect_feat]

Unnamed: 0,school_grade,population,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,yr_built,yr_renovated,yr_updated,has_golden_ratio,Poor,bare_min,average,good,high_qua,higher_qua,excellent_qua,mansion,No_Waterfront,Waterfront,water_sqft_lot,98001,98002,98003,98004,98005,98006,98022,98023,98030,98031,98032,98033,98038,98039,98040,98042,98053,98055,98058,98074,98075,98077,98092,98102,98105,98106,98108,98109,98112,98118,98119,98133,98146,98155,98168,98178,98188,98198,98199,niche_grade,yr_updated.1,ratio_liv_lot,niche_grade^2,niche_grade_yr_updated,niche_grade_ratio_liv_lot,yr_updated_ratio_liv_lot
0,6,32625,3.0,2.50,1950.0,8251.0,2.0,1950,0,1990,0,31,0,0,0,1,0,0,0,0,0,1,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,31,4.231282,25,155,21.156410,131.169744
1,6,32625,3.0,2.00,1790.0,7879.0,1.5,1790,0,1998,0,23,0,0,0,1,0,0,0,0,0,1,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,23,4.401676,25,115,22.008380,101.238547
2,6,32625,3.0,1.00,920.0,9812.0,1.0,920,0,1962,0,59,0,0,0,1,0,0,0,0,0,1,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,59,10.665217,25,295,53.326087,629.247826
3,6,32625,3.0,1.50,1660.0,15600.0,2.0,1660,0,1981,0,40,0,0,0,1,0,0,0,0,0,1,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,40,9.397590,25,200,46.987952,375.903614
4,6,32625,3.0,1.75,1230.0,12000.0,1.0,1230,0,1970,0,51,0,0,1,0,0,0,0,0,0,1,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,51,9.756098,25,255,48.780488,497.560976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4318,3,21954,3.0,3.25,1550.0,1280.0,2.0,1220,330,2013,0,8,0,0,0,0,1,0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,8,0.825806,4,16,1.651613,6.606452
4319,3,21954,4.0,3.50,3660.0,4760.0,2.0,2840,820,2014,0,7,0,0,0,0,1,0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,7,1.300546,4,14,2.601093,9.103825
4320,3,21954,4.0,3.25,3610.0,4000.0,2.0,2640,970,2007,0,14,0,0,0,0,1,0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,14,1.108033,4,28,2.216066,15.512465
4321,3,21954,3.0,2.50,1510.0,1618.0,2.5,1330,180,2011,0,10,0,0,0,0,0,0,0,0,0,1,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,10,1.071523,4,20,2.143046,10.715232


In [None]:
# transformed_holdout = final_scaler.transform(ht)

## Step 3: Predict the holdout set

In [55]:
final_answers = final_model.predict(ht[effect_feat])

In [56]:
final_answers

array([269600.37202764, 217728.99407448, 114212.82379751, ...,
       996551.61702679, 572270.13452543, 514045.7564724 ])

In [58]:
df = pd.DataFrame(final_answers)

## Step 4: Export your predictions

In [59]:
df.to_csv('housing_preds_Rafael_ferreira.csv')