# Kings County Housing Prices Bakeoff

Below are a list of steps that you should take while trying to complete your bake-off entry.

## Step 1: Read in Data

In [116]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import norm 
import math
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
import statsmodels.api as sm
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
import fiona
from shapely.geometry import Point, Polygon
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import seaborn as sns
plt.style.use('seaborn')
sns.set(style="white")

In [2]:
hf = pd.read_csv('kc_house_data_train.csv')
zipfile = "Zip_Codes-shp"
street_map = gpd.read_file(zipfile)
crs = {'init': 'epsg:4326'}

In [3]:
len(hf)

17290

In [4]:
features = ['zipcode','bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition',
       'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15']

hf_features = hf[features]
target = hf.price

In [5]:
hf

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,2591820310,20141006T000000,365000.0,4,2.25,2070,8893,2.0,0,...,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,1,7974200820,20140821T000000,865000.0,5,3.00,2900,6730,1.0,0,...,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,2,7701450110,20140815T000000,1038000.0,4,2.50,3770,10893,2.0,0,...,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,3,9522300010,20150331T000000,1490000.0,3,3.50,4560,14608,2.0,0,...,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,4,9510861140,20140714T000000,711000.0,3,2.50,2550,5376,2.0,0,...,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,17285,627300195,20150303T000000,750000.0,5,2.50,3240,9960,1.0,0,...,8,2020,1220,1958,0,98008,47.5858,-122.112,2730,10400
17286,17286,8819900270,20140520T000000,440000.0,2,1.75,1300,4000,2.0,0,...,7,1300,0,1948,0,98105,47.6687,-122.288,1350,4013
17287,17287,3816300095,20140514T000000,310000.0,3,1.00,1050,9876,1.0,0,...,7,1050,0,1953,0,98028,47.7635,-122.262,1760,9403
17288,17288,122069107,20141204T000000,427500.0,3,1.50,1900,43186,1.5,0,...,7,1300,600,1971,0,98038,47.4199,-121.990,2080,108028


In [6]:
zip_grade = pd.read_csv('Niche.csv')
hf = zip_grade.set_index('zipcode').join(hf.set_index('zipcode'))


In [7]:
hf = hf.reset_index()

In [8]:
hf = hf.dropna(subset = ['id'])

In [9]:
len(hf)

17290

In [10]:
hf['yr_built']

0        2014
1        2004
2        1967
3        1908
4        1909
         ... 
17285    1942
17286    1951
17287    1948
17288    1936
17289    2012
Name: yr_built, Length: 17290, dtype: int64

In [11]:
hf['bedrooms'] .value_counts()

3     7864
4     5488
2     2204
5     1283
6      229
1      160
7       30
0       12
8       10
9        5
10       3
11       1
33       1
Name: bedrooms, dtype: int64

## Step 2: Exploratory Data Analysis 
    
Become familiar with the data.  Look to see if there are any extreme values.  

Additionally create data visualizations to determine if there are any relationships between your features and your target variables.  

In [None]:
# Plot Histogram
sns.distplot(hf['price'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(hf['price'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res = stats.probplot(hf['price'], plot=plt)
plt.show()

print("Skewness: %f" % hf['price'].skew())
print("Kurtosis: %f" % hf['price'].kurt())

In [None]:
fig, ax = plt.subplots(figsize = (15,15))
street_map.plot(ax = ax)

In [None]:
hf.columns

In [None]:
sqliv = hf.loc[hf['sqft_living'] <= 6850]
len(sqliv)

In [None]:
sqlot = hf.loc[hf['sqft_lot'] <= 425000]
len(sqlot)

In [None]:
floors = hf.loc[hf['floors'] <= 3.0]
len(floors)

In [None]:
view = hf.loc[hf['view'] >= 3]
len(view)

In [None]:
cond = hf.loc[hf['condition'] >= 5 ]
len(cond)

In [None]:
geometry = [Point(xy) for xy in zip(hf['long'], hf['lat'])]
geometry[:3]

In [None]:
sqabove = hf.loc[hf['sqft_above'] <= 5800]
len(sqabove)

In [None]:
sqliv15 = hf.loc[hf['sqft_living15'] <= 4820]
len(sqliv15)

In [None]:
sqlot15 = hf.loc[hf['sqft_lot15'] <= 250000]
len(sqlot15)

In [None]:
sqbase = hf.loc[hf['sqft_basement'] <= 2180]
len(sqbase)

In [None]:
hf == 0

In [None]:
hf[hf['bathrooms'] >= 5.5]

In [None]:
geo_df = gpd.GeoDataFrame(hf,
                         crs = crs,
                         geometry = geometry)
geo_df.columns

In [None]:
fig, ax = plt.subplots(figsize = (15,15))
street_map.plot(ax = ax, alpha = 0.4, color = 'grey')
# geo_df[geo_df['Unnamed: 0'] >= 1000].plot(ax=ax, markersize = 20, color = 'blue', marker = "^", label = "Housing ID")
# geo_df[geo_df['Unnamed: 0'] <= 1000].plot(ax=ax, markersize = 20, color = 'red', marker = "o", label = "Housing ID")
geo_df[geo_df['price'] <= 7000000].plot(ax=ax, markersize = 20, color = 'green', marker = "+", label = "Housing ID")




plt.legend(prop={'size': 15})

In [None]:
hf.shape

In [None]:
hf[500:540]

In [None]:
hf.corr()

In [None]:
plt.style.use('seaborn')
sns.set(style="white")

# Set up  matplotlib figure (might have to play around with the 
# figsize if your labels aren't so legible and you don't want
# to mess with the labels using matplotlib)
f, ax = plt.subplots(figsize=(10, 9))

# Create an upper triangular matrix to use to get rid of duplicate/
# useless values
mask = np.zeros_like(hf.corr())
mask[np.triu_indices_from(mask)] = True

# plot the heatmap
with sns.axes_style("white"):
    ax = sns.heatmap(hf.corr(), mask=mask, square=True)
    
# fix for mpl bug that cuts off top/bottom of seaborn viz
# credit: https://github.com/mwaskom/seaborn/issues/1773 SalMac86's post
b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) # update the ylim(bottom, top) values
plt.show() # ta-da!

In [None]:
price_corr = hf[hf.columns[:]].corr()['price'][:] 
price_corr.sort_values()

In [None]:
plt.scatter(hf['bedrooms'], hf['bathrooms'], marker ='x');

In [None]:
plt.scatter(hf['price'], hf['bedrooms'], marker = '^')

In [None]:
plt.scatter(hf['condition'], hf['grade'], marker = '^')

In [None]:
plt.plot(np.unique(hf['price']), np.poly1d(np.polyfit(hf['price'], hf['bedrooms'], 1))(np.unique(hf['price'])))

In [None]:

sns.regplot(hf['price'],hf['bedrooms'], scatter_kws={"color": "black"}, line_kws={"color": "red"})

In [None]:
sns.regplot(hf['price'],hf['bathrooms'], scatter_kws={"color": "black"}, line_kws={"color": "red"})

In [None]:
hf[hf['sqft_living'] >= 12000]

In [None]:
hf['yr_renovated'].unique()

In [None]:
fig, axes = plt.subplots(7,2, figsize = (20, 20))


sns.boxplot(ax = axes[0,0], data = hf['bedrooms'], orient = 'h')
sns.boxplot(ax = axes[0,1], data = hf['bathrooms'], orient = 'h')
sns.boxplot(ax = axes[1,0], data = hf['price'], orient = 'h')
sns.boxplot(ax = axes[1,1], data = hf['sqft_living'], orient = 'h')
sns.boxplot(ax = axes[2,0], data = hf['sqft_lot'], orient = 'h')
sns.boxplot(ax = axes[2,1], data = hf['floors'], orient = 'h')
sns.boxplot(ax = axes[3,0], data = hf['grade'], orient = 'h')
sns.boxplot(ax = axes[3,1], data = hf['sqft_above'], orient = 'h')
sns.boxplot(ax = axes[4,0], data = hf['sqft_basement'], orient = 'h')
sns.boxplot(ax = axes[4,1], data = hf['yr_built'], orient = 'h')
sns.boxplot(ax = axes[5,0], data = hf['yr_renovated'], orient = 'h')
sns.boxplot(ax = axes[5,1], data = hf['zipcode'], orient = 'h')
# sns.boxplot(ax = axes[6,0], data = hf['sqft_living15'], orient = 'h')
# sns.boxplot(ax = axes[6,1], data = hf['sqft_lot15'], orient = 'h')


plt.show

In [None]:
hf.describe()

In [None]:
a = hf.zipcode.unique()
hf['geometry']

In [None]:
print(sorted(a))

In [None]:
hf.keys()

In [None]:
numerical = ['price', 'sqft_living', 'sqft_lot', 'view',
             'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15'
    
]

categorical = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'condition', 'grade', 'zipcode'
    
]

houses = hf[numerical + categorical]

houses.shape

In [None]:
sns.set_context('notebook', font_scale = 1.4)

sns.distplot(
    houses['price'], norm_hist=False, kde=False, bins=20, hist_kws={"alpha": 1}
).set(xlabel='price', ylabel='Count');

In [None]:
houses[numerical].hist(bins=15, figsize=(30, 50), layout=(13, 2));


In [None]:
fig, ax = plt.subplots(2, 4, figsize=(30, 20))
for variable, subplot in zip(categorical, ax.flatten()):
    sns.countplot(hf[variable], ax=subplot)
    for label in subplot.get_xticklabels():
        label.set_rotation(90)

In [None]:
sns.jointplot(x=hf['sqft_living'], y=hf['sqft_living15']);


In [None]:
sns.jointplot(x=hf['price'], y=hf['yr_built']);


In [None]:

fig, ax = plt.subplots(4, 2, figsize=(30, 20))
for var, subplot in zip(categorical, ax.flatten()):
    sns.boxplot(x=var, y='price', data=hf, ax=subplot)

In [None]:
sorted_nb = hf.groupby(['zipcode'])['price'].median().sort_values()
sns.boxplot(x=hf['zipcode'], y=hf['price'], order=list(sorted_nb.index))

In [None]:
def correlation_heatmap(hf1):
    _,ax=plt.subplots(figsize=(25,20))
    colormap=sns.diverging_palette(220,10,as_cmap=True)
    sns.heatmap(hf.corr(),annot=True,cmap=colormap)
    
correlation_heatmap(hf)

In [None]:
fig,axes=plt.subplots(nrows=1,ncols=1,figsize=(20,15))
plt.title("house prices by sqft_above")
plt.xlabel('sqft_above')
plt.ylabel('house prices')
sns.barplot(x='sqft_above',y='price',data=hf)

In [None]:
nobed = (hf['bedrooms'] == 0 )

In [None]:
hf[(hf['bathrooms'] == 2)]

#avgbathrooms = hf.loc[hf['bathrooms'] == 2].mean('bedrooms')
avgbathrooms =  hf.groupby(hf['bathrooms'] == 1)['bedrooms'].mean()
avgbathrooms[True]

In [None]:
avgbathrooms =  hf.groupby('bathrooms', as_index=False)['bedrooms'].mean()
avgbathrooms

In [None]:
(hf['sqft_living'] == 0)

## Step 3: Clean up any issues (extreme values, etc.) with the data.  

Remember that you can't just delete rows with extreme values. Similar observations might be present in the holdout data set, and you can't just delete those rows and not have a prediction for it. 

In [12]:
sq_lot_mean = hf['sqft_lot'].mean()
sq_lot_std3 = hf['sqft_lot'].std()*3
sq_lot_mstd = sq_lot_mean + sq_lot_std3

sq_living_mean = hf['sqft_living'].mean()
sq_living_std3 = hf['sqft_living'].std()*3
sq_living_mstd = sq_living_mean + sq_living_std3

sq_above_mean = hf['sqft_above'].mean()
sq_above_std3 = hf['sqft_above'].std()*3
sq_above_mstd = sq_above_mean + sq_above_std3

sq_base_mean = hf['sqft_basement'].mean()
sq_base_std3 = hf['sqft_basement'].std()*3
sq_base_mstd = sq_base_mean + sq_base_std3

In [13]:
def cap_sqft(row):
    if row['sqft_lot'] > sq_lot_mstd:
        row['sqft_lot'] = sq_lot_mstd
    if row['sqft_living'] > sq_living_mstd:
        row['sqft_living'] = sq_living_mstd
    if row['sqft_above'] > sq_above_mstd :
        row['sqft_above'] = sq_above_mstd
    if row['sqft_basement'] > sq_base_mstd :
        row['sqft_basement'] = sq_base_mstd 
    return row

In [14]:
hf = hf.apply(cap_sqft, axis = 1)

In [15]:
hf.shape

(17290, 26)

In [19]:
def zero_val_bed_bath(row):
    """
    Checking extreame number of rooms in the house
    """
    if row['bedrooms'] == 0:
        row['bedrooms'] = row['floors']
    if row['bathrooms'] < 1:
        row['bathrooms'] = 1
    if row['bedrooms'] > 10 :
        row['bedrooms'] = 10
    return row

In [20]:
hf = hf.apply(zero_val_bed_bath, axis = 1)

In [22]:
hf.shape

(17290, 26)

In [25]:
hf.drop(columns = ['zip_rank', 'Unnamed: 0', 'id', 'view', 'sqft_living15', 'sqft_lot15'], inplace = True)

In [None]:
features = ['bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade',
       'yr_built', 'yr_renovated']

In [None]:
model_test(hf, features, target)

In [26]:
def define_niche_grade(row):
    
    if row['niche_grade'] == 'A+':
        row['niche_grade'] = 1
    if row['niche_grade'] == 'A+ ':
        row['niche_grade'] = 1
    if row['niche_grade'] == 'A':
        row['niche_grade'] = 2
    if row['niche_grade'] == 'A-':
        row['niche_grade'] = 3
    if row['niche_grade'] == 'B+':
        row['niche_grade'] = 4
    if row['niche_grade'] == 'B':
        row['niche_grade'] = 5
    if row['niche_grade'] == 'B-':
        row['niche_grade'] = 6
    
    return row 

In [27]:
hf = hf.apply(define_niche_grade, axis = 1)

In [None]:
hf['niche_grade'].unique()

In [None]:
features = ['niche_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade',
       'yr_built', 'yr_renovated']

hf_features = hf[features]
target = hf['price']

In [None]:
model_test(hf, features, target)

In [28]:
def define_school_grade(row):
    
    if row['school_grade'] == 'A+':
        row['school_grade'] = 1
    if row['school_grade'] == 'A+ ':
        row['school_grade'] = 1
    if row['school_grade'] == 'A ':
        row['school_grade'] = 2
    if row['school_grade'] == 'A':
        row['school_grade'] = 2
    if row['school_grade'] == 'A-':
        row['school_grade'] = 3
    if row['school_grade'] == 'A- ':
        row['school_grade'] = 3
    if row['school_grade'] == 'B+':
        row['school_grade'] = 4
    if row['school_grade'] == 'B':
        row['school_grade'] = 5
    if row['school_grade'] == 'B-':
        row['school_grade'] = 6
    if row['school_grade'] == 'C+':
        row['school_grade'] = 7
    
    return row 

In [29]:
hf = hf.apply(define_school_grade, axis = 1)

In [30]:
hf['school_grade'].unique()

array([6, 5, 1, 4, 3, 2, 7])

In [31]:
features = ['niche_grade', 'school_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade',
       'yr_built', 'yr_renovated']

hf_features = hf[features]
target = hf['price']

In [None]:
model_test(hf, features, target)

In [32]:
hf.shape

(17290, 20)

## Step 4: Generate new features that you think could be important.

After doing this, you will want to go back to steps 2 and 3 to investigate these new features.

In [33]:
hf['yr_updated'] = np.nan

In [34]:
def yr_update(row):
    
    if row['yr_renovated'] == 0:
        row['yr_updated'] = 2021 - row['yr_built']
    if row['yr_renovated'] != 0:
        row['yr_updated'] = 2021 - row['yr_built']
        
    return row 
    

In [36]:
hf = hf.apply(yr_update, axis = 1)

In [None]:
features = ['niche_grade', 'school_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade',
       'yr_built', 'yr_renovated', 'yr_updated']

hf_features = hf[features]
target = hf['price']

In [None]:
model_test(hf, features, target)

In [37]:
hf['price_per_sqft'] = np.nan

In [38]:
def price_per_sqft(row):
    price_sqft = row['price'] / row['sqft_living']
    row['price_per_sqft'] = price_sqft
    
    return row

In [39]:
hf = hf.apply(price_per_sqft, axis = 1)

In [None]:
features = ['niche_grade', 'school_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade',
       'yr_built', 'yr_renovated', 'yr_updated', 'price_per_sqft']

hf_features = hf[features]
target = hf['price']

In [None]:
model_test(hf, features, target)

In [40]:
hf['percent_bedbath'] = np.nan
hf['has_golden_ratio'] = np.nan

In [41]:
# Based off of bathrooms pros and housetipster 

def ratio_bed_bath(row):
    
    ratio_bed_bath = row['bathrooms'] / row['bedrooms']
    golden_ratio = (2/3)
    row['percent_bedbath'] = abs(golden_ratio - ratio_bed_bath) 
    
    return row

In [42]:
hf = hf.apply(ratio_bed_bath, axis = 1)

In [None]:
features = ['niche_grade', 'school_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade'
       'yr_built', 'yr_renovated', 'yr_updated','price_per_sqft', 'percent_bedbath']

hf_features = hf[features]
target = hf['price']

In [None]:
model_test(hf, features, target)

In [43]:
def cal_ratio_range(row):
    golden_ratio = (2/3)
    golden_ratio_plus = golden_ratio + (golden_ratio * .10)
    golden_ratio_minus = golden_ratio - (golden_ratio * .10)
    
    if row['percent_bedbath'] <= golden_ratio_plus and row['percent_bedbath'] >= golden_ratio_minus:
            row['has_golden_ratio'] = 1
    else:
        row['has_golden_ratio'] = 0
    
    return row

In [44]:
hf = hf.apply(cal_ratio_range, axis = 1)

In [46]:
features = ['niche_grade', 'school_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement', 'grade',
       'yr_built', 'yr_renovated', 'yr_updated', 'price_per_sqft', 'percent_bedbath', 'has_golden_ratio']

hf_features = hf[features]
target = hf['price']

In [None]:
model_test(hf, features, target)

In [47]:
hf['ratio_liv_lot'] = np.nan

In [48]:
def ratio_living_lot(row):
    
    row['ratio_liv_lot'] = row['sqft_lot'] / row['sqft_living']
    return row
    

In [49]:
hf = hf.apply(ratio_living_lot, axis = 1)

### 4.1) Identify a categorical variable in the data set and create dummy columns.

In [None]:
# your code here
# dummy variable for grade 
#

In [50]:
hf = pd.concat([hf, pd.get_dummies(hf['grade'])], 1)

In [51]:
hf.columns = hf.columns.astype(str)

In [None]:
# lowest g: 1
# low g: 3 Falls short of minimum building standards. Normally cabin or inferior structure.

# dnmc: 4 Generally older, low quality construction. Does not meet code.

# poor: 5 Low construction costs and workmanship. Small, simple design.

# bare_min: 6 Lowest grade currently meeting building code. Low quality materials and simple designs.

# average: 7 Average grade of construction and design. Commonly seen in plats and older sub-divisions.

# above_avg: 8 Just above average in construction and design. Usually better materials in both the exterior and interior finish work.

# good: 9 Better architectural design with extra interior and exterior design and quality.

# high_qua: 10 Homes of this quality generally have high quality features. Finish work is better and more design quality is seen in the floor plans. Generally have a larger square footage.

# higher_qua: 11 Custom design and higher quality finish work with added amenities of solid woods, bathroom fixtures and more luxurious options.

# excellent qua: 12 Custom design and excellent builders. All materials are of the highest quality and all conveniences are present.

# mansion: 13 Generally custom designed and built. Mansion level. Large amount of highest quality cabinet work, wood trim, marble, entry ways etc.

In [52]:
hf = hf.rename(columns={'1': 'lowest_g', '3': 'low_g', '4':'dnmc', '5':'Poor', '6':'bare_min', 
                   '7':'average', '8':'above_avg', '9':'good', '10':'high_qua', '11':'higher_qua',
                   '12':'excellent_qua', '13':'mansion' })

In [None]:
hf.keys()

In [None]:
features = ['niche_grade', 'school_grade', 'bathrooms', 'population', 'yr_renovated', 'population',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'yr_updated', 'price_per_sqft', 'percent_bedbath', 'has_golden_ratio', 'lowest_g',
       'low_g', 'dnmc', 'Poor', 'bare_min', 'average', 'above_avg', 'good',
       'high_qua', 'higher_qua', 'excellent_qua', 'mansion']

hf_features = hf[features]
target = hf['price']

### 4.2) There is a column that gives the date for when the house was sold, how could this be useful in your model? How might you transform the current column to a more useful feature?

In [None]:
# your code here
hf['date'] = pd.to_datetime(hf['date'], format = '%Y/%m/%d')

In [None]:
hf['price_per_sqft'][:30]

In [None]:
hf.columns = hf.columns.astype(str)

In [None]:
hf.columns

### 4.3) There are columns for when the house was built and when it was renovated.  How could you use these columns to create a new column?

In [None]:
#your code here 

In [None]:
hf['price']

### <ins>Non-linear transformations</ins>

### 4.4) Create a polynomial feature for two of your continuous variables.

In [53]:
features = ['niche_grade', 'yr_updated', 'ratio_liv_lot']

hf_features = hf[features]
target = hf['price']

In [54]:
# your code here
poly = PolynomialFeatures(degree=2, include_bias=False)

In [55]:
poly_data = poly.fit_transform(hf_features)

In [56]:
poly_columns = poly.get_feature_names(hf_features.columns)

In [57]:
df_poly = pd.DataFrame(poly_data, columns=poly_columns)

In [58]:
df_poly.columns

Index(['niche_grade', 'yr_updated', 'ratio_liv_lot', 'niche_grade^2',
       'niche_grade yr_updated', 'niche_grade ratio_liv_lot', 'yr_updated^2',
       'yr_updated ratio_liv_lot', 'ratio_liv_lot^2'],
      dtype='object')

In [59]:
lm_2 = LinearRegression()

#fit the linear regression to the data
lm_2 = lm_2.fit(df_poly, target)

In [60]:
lm_2.score(df_poly, target)

0.2529775096698009

In [61]:
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(df_poly, target, random_state=34,test_size=0.2)


In [62]:
#instantiate a linear regression object
lr_poly = LinearRegression()

#fit the linear regression to the data
lr_poly = lr_poly.fit(X_train_poly, y_train_poly)

In [63]:
train_preds = lr_poly.predict(X_train_poly)

In [64]:
train_rmse_poly = np.sqrt(metrics.mean_squared_error(y_train_poly, train_preds))

print('Root Mean Squared Error:' , train_rmse_poly)

Root Mean Squared Error: 323005.60230211547


### 4.5) Create an interaction feature between a binary variable (dummy variable) and a continuous variable.

In [65]:
# your code here
# waterfront times sqft_lot
hf = pd.concat([hf, pd.get_dummies(hf['waterfront'])], 1)

In [66]:
hf = hf.rename(columns={0: "No_Waterfront", 1: "Waterfront"})

In [None]:
hf

In [67]:
hf['water_sqft_lot'] = np.nan

In [68]:
def water_lot(row):
    if row['waterfront'] == 1:
        row['water_sqft_lot'] = row['Waterfront'] * row['sqft_lot'] 
    if row['waterfront'] == 0:
        row['water_sqft_lot'] = 0
    return row

In [69]:
hf = hf.apply(water_lot, axis=1)

In [None]:
hf.drop(columns='waterfront')

In [223]:
features = ['population','bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'condition',
       'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
        'yr_updated', 'percent_bedbath', 'has_golden_ratio',  'lowest_g',
       'low_g', 'dnmc', 'Poor', 'bare_min', 'average', 'above_avg', 'good',
       'high_qua', 'higher_qua', 'excellent_qua', 'mansion', 'No_Waterfront', 'Waterfront', 'water_sqft_lot', 'ratio_liv_lot']

hf_features = hf[features]
target = hf.price

In [None]:
model_test(hf, features, target)

## Step 5: Train-Test Split

If you plan on doing any scaling of your data, make sure it is done at the appropriate time. 

In [70]:
hf.columns

Index(['zipcode', 'niche_grade', 'school_grade', 'population', 'date', 'price',
       'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'condition', 'grade', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'lat', 'long', 'yr_updated',
       'price_per_sqft', 'percent_bedbath', 'has_golden_ratio',
       'ratio_liv_lot', 'lowest_g', 'low_g', 'dnmc', 'Poor', 'bare_min',
       'average', 'above_avg', 'good', 'high_qua', 'higher_qua',
       'excellent_qua', 'mansion', 'No_Waterfront', 'Waterfront',
       'water_sqft_lot'],
      dtype='object')

In [229]:
features = [ 'niche_grade', 'school_grade','population','bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'condition',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 
        'yr_updated', 'percent_bedbath', 'has_golden_ratio', 'ratio_liv_lot', 'lowest_g',
       'low_g', 'dnmc', 'Poor', 'bare_min', 'average', 'above_avg', 'good',
       'high_qua', 'higher_qua', 'excellent_qua', 'mansion', 'No_Waterfront', 
            'Waterfront', 'water_sqft_lot']

hf_features = hf[features]
target = hf.price

In [230]:
hf_features
target.unique()

array([309000., 286651., 260000., ..., 671000., 827235., 557800.])

### 5.1) Perform a train-test split of the data.

In [231]:
def model_test(df, features, target):
    df_features = df[features]
    X_train, X_test, y_train, y_test = train_test_split(df_features, target, random_state=34,test_size=0.2)
    #instantiate a linear regression object
    lm = linear_model.LinearRegression()

    #fit the linear regression to the data
    lm = lm.fit(X_train, y_train)
    
    y_train_pred = lm.predict(X_train)
    
    train_mae = metrics.mean_absolute_error(y_train, y_train_pred)
    train_mse = metrics.mean_squared_error(y_train, y_train_pred)
    train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
   
    # Test Set
    y_pred = lm.predict(X_test)
    
    #test_mae = metrics.mean_absolute_error(y_test, y_pred)
    test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    #print('Mean Absolute Error:' + str(metrics.mean_absolute_error(y_test, y_pred)))
    #print('Mean Squared Error:' + str(metrics.mean_squared_error(y_test, y_pred)))
    print('Training: ', int(train_rmse), "vs. Testing: ", int(test_rmse))

In [232]:
model_test(hf, features, target)

Training:  189808 vs. Testing:  185552


### 5.2) Fit your scaler to training the data.

In [241]:
features = ['school_grade', 'population', 'bedrooms', 'bathrooms',
       'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'condition',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'percent_bedbath', 'has_golden_ratio', 'lowest_g',
       'low_g', 'dnmc', 'Poor', 'bare_min', 'average', 'above_avg', 'good',
       'high_qua', 'higher_qua', 'excellent_qua', 'mansion', 'No_Waterfront',
       'Waterfront', 'water_sqft_lot']

In [242]:
scale_df = pd.concat([hf[features], df_poly], 1)

In [243]:
scale_df

Unnamed: 0,school_grade,population,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,sqft_above,...,water_sqft_lot,niche_grade,yr_updated,ratio_liv_lot,niche_grade^2,niche_grade yr_updated,niche_grade ratio_liv_lot,yr_updated^2,yr_updated ratio_liv_lot,ratio_liv_lot^2
0,6,32625,5.0,2.75,2481.0,4045.0,2.0,0,3,2481.0,...,0.0,5.0,7.0,1.630391,25.0,35.0,8.151955,49.0,11.412737,2.658175
1,6,32625,3.0,2.50,1830.0,4997.0,2.0,0,3,1830.0,...,0.0,5.0,17.0,2.730601,25.0,85.0,13.653005,289.0,46.420219,7.456182
2,6,32625,5.0,2.25,2320.0,6375.0,1.0,0,4,1270.0,...,0.0,5.0,54.0,2.747845,25.0,270.0,13.739224,2916.0,148.383621,7.550651
3,6,32625,3.0,1.00,1150.0,19200.0,1.0,0,4,1150.0,...,0.0,5.0,113.0,16.695652,25.0,565.0,83.478261,12769.0,1886.608696,278.744802
4,6,32625,3.0,1.00,940.0,10890.0,1.0,0,4,940.0,...,0.0,5.0,112.0,11.585106,25.0,560.0,57.925532,12544.0,1297.531915,134.214690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,3,21954,2.0,1.00,810.0,6480.0,1.0,0,5,810.0,...,0.0,2.0,79.0,8.000000,4.0,158.0,16.000000,6241.0,632.000000,64.000000
17286,3,21954,4.0,2.50,3690.0,11191.0,1.0,0,4,2190.0,...,0.0,2.0,70.0,3.032791,4.0,140.0,6.065583,4900.0,212.295393,9.197823
17287,3,21954,3.0,2.50,3560.0,8297.0,1.0,0,4,1650.0,...,0.0,2.0,73.0,2.330618,4.0,146.0,4.661236,5329.0,170.135112,5.431780
17288,3,21954,2.0,2.50,2720.0,4913.0,1.0,0,4,1700.0,...,0.0,2.0,85.0,1.806250,4.0,170.0,3.612500,7225.0,153.531250,3.262539


In [244]:
X_train, X_test, y_train, y_test = train_test_split(scale_df, target, random_state=34,test_size=0.2)

In [245]:
X_train.select_dtypes(include=["number"]).columns

Index(['school_grade', 'population', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'percent_bedbath',
       'has_golden_ratio', 'lowest_g', 'low_g', 'dnmc', 'Poor', 'bare_min',
       'average', 'above_avg', 'good', 'high_qua', 'higher_qua',
       'excellent_qua', 'mansion', 'No_Waterfront', 'Waterfront',
       'water_sqft_lot', 'niche_grade', 'yr_updated', 'ratio_liv_lot',
       'niche_grade^2', 'niche_grade yr_updated', 'niche_grade ratio_liv_lot',
       'yr_updated^2', 'yr_updated ratio_liv_lot', 'ratio_liv_lot^2'],
      dtype='object')

In [246]:
['condition',
 'percent_bedbath',
 'lowest_g',
 'low_g',
 'dnmc',
 'above_avg',
 'ratio_liv_lot',
 'yr_updated^2',
 'ratio_liv_lot^2']

['condition',
 'percent_bedbath',
 'lowest_g',
 'low_g',
 'dnmc',
 'above_avg',
 'ratio_liv_lot',
 'yr_updated^2',
 'ratio_liv_lot^2']

In [247]:
#your code here 
scaled_features = [ 'niche_grade', 'school_grade','population',
       'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built',
        'yr_renovated',  'yr_updated', 'has_golden_ratio', 'Poor', 'bare_min', 'average', 'good',
       'high_qua', 'higher_qua', 'excellent_qua', 'mansion', 'water_sqft_lot']

In [248]:
not_scaled = [x for x in X_train.columns if x not in scaled_features]
not_scaled

['bedrooms',
 'bathrooms',
 'floors',
 'waterfront',
 'condition',
 'percent_bedbath',
 'lowest_g',
 'low_g',
 'dnmc',
 'above_avg',
 'No_Waterfront',
 'Waterfront',
 'ratio_liv_lot',
 'niche_grade^2',
 'niche_grade yr_updated',
 'niche_grade ratio_liv_lot',
 'yr_updated^2',
 'yr_updated ratio_liv_lot',
 'ratio_liv_lot^2']

In [249]:
scaler = StandardScaler()

# fit the scaler to the training data
scaler.fit(X_train[scaled_features])

#transform the training data
scaled_data = scaler.transform(X_train[scaled_features])

In [250]:
X_train.shape

(13832, 39)

In [251]:
X_train.columns

Index(['school_grade', 'population', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'percent_bedbath',
       'has_golden_ratio', 'lowest_g', 'low_g', 'dnmc', 'Poor', 'bare_min',
       'average', 'above_avg', 'good', 'high_qua', 'higher_qua',
       'excellent_qua', 'mansion', 'No_Waterfront', 'Waterfront',
       'water_sqft_lot', 'niche_grade', 'yr_updated', 'ratio_liv_lot',
       'niche_grade^2', 'niche_grade yr_updated', 'niche_grade ratio_liv_lot',
       'yr_updated^2', 'yr_updated ratio_liv_lot', 'ratio_liv_lot^2'],
      dtype='object')

In [252]:
scaled_data.shape

(13832, 20)

In [253]:
len(scaled_features)

20

In [254]:
X_train_scaled = pd.DataFrame(data=scaled_data, columns=scaled_features, index=X_train.index)

In [255]:
X_train = pd.concat([X_train_scaled, X_train[not_scaled]], axis=1)

### 5.3) Transform the testing set with the scaler.

In [256]:
#your code here 
scaled_test_data = scaler.transform(X_test[scaled_features])

In [257]:
X_test_scaled = pd.DataFrame(data=scaled_test_data, columns=scaled_features, index=X_test.index)

In [258]:
X_test = pd.concat([X_test_scaled, X_test[not_scaled]], axis=1)

### 5.4) Fit the model to the training data.

In [259]:
#your code here 
#your code here 
#instantiate a linear regression object
lm = LinearRegression()

#fit the linear regression to the data
lm = lm.fit(X_train, y_train)

### 5.5) Use the model to predict on the training set and the test set.

In [260]:
#your code here 
y_train_pred = lm.predict(X_train)

y_test_pred = lm.predict(X_test)

### 5.6) Evaluate the training and test predictions using RMSE.

In [261]:
#your code here 
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))

test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

print('Training Root Mean Squared Error:' , train_rmse)
print('Training Root Mean Squared Error:' , test_rmse)

Training Root Mean Squared Error: 186217.86597560364
Training Root Mean Squared Error: 182021.65584622513


### 5.7) Determine if your model is overfit.

In [192]:
#your code here 

## Step 6: Utilize some different feature selection techniques before or in conjuction with fitting your models.

### 6.1) Utilize a filter method to identify some features to remove from the model.  

In [262]:
# your code here
#F-test
X_train.shape

(13832, 39)

In [276]:
selector = SelectKBest(f_regression, k=30)

selector.fit(X_train, y_train)

SelectKBest(k=30, score_func=<function f_regression at 0x7fd0cd403a60>)

In [277]:
selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]

In [278]:
list(removed_columns)

['condition',
 'percent_bedbath',
 'lowest_g',
 'low_g',
 'dnmc',
 'above_avg',
 'ratio_liv_lot',
 'yr_updated^2',
 'ratio_liv_lot^2']

In [279]:
list(selected_columns)

['niche_grade',
 'school_grade',
 'population',
 'sqft_living',
 'sqft_lot',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'yr_updated',
 'has_golden_ratio',
 'Poor',
 'bare_min',
 'average',
 'good',
 'high_qua',
 'higher_qua',
 'excellent_qua',
 'mansion',
 'water_sqft_lot',
 'bedrooms',
 'bathrooms',
 'floors',
 'waterfront',
 'No_Waterfront',
 'Waterfront',
 'niche_grade^2',
 'niche_grade yr_updated',
 'niche_grade ratio_liv_lot',
 'yr_updated ratio_liv_lot']

In [280]:
#instantiate a linear regression object
lm_kbest = LinearRegression()

#fit the linear regression to the data
lm_kbest.fit(X_train[selected_columns], y_train)

y_train_kbest = lm_kbest.predict(X_train[selected_columns])


trainK_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_kbest))


print('Training Root Mean Squared Error:' , trainK_rmse)

y_kbest = lm_kbest.predict(X_test[selected_columns])

testK_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_kbest))

print('Testing Root Mean Squared Error:' , testK_rmse)

Training Root Mean Squared Error: 187527.62787475027
Testing Root Mean Squared Error: 183226.91543698692


In [268]:
#REFECV
from sklearn.feature_selection import RFECV

In [269]:
ols = linear_model.LinearRegression()

In [270]:
# Create recursive feature eliminator that scores features by mean squared errors
selector = RFECV(estimator=ols, step=2, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

# Fit recursive feature eliminator 
selector.fit(X_train, y_train)


RFECV(cv=5, estimator=LinearRegression(), n_jobs=-1,
      scoring='neg_mean_squared_error', step=2, verbose=1)

In [271]:
selected_rfe = X_train.columns[selector.support_]
removed_rfe = X_train.columns[~selector.support_]

In [272]:
selected_rfe

Index(['niche_grade', 'school_grade', 'population', 'sqft_living', 'sqft_lot',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'yr_updated',
       'has_golden_ratio', 'Poor', 'bare_min', 'average', 'good', 'high_qua',
       'higher_qua', 'excellent_qua', 'mansion', 'water_sqft_lot', 'bedrooms',
       'bathrooms', 'floors', 'waterfront', 'condition', 'percent_bedbath',
       'lowest_g', 'low_g', 'dnmc', 'above_avg', 'No_Waterfront', 'Waterfront',
       'ratio_liv_lot', 'niche_grade^2', 'niche_grade yr_updated',
       'niche_grade ratio_liv_lot', 'yr_updated^2', 'yr_updated ratio_liv_lot',
       'ratio_liv_lot^2'],
      dtype='object')

In [273]:
#instantiate a linear regression object
lm_rfe = LinearRegression()

#fit the linear regression to the data
lm_rfe = lm_rfe.fit(X_train[selected_rfe], y_train)

y_rfe = lm_rfe.predict(X_train[selected_rfe])


trainRFE_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_rfe))


print('Training Root Mean Squared Error:' , trainRFE_rmse)

y_pred_rfe = lm_rfe.predict(X_test[selected_rfe])

testRFE_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rfe))

print('Testing Root Mean Squared Error:' , testRFE_rmse)



Training Root Mean Squared Error: 186217.86597560364
Testing Root Mean Squared Error: 182021.65584622513


### 6.2) After removing the features, re-run Step 5 and see if your new model performs better than the old model.

In [274]:
effect_feat = ['niche_grade', 'school_grade', 'population', 'sqft_living', 'sqft_lot',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'yr_updated',
       'has_golden_ratio', 'Poor', 'bare_min', 'average', 'good', 'high_qua',
       'higher_qua', 'excellent_qua', 'mansion', 'water_sqft_lot', 'bedrooms',
       'bathrooms', 'floors', 'waterfront', 'condition', 'percent_bedbath',
       'lowest_g', 'low_g', 'dnmc', 'above_avg', 'No_Waterfront', 'Waterfront',
       'ratio_liv_lot', 'niche_grade^2', 'niche_grade yr_updated',
       'niche_grade ratio_liv_lot', 'yr_updated^2', 'yr_updated ratio_liv_lot',
       'ratio_liv_lot^2']

In [275]:
model_test(scale_df, effect_feat, target)

Training:  186217 vs. Testing:  182021


## Step 7: Evaluate your different models in order to determine the best model overall.

## Step 8:  Refit your best model to the entire dataset.

## Step 9: Save your final model using pickle.

https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/