# Import Dependencies and Libraries

In [21]:
import graphlab as gl
import matplotlib.pyplot as plt
%matplotlib inline

# Read housing data into an SFrame

In [3]:
houseData = gl.SFrame('home_data.gl/')
houseData

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900,3,1.0,1180,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000,3,2.25,2570,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000,2,1.0,770,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000,4,3.0,1960,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000,3,2.0,1680,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000,4,4.5,5420,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500,3,2.25,1715,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850,3,1.5,1060,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500,3,1.0,1780,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000,3,2.5,1890,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


# 1. Average house price for zip code 98039.

In [4]:
average98039 = houseData['price'][houseData['zipcode']=='98039'].mean()
average98039

2160606.6000000006

# 2. Get houses with sqft_living > 2,000 and <= 4,000

In [5]:
housesBetween2000and4000 = houseData[(houseData['sqft_living']>2000) & (houseData['sqft_living']<=4000)]
float(housesBetween2000and4000.shape[0])/float(houseData.shape[0])

0.42187572294452413

# 3. Create regression model with more features

## a. Split training and test data

In [6]:
trainingData, testData = houseData.random_split(0.8,seed=0)

## b. Define features list

In [7]:
allFeatures = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode','condition',
    'grade','waterfront','view','sqft_above','sqft_basement','yr_built','yr_renovated',
    'lat','long','sqft_living15','sqft_lot15'
]
myFeatures = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

## c. Create regression model from training data and features lists

In [8]:
fullLinearModel = gl.linear_regression.create(trainingData, target='price',features=allFeatures)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [9]:
simpleLinearModel = gl.linear_regression.create(trainingData, target='price',features=myFeatures)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



## d. Evaluate max and rmse error

In [17]:
simpleRMSE = simpleLinearModel.evaluate(testData)['rmse']

In [11]:
fullRMSE = fullLinearModel.evaluate(testData)['rmse']

In [18]:
abs(fullRMSE - simpleRMSE)

22410.443300897372

# Extra stuff

## a. Predict house prices

In [None]:
h1 = houseData[houseData['waterfront']==1]
h1

In [None]:
fullLinearModel.predict(h1)

In [None]:
error = abs(fullLinearModel.predict(h1) - h1['price'])

In [None]:
error.mean()

## b. Get average error in %

In [13]:
fullFeaturePredictions = fullLinearModel.predict(testData)

In [14]:
errorPercentage = abs(fullFeaturePredictions-testData['price'])/testData['price']

In [15]:
errorPercentage.mean()

0.19232143578535033

In [16]:
gl.canvas.set_target('ipynb')
houseData.show(view='Scatter Plot',x='sqft_living',y='price')