In [1]:
# Implementing the Gradient Descent Algorithm

In [2]:
# Import Graphlab 

In [3]:
import graphlab

In [4]:
# We are using the house data sales and our target is the price
# Download the data

In [5]:
sales = graphlab.SFrame('kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to emechebe@ohsu.edu and will expire on June 11, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1471492095.log


In [6]:
# We are going to be doing matrix manipulation
# Python Numpy makes that easy
# So lets import that too

In [7]:
import numpy as np

In [8]:
# Lets remind ourselves the architecture of our data set

In [9]:
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [10]:
# Now we want to select some features from this data and a target that we want to predict
# So we want to write a function that takes an sframe as the dataset,
# takes a list of features and the target we want to predict. In this case, the price
# This function should return a numpy array of features and another array that contains price

In [11]:
# The NumPy array for the features should contain just 1's in its first column; these multiply the intercept weight w_0
# The other columns hold the measurements of the selected features, one column per feature
# The other NumPy array should be a 1-dimensional array that contains the target
# In our case, it is the price.
# We call this function get_numpy_data

In [12]:
def get_numpy_data(data_sframe, features,output):
    """Turn a tabular data set into a feature matrix and a target array.

    Parameters
    ----------
    data_sframe : the data set to read from. NOTE: a column named 'constant'
        (all 1's) is written into this object as a side effect.
    features : list of column names to use as features. The list itself is
        not modified.
    output : name (string) of the column to predict.

    Returns
    -------
    (features_matrix, output_array) : tuple of two NumPy arrays. The first has
        the 'constant' column first, followed by the requested features in the
        order given; the second holds the values of the `output` column.
    """
    # Intercept term: a column of 1's so the first weight acts as w_0.
    data_sframe['constant'] = 1
    # Build a new list so the caller's `features` list is left untouched.
    selected_columns = ['constant'] + features
    design_matrix = data_sframe[selected_columns].to_numpy()
    target_values = data_sframe[output].to_numpy()
    return (design_matrix, target_values)

In [13]:
# So the function takes three arguments:
# data_sframe: In our case this is the sales data that we downloaded
# features: Okay we need to make this. To do this we need to agree on the features we want
# output: This is the target that we need to predict

In [16]:
# The features we want to use are sqft_living and bedrooms

In [15]:
features = (['sqft_living','bedrooms'])

In [17]:
features

['sqft_living', 'bedrooms']

In [18]:
# Now lets take the function for a ride

In [19]:
get_numpy_data(sales,features,'price')

(array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00],
        ..., 
        [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00],
        [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00]]),
 array([ 221900.,  538000.,  180000., ...,  402101.,  400000.,  325000.]))

In [20]:
# Now we have the feature matrix that we want to base our predictions on (in our case we are predicting prices). 
# What we need to predict price is the different weights of the features
# Take those weights and multiply them by the appropriate feature measurements of each instance in the data
# Then take the sum of the products for each instance. That is the predicted price
# We dont have the weights yet(ie regression coefficients). Lets assume that we do for now
# We are going to write a function that takes the feature matrix and the regression coefficients and do the computation
# that was specified above i.e the sumproduct of each instance in the feature matrix
# Numpy can do this easily with the function called dot product

In [21]:
def predict_outcome (feature_matrix, weights):
    """Return the predictions np.dot(feature_matrix, weights).

    With a 2-D feature matrix (one row per observation) and a 1-D weight
    array, this is the sum of weight * measurement for each row.
    """
    return np.dot(feature_matrix, weights)

In [23]:
# Now let's get the derivatives.
# The derivative of the RSS with respect to one weight uses the errors (Prediction - Actual)
# and the measurements of the corresponding feature.
# For each observation, multiply its error by its value of that feature,
# then sum these products over all observations; np.dot does exactly this sum of products.
# Finally multiply by 2. The result is a single number for that feature;
# repeating this for every feature gives the gradient as a 1-D array.
# derivative_i = 2 * SUM[ error * feature_i ]  (this is what is implemented below)

In [24]:
def feature_derivative (errors, feature):
    """Return 2 * np.dot(errors, feature).

    `errors` holds (prediction - actual) per observation and `feature` holds
    that feature's value per observation, so the result is the derivative of
    the residual sum of squares with respect to that feature's weight.
    """
    sum_of_products = np.dot(errors, feature)
    return 2 * sum_of_products

In [25]:
# Now we can use the predict_outcome as well as the feature derivative to implement the gradient descent function

In [26]:
def regression_gradient_descent(feature_matrix,output,initial_weights,step_size,tolerance):
    """Fit regression weights by gradient descent on the residual sum of squares.

    Parameters
    ----------
    feature_matrix : 2-D NumPy array, one row per observation and one column
        per weight (column i is read as feature_matrix[:, i]).
    output : 1-D array of actual target values, one per observation.
    initial_weights : starting weights, one per column of feature_matrix.
        Copied, so the caller's object is never modified.
    step_size : factor applied to each derivative when updating a weight.
    tolerance : the loop stops once the gradient magnitude falls below this.

    Returns
    -------
    1-D float NumPy array of fitted weights.

    Notes
    -----
    There is no iteration cap: if the gradient magnitude never drops below
    `tolerance` (e.g. step_size too large), the loop does not terminate.
    """
    converged = False
    # Force a float copy: integer initial weights would otherwise produce an
    # integer array and every in-place update below would be truncated.
    weights = np.array(initial_weights, dtype=float)
    while not converged:
        # Use the feature_matrix *argument*, not a notebook-level global.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output
        gradient_sum_squares = 0  # accumulates the squared derivative of every weight

        for i in range(len(weights)):
            # errors was computed before this loop, so updating weights[i]
            # in place does not affect the derivatives of the later weights.
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares = gradient_sum_squares + (derivative * derivative)
            weights[i] = weights[i] - (step_size * derivative)

        # np.sqrt avoids relying on `from math import sqrt` having been run
        # in some other cell before this function is called.
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True

    return weights

In [28]:
# Ok let's get the feature_matrix. Remember this holds the measurements of all the features you have decided to use
# Also get the output array, which is the target; in our case this is the price
# Remember we wrote a function that does exactly that, called get_numpy_data

In [29]:
# get_numpy_data needs a data set. We wont use the entire data set. Lets split the dataset into 2 :train and test

In [30]:
train_data,test_data = sales.random_split(.8,seed=0)

In [31]:
# The next argument is the target, in our case is the price

In [36]:
output ='price'

In [37]:
# The last argument will be the features: In this example, we use just one feature, sqft_living

In [38]:
features = ['sqft_living']

In [39]:
features_matrix, output = get_numpy_data(train_data,features,output)

In [40]:
# So now we have the features_matrix and the output

In [41]:
features_matrix

array([[  1.00000000e+00,   1.18000000e+03],
       [  1.00000000e+00,   2.57000000e+03],
       [  1.00000000e+00,   7.70000000e+02],
       ..., 
       [  1.00000000e+00,   1.53000000e+03],
       [  1.00000000e+00,   1.60000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [42]:
output

array([ 221900.,  538000.,  180000., ...,  360000.,  400000.,  325000.])

In [43]:
# Finally, we need two parameters, the step size and the tolerance

In [44]:
step_size = 7e-12

In [45]:
tolerance = 2.5e7

In [46]:
# One of our functions uses the sqrt function, which is in the math library. So import that

In [47]:
from math import sqrt

In [48]:
# Now lets take our gradient descent algorithm for a ride. This should return the weights of the features

In [49]:
simple_weights=regression_gradient_descent(features_matrix,output,initial_weights,step_size,tolerance)

NameError: name 'initial_weights' is not defined

In [50]:
# Oh, we need to initialize our weights. 
# This is the weight that we should get the derivative from and test if the derivative are close to the tolerance

In [51]:
initial_weights = [-47000., 1.]

In [52]:
# Now lets run it again

In [53]:
simple_weights=regression_gradient_descent(features_matrix,output,initial_weights,step_size,tolerance)

In [54]:
simple_weights

array([-46999.88716555,    281.91211912])

In [55]:
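# Now let's run the gradient descent algorithm on the test data
# NOTE: this re-fits the weights on the test set itself; to measure how well the model generalizes, reuse the weights learned from train_data instead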

In [58]:
features

['sqft_living']

In [59]:
output = 'price'

In [60]:
features_matrix, output_array = get_numpy_data(test_data, features,output)

In [61]:
simple_weights=regression_gradient_descent(features_matrix,output_array,initial_weights,step_size,tolerance)

In [62]:
simple_weights

array([-46999.87880043,    282.35945337])

In [63]:
# Now we have the weights of the features. We can use those weights to predict the target
# To do that, we multiply the weights we just calculated by the measurements in the feature matrix and sum the products for each row
# That sum is the predicted value for that house
# Remember we already wrote a function called predict_outcome that does exactly that
# This function takes the feature_matrix and the weights we got from the gradient descent algorithm

In [64]:
PredictedHousesbasedonModel=predict_outcome (features_matrix, simple_weights)

In [65]:
# Ok we have the predictions. We can look at the predicted values of the second and sixth house (indices 1 and 5)

In [66]:
PredictedHousesbasedonModel[1]

785960.50864093238

In [67]:
PredictedHousesbasedonModel[5]

692781.88902884768

In [68]:
# So this is the Predictions

In [69]:
PredictedHousesbasedonModel

array([ 356774.1395186 ,  785960.50864093,  435834.78646219, ...,
        664545.94369185,  605250.45848416,  241006.76363692])

In [70]:
# This is the actual prices

In [71]:
output_array

array([ 310000.,  650000.,  233000., ...,  610685.,  400000.,  402101.])

In [72]:
# We can ask if there are differences. Of course there are

In [73]:
ErrorsInModel = PredictedHousesbasedonModel-output_array 

In [74]:
ErrorsInModel

array([  46774.1395186 ,  135960.50864093,  202834.78646219, ...,
         53860.94369185,  205250.45848416, -161094.23636308])

In [75]:
# These are the errors

In [78]:
# To compute the RSS (residual sum of squares), square each difference (Predicted - Actual)
# and then sum them all

In [79]:
RSS = ErrorsInModel ** 2

In [80]:
RSSr = sum(RSS)

In [81]:
RSSr

275395693978314.88

In [82]:
# Cool we just used the sqft_living to predict the prices

In [83]:
# Now instead of using one feature, lets use more than one feature to predict the price

In [84]:
# Lets use 2 features

In [85]:
# Column names must match the data set header exactly: the column is 'sqft_living15' (no underscore before 15).
model_features = ['sqft_living','sqft_living15']

In [86]:
output ='price'

In [88]:
initial_weights = [-100000, 1, 1] 

In [90]:
model_features = ['sqft_living','sqft_living15']

In [91]:
get_numpy_data(train_data, model_features,output)

(array([[  1.00000000e+00,   1.18000000e+03,   1.34000000e+03],
        [  1.00000000e+00,   2.57000000e+03,   1.69000000e+03],
        [  1.00000000e+00,   7.70000000e+02,   2.72000000e+03],
        ..., 
        [  1.00000000e+00,   1.53000000e+03,   1.53000000e+03],
        [  1.00000000e+00,   1.60000000e+03,   1.41000000e+03],
        [  1.00000000e+00,   1.02000000e+03,   1.02000000e+03]]),
 array([ 221900.,  538000.,  180000., ...,  360000.,  400000.,  325000.]))

In [97]:
features_matrix, output_array = get_numpy_data(train_data, model_features,output)

In [98]:
initial_weights = np.array([-100000., 1., 1.])

In [99]:
step_size = 4e-12

In [100]:
tolerance = 1e9

In [101]:
simple_weights=regression_gradient_descent(features_matrix,output_array,initial_weights,step_size,tolerance)

In [102]:
simple_weights

array([ -9.99999688e+04,   2.45072603e+02,   6.52795277e+01])

In [103]:
PredictedHousesbasedonModel=predict_outcome (features_matrix, simple_weights)

In [104]:
PredictedHousesbasedonModel

array([ 276660.26922685,  640159.02144474,  266266.25036689, ...,
        374838.79037369,  384160.32923145,  216559.20396617])

In [105]:
# Now do this for test data

In [106]:
# Pass the target column name literally: `my_output` is not defined anywhere in this notebook,
# and `output` is rebound to an array by this very assignment, so neither is safe on a re-run.
features_matrix, output = get_numpy_data(test_data, model_features, 'price')

In [107]:
# Use the weights that we got from our train data to now predict the test data

In [108]:
PredictedHousesbasedonModel=predict_outcome (features_matrix, simple_weights)

In [109]:
# Computing the RSS for the test data
# RSS = sum of all the squared differences

In [110]:
RSS = sum((PredictedHousesbasedonModel - output) ** 2)

In [111]:
RSS

270263446465243.97