In [1]:
import turicreate as tc
import numpy as np

In [4]:
# Load the SFrame saved at "home_data.sframe" (path is relative to the working directory).
sales = tc.SFrame("home_data.sframe")

### numpy function to get the data in matrix form

In [5]:
def get_numpy_data(data_sframe, features, output):
    """Turn selected columns of a table into NumPy arrays for regression.

    Side effect: a column named 'constant', filled with 1, is written onto
    ``data_sframe`` itself, so the caller's table is modified.

    Parameters
    ----------
    data_sframe : table-like
        Must support column assignment, selection by a list of column names
        and by a single name, and ``to_numpy()`` on the selections.
    features : list of str
        Names of the feature columns. The list itself is not modified.
    output : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)``. The first column of
        ``feature_matrix`` is the constant column, followed by the columns
        named in ``features`` in the given order.
    """
    # The all-ones column lets the first weight act as the intercept.
    data_sframe['constant'] = 1
    column_names = ['constant'] + features
    feature_matrix = data_sframe[column_names].to_numpy()
    output_array = data_sframe[output].to_numpy()
    return feature_matrix, output_array

### function to predict output

In [19]:
def predict_output(feature_matrix, weights):
    """Return the predictions ``np.dot(feature_matrix, weights)``.

    Parameters
    ----------
    feature_matrix : array-like
        Feature values; one row per observation in this notebook.
    weights : array-like
        Regression weights, one per feature column.

    Returns
    -------
    The dot product of ``feature_matrix`` and ``weights``.
    """
    return np.dot(feature_matrix, weights)

### the feature derivative function

In [20]:
def feature_derivative(error, feature):
    """Return twice the dot product of ``error`` and ``feature``.

    With ``error = predictions - output`` this is the partial derivative of
    the residual sum of squares with respect to the weight of ``feature``.
    """
    inner_product = np.dot(error, feature)
    return 2 * inner_product

### gradient descent for regression

In [21]:
def GDreg(feature_matrix, output, initial_wts, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by gradient descent on the RSS.

    Each iteration computes the prediction error once, then for every weight
    computes its derivative via ``feature_derivative`` and steps the weight
    against it. Iteration stops once the gradient magnitude computed in that
    iteration is below ``tolerance``; the weights returned include that
    iteration's update.

    Parameters
    ----------
    feature_matrix : 2-D numpy array
        One column per weight (indexed as ``feature_matrix[:, i]``).
    output : array-like
        Observed target values, subtracted from the predictions.
    initial_wts : array-like
        Starting weights. They are copied into a float array, so the caller's
        object is never modified and integer inputs are handled correctly.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of iterations. ``None`` (the default) keeps
        the original behaviour of looping until convergence, which never
        terminates if the descent diverges.

    Returns
    -------
    numpy array of float
        The fitted weights.
    """
    # Force a float working copy: with an integer array every in-place update
    # below would be truncated on assignment, so the weights could never move.
    wts = np.array(initial_wts, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        predictions = predict_output(feature_matrix, wts)
        # The error is computed once per iteration, so all weights are updated
        # from the same gradient (a simultaneous update).
        error = predictions - output
        gradient_sum_squares = 0
        for i in range(len(wts)):
            derivative = feature_derivative(error, feature_matrix[:, i])
            gradient_sum_squares += derivative ** 2
            wts[i] = wts[i] - step_size * derivative
        iterations += 1
        gradient_magnitude = gradient_sum_squares ** 0.5
        if gradient_magnitude < tolerance:
            break
    return wts

### running gradient descent as simple regression

In [22]:
# Randomly split `sales` into a training part (fraction 0.8) and a test part;
# the fixed seed keeps the split the same from run to run.
train_data,test_data = sales.random_split(.8,seed=0)

In [23]:
# Simple regression: a single feature plus the constant column that
# get_numpy_data prepends.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(
    train_data,
    simple_features,
    my_output,
)
# Starting weights, ordered [constant, sqft_living].
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

In [26]:
# Fit the simple model on the training data and display the learned weights.
simple_wts = GDreg(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)
simple_wts

array([-46999.88716555,    281.91211912])

In [28]:
# Build the test-set matrices with the same feature list, then predict with
# the weights learned on the training data.
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data,
    simple_features,
    my_output,
)
predictions_test_data = predict_output(
    test_simple_feature_matrix,
    simple_wts,
)
predictions_test_data

array([356134.44317093, 784640.86422788, 435069.83652353, ...,
       663418.65300782, 604217.10799338, 240550.4743332 ])

In [31]:
# Residual sum of squares of the simple model on the test data: the dot
# product of the residual vector with itself.
rss_simple = np.dot(
    predictions_test_data - test_output,
    predictions_test_data - test_output,
)
rss_simple

275400047593155.94

### running a multiple regression

In [32]:
# Multiple regression with two features.
# sqft_living15 is the average square feet for the nearest 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(
    train_data,
    model_features,
    my_output,
)
# Starting weights, ordered [constant, sqft_living, sqft_living15].
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

In [33]:
# Fit the two-feature model on the training data and display the learned weights.
multiple_wts = GDreg(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)
multiple_wts

array([-9.99999688e+04,  2.45072603e+02,  6.52795277e+01])

In [34]:
# Build the test-set matrices for the two-feature model, then predict with
# the weights learned on the training data.
test_multiple_feature_matrix, test_multiple_output = get_numpy_data(
    test_data,
    model_features,
    my_output,
)
predictions_multiple_test_data = predict_output(
    test_multiple_feature_matrix,
    multiple_wts,
)
predictions_multiple_test_data

array([366651.41203656, 762662.39786164, 386312.09499712, ...,
       682087.39928241, 585579.27865729, 216559.20396617])

In [35]:
# Actual 'price' value of the first row of the test set, for comparison with
# the first entry of the predictions.
test_data['price'][0]

310000.0

In [36]:
# Residual sum of squares of the two-feature model on the test data: the dot
# product of the residual vector with itself.
rss_multiple = np.dot(
    predictions_multiple_test_data - test_multiple_output,
    predictions_multiple_test_data - test_multiple_output,
)
rss_multiple

270263446465244.1

In [38]:
# Show the smaller of the two test-set RSS values (simple vs. two-feature model).
min(rss_simple,rss_multiple)

270263446465244.1