In [1]:
# Column name -> Python type, passed as `dtype` to pd.read_csv so each column
# is parsed as the listed type.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [2]:
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [3]:
# Read the training CSV, parsing each column with the type listed in dtype_dict.
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)

In [4]:
# Read the test CSV with the same column types as the training data.
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [37]:
def get_numpy_data(data_sframe, features, output):
    """Build the feature matrix and output vector for a regression.

    Parameters
    ----------
    data_sframe : pandas.DataFrame
        Table holding the feature columns and the output column.
    features : sequence of str
        Names of the columns to use as features, in the order they should
        appear in the matrix.
    output : str
        Name of the column holding the target values.

    Returns
    -------
    (features_matrix, output_array)
        ``features_matrix`` is a 2-D NumPy array whose first column is the
        constant 1 (the intercept term) followed by one column per entry of
        ``features``. ``output_array`` holds the values of the ``output``
        column.

    The caller's frame is not modified: the 'constant' column is added to a
    private copy of the selected columns rather than to ``data_sframe``.
    """
    # Copy only the requested columns so inserting the intercept column does
    # not leak a 'constant' column into the frame the caller passed in.
    selected = data_sframe[list(features)].copy()
    selected.insert(0, 'constant', 1)

    features_matrix = selected.values
    output_array = data_sframe[output].values
    return features_matrix, output_array
# Smoke-test get_numpy_data on a single feature.
(example_features, example_output) = get_numpy_data(train_data, ['sqft_living'], 'price')
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(example_features[0, :])  # first row of the matrix; ':' selects all columns
print(example_output[0])

[  1.00000000e+00   1.18000000e+03]
221900.0


In [11]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model prediction for every row of ``feature_matrix``.

    The result is ``np.dot(feature_matrix, weights)``: each row's features
    multiplied by the matching weights and summed.
    """
    return np.dot(feature_matrix, weights)

In [12]:
# Quick check: 1*1 + 2*2 = 5 and 3*1 + 4*2 = 11.
predict_outcome([[1, 2], [3, 4]], [1, 2])

array([ 5, 11])

In [28]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    With ``errors = predictions - output`` this is the derivative of the
    residual sum of squares with respect to the weight attached to
    ``feature``.
    """
    return 2 * np.dot(errors, feature)

In [38]:
(example_features, example_output) = get_numpy_data(train_data, ['sqft_living'], 'price')
my_weights = np.array([0., 0.])  # all-zero weights make every prediction 0
test_predictions = predict_outcome(example_features, my_weights)
# NumPy arrays are subtracted element-wise with '-'.
errors = test_predictions - example_output  # with zero predictions this is just -example_output
feature = example_features[:, 0]  # the 'constant' column; ':' selects all rows
derivative = feature_derivative(errors, feature)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(derivative)
print(-np.sum(example_output) * 2)  # should equal the derivative printed above

-18752698920.0
-18752698920.0


In [39]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Every iteration computes ``errors = predictions - output``, forms the
    gradient of the residual sum of squares ``2 * feature_matrix.T . errors``,
    and moves all weights by ``-step_size * gradient``. The loop ends once the
    Euclidean norm of that gradient is below ``tolerance``; the step computed
    from that gradient is still applied before returning.

    Parameters
    ----------
    feature_matrix : array-like, shape (n_rows, n_weights)
        One row per observation, one column per weight.
    output : array-like, shape (n_rows,)
        Target values.
    initial_weights : array-like, shape (n_weights,)
        Starting weights. They are copied, never modified.
    step_size : float
        Multiplier applied to the gradient at each update.
    tolerance : float
        Stop once the gradient magnitude is strictly below this value.
    max_iterations : int or None, optional
        Upper bound on the number of updates. ``None`` (the default) means
        no bound.

    Returns
    -------
    numpy.ndarray
        The fitted weights as a float array.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes inf or NaN. That value can never
        satisfy the tolerance test, so the loop would otherwise run forever.
    """
    feature_matrix = np.asarray(feature_matrix, dtype=float)
    output = np.asarray(output, dtype=float)
    # Force a float copy: with integer starting weights the array would be
    # integer-typed and every fractional update would be truncated on
    # assignment, so the descent could stall and never converge.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while True:
        errors = np.dot(feature_matrix, weights) - output
        # All partial derivatives at once: entry i is 2 * dot(errors, column i).
        gradient = 2 * np.dot(feature_matrix.T, errors)
        weights -= step_size * gradient
        iterations += 1

        gradient_magnitude = np.sqrt(np.dot(gradient, gradient))
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                'gradient descent diverged (gradient magnitude is not finite); '
                'try a smaller step_size'
            )
        if gradient_magnitude < tolerance:
            break
        if max_iterations is not None and iterations >= max_iterations:
            break
    return weights


In [40]:
# let's test out the gradient descent
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [41]:
# Fit the one-feature model; the weights follow the matrix columns (constant, sqft_living).
test_weight = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(test_weight)

[-46999.88716555    281.91211918]


In [44]:
# Feature matrix and actual prices for the test set, using the same single feature.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)

In [46]:
# Predict test-set prices with the one-feature weights.
test_predictions = predict_outcome(test_simple_feature_matrix, test_weight)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(test_predictions)

[ 356134.443255    784640.86440132  435069.83662406 ...,  663418.65315598
  604217.10812919  240550.47439317]


In [47]:
# Test-set residuals of the one-feature model: predicted price minus actual price.
residue = test_predictions - test_data['price']

In [50]:
# Residual sum of squares on the test set (sum of squared residuals).
(residue ** 2).sum()

275400044902128.78

In [51]:
# Settings for the two-feature model.
step_size = 4e-12
tolerance = 1e9
initial_weights = np.array([-100000., 1., 1.])  # constant, sqft_living, sqft_living15

# Feature matrix and target taken from the training data.
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

In [52]:
# Fit the two-feature model; the weights follow the matrix columns
# (constant, sqft_living, sqft_living15).
weight_2 = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(weight_2)

[ -9.99999688e+04   2.45072603e+02   6.52795267e+01]


In [53]:
# Rebuild the test matrix for the two-feature model.
# NOTE: the 'test_simple_feature_matrix' name is reused here although the
# matrix now carries both entries of model_features.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)

In [54]:
# Predict test-set prices with the two-feature weights.
test_predictions = predict_outcome(test_simple_feature_matrix, weight_2)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(test_predictions)

[ 366651.41162949  762662.39850726  386312.09557541 ...,  682087.39916306
  585579.27901327  216559.20391786]


In [55]:
# Show the first few actual test-set prices.
test_data['price'].head()

0    310000.0
1    650000.0
2    233000.0
3    580500.0
4    535000.0
Name: price, dtype: float64

In [56]:
# Residual sum of squares of the two-feature model on the test set:
# predicted price minus actual price, squared and summed.
residue = test_predictions - test_data['price']
(residue ** 2).sum()

270263443629803.3