In [6]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline

In [3]:
train_data=pd.read_csv('kc_house_train_data.csv')
test_data=pd.read_csv('kc_house_test_data.csv')

In [13]:
test_data.columns

Index([u'id', u'date', u'price', u'bedrooms', u'bathrooms', u'sqft_living',
       u'sqft_lot', u'floors', u'waterfront', u'view', u'condition', u'grade',
       u'sqft_above', u'sqft_basement', u'yr_built', u'yr_renovated',
       u'zipcode', u'lat', u'long', u'sqft_living15', u'sqft_lot15'],
      dtype='object')

In [16]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’

    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = np.array(data_sframe[features])
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’

    # this will convert the SArray into a numpy array:
    output_array = np.array(data_sframe[output]) # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [22]:
x,y=get_numpy_data(test_data,['sqft_living','bedrooms'],['price'])

In [23]:
def predict_outcome(feature_matrix, weights):
    predictions=np.dot(feature_matrix, weights)
    return(predictions)


In [27]:
my_weights = np.array([1., 1.,1.]) # the example weights
pred_example=predict_outcome(x,my_weights)
print pred_example

[ 1434.  2955.  1714. ...,  2525.  2315.  1023.]


In [39]:
def feature_derivative(errors, feature):
    derivative=2*np.dot(errors,feature)
    return(derivative)

In [40]:
from math import sqrt

In [48]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):

    converged = False #feature_matrix, output np array
    weights = np.array(initial_weights) # make sure it's a numpy array
    while not converged:

        # compute the predictions based on feature_matrix and weights H and w
        predictions=predict_outcome(feature_matrix, weights)
       
        # compute the errors as predictions - output
        errors=output-predictions

        gradient_sum_squares = 0 # initialize the gradient sum of squares

        # while we haven't reached the tolerance yet, update each feature's weight

        for i in range(len(weights)): # loop over each weight
             # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            der_w=feature_derivative(errors, feature_matrix[:,i])
            # add the squared value of the derivative to the gradient sum of squares (for assessing convergence)
            gradient_sum_squares=gradient_sum_squares+(der_w**2)
            # subtract the step size times the derivative from the current weight
            weights[i]=weights[i]+step_size*der_w
        # compute the square-root of the gradient sum of squares to get the gradient matnigude:
        gradient_magnitude = sqrt(gradient_sum_squares)

        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [49]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [50]:
real_weigths=regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
print real_weigths #weights

[-46999.88716555    281.91211918]


In [51]:
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

In [53]:
test_predictions_1 = predict_outcome(test_simple_feature_matrix,np.array(real_weigths))

In [55]:
print test_predictions_1[0] #predictiions value for first house

answer 356134.443255


In [66]:
rss=((test_output-test_predictions_1)**2).sum()
print rss

2.75400044902e+14


In [57]:
model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [60]:
weight_t2=regression_gradient_descent(feature_matrix, output,initial_weights ,step_size, tolerance)
print weight_t2

[ -9.99999688e+04   2.45072603e+02   6.52795267e+01]


In [61]:
(test_simple_feature_matrix_2, test_output_2) = get_numpy_data(test_data, model_features, my_output)
test_predictions_2 = predict_outcome(test_simple_feature_matrix_2,np.array(weight_t2))

In [64]:
print test_predictions_2[0],test_predictions_1[0],test_output[0]

 366651.411629 356134.443255 310000.0


In [67]:
rss2=((test_output-test_predictions_2)**2).sum()
print rss2

2.7026344363e+14


In [72]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(train_data[['sqft_living','sqft_living15']],train_data[['price']])
predictions=clf.predict(test_data[['sqft_living','sqft_living15']])
error=test_data['price'].values-predictions
rss=(error**2).sum()
print 'rss',rss
print predictions[0]

rss 3.45504260882e+18
[ 366541.10816718]
