In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 
              'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 
              'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 
              'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
sales = pd.read_csv('kc_house_data.csv', dtype = dtype_dict)

### 1.  Set Up Feature_Matrix and Output 

In [4]:
def get_numpy_data(data_sframe, features, output):
    
    data_sframe['constant'] = 1 
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    
    features_matrix = data_sframe[features]
   
    output_array = data_sframe[output]
    
    return(features_matrix, output_array)

### 2. Use Feature Matrix & Weights dot product to Calculate Predictions

In [5]:
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

### 3. Derivatives of feature j = 2 * (feature  j, error)

In [6]:
def feature_derivative(errors, feature):
    derivative = 2*np.dot(feature, errors)
    return(derivative)

In [7]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = initial_weights
    coln = feature_matrix.columns.values
    while not converged:
       
        # compute the predictions based on feature_matrix and weights:
        predictions = predict_outcome(feature_matrix, weights)
        # compute the errors as predictions - output:
        errors = predictions - output
        
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
         
            derivative = feature_derivative(errors,feature_matrix[coln[i]])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares = gradient_sum_squares+derivative**2
            # update the weight based on step size and derivative:
            weights[i] = weights[i] - step_size*derivative
            
        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)


In [8]:
train_data = pd.read_csv('kc_house_train_data.csv',dtype = dtype_dict)

In [9]:
test_data = pd.read_csv('kc_house_test_data.csv', dtype = dtype_dict)

In [10]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [11]:
simple_weights = regression_gradient_descent(simple_feature_matrix, 
                                             output,initial_weights, step_size,tolerance)

In [12]:
simple_weights

array([-46999.88716555,    281.91211918])

In [13]:
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)

In [14]:
test_predictions = predict_outcome(test_simple_feature_matrix,simple_weights)

In [57]:
print(test_predictions[0] - test_output[0])
print(test_predictions[0])

46134.44325500238
356134.4432550024


In [42]:
test_RSS = (test_predictions-test_output)**2
sum(test_RSS)

275400044902128.78

In [34]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [35]:
weights = regression_gradient_descent(feature_matrix, 
                                             output,initial_weights, step_size,tolerance)

In [49]:
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

In [50]:
test_predictions1 = predict_outcome(test_feature_matrix,weights)

In [58]:
test_predictions1[0] - test_output[0]
test_predictions1[0]

366651.4116294939

In [52]:
test_RSS1 = (test_predictions1-test_output)**2
sum(test_RSS1)

270263443629803.3

In [54]:
sum(test_RSS1) > sum(test_RSS)

False

In [55]:
test_predictions1[0] - test_output[0] > test_predictions[0] - test_output[0]

True