Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

Import Train and Test Data

In [5]:
# Load the training and test splits from CSV files (paths are relative to
# the notebook's working directory), then preview the first rows of each.
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')
print(train_data.head())
print(test_data.head())

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view     ...      grade  sqft_above  \
0      5650     1.0           0     0     ...          7        1180   
1      7242     2.0           0     0     ...          7        2170   
2     10000     1.0           0     0     ...          6         770   
3      5000     1.0           0     0     ...          7        1050   
4      8080     1.0           0     0     ...          8        1680   

   sqft_basement  yr_built  yr_renovated  zipcode      lat     long  \
0              0      1955 

Function to create required feature matrices

In [16]:
def get_numpy_data(data, features, output):
    """Build the feature matrix and output vector for a regression model.

    A column of ones named 'constant' is placed first in the feature matrix
    so that the first weight acts as the intercept term.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the output column. It is not
        modified: the constant column is added to a copy.
    features : list of str
        Names of the feature columns, in the order they should follow the
        constant column.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.ndarray
        ``features_matrix`` has one row per row of ``data`` and the columns
        ``['constant'] + features``; ``output_array`` holds ``data[output]``.
    """
    # assign() returns a new DataFrame, so the caller's frame does not gain a
    # 'constant' column as a side effect of calling this function.
    data_with_constant = data.assign(constant=1)

    # Prepend 'constant' so the intercept weight is always at index 0.
    feature_columns = ['constant'] + list(features)

    features_matrix = np.array(data_with_constant.loc[:, feature_columns])
    output_array = np.array(data.loc[:, output])
    return (features_matrix, output_array)


Derivative Function

In [56]:
def feature_derivative(errors, feature):
    """Return twice the dot product of a feature column with the errors.

    When ``errors`` is ``predictions - output``, this is the partial
    derivative of the residual sum of squares with respect to the weight
    attached to ``feature``.
    """
    return 2 * np.dot(feature, errors)

Prediction

In [19]:
def predict_outcome(feature_matrix, weights):
    """Return the model's predictions: the product of features and weights.

    Each row of ``feature_matrix`` is combined with ``weights`` via a dot
    product, giving one predicted value per row.
    """
    return np.dot(feature_matrix, weights)

Gradient Descent

In [83]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    On every pass the prediction errors are computed once from the current
    weights, then each weight is moved against its own partial derivative.
    Iteration stops when the gradient magnitude falls below ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array with one row per observation and one column per weight.
    output : numpy.ndarray
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : sequence of numbers
        Starting weights, one per column of ``feature_matrix``. The input is
        copied, never modified.
    step_size : float
        Multiplier applied to each derivative when updating a weight.
    tolerance : float
        Convergence threshold on the gradient magnitude.
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (the default) keeps the
        original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray
        The weights after the final update.

    Raises
    ------
    FloatingPointError
        If the weights become NaN or infinite (for example because
        ``step_size`` is too large); without this check the loop would never
        terminate.
    """
    # Force a float copy: with integer starting weights np.array() would build
    # an integer array and every in-place update below would be truncated.
    weights = np.array(initial_weights, dtype=float)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # Errors are computed once per pass, so every weight in this pass is
        # updated from the same predictions.
        prediction = predict_outcome(feature_matrix, weights)
        error = prediction - output

        gradient_sum_squares = 0
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column paired with weights[i].
            derivative = feature_derivative(error, feature_matrix[:, i])

            # Accumulate the squared derivative for the gradient magnitude.
            gradient_sum_squares = gradient_sum_squares + (derivative ** 2)

            # Step this weight against its derivative.
            weights[i] = weights[i] - (step_size * derivative)

        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            break

        # Once a weight is NaN/inf every later gradient is NaN/inf as well, so
        # the convergence test can never succeed; fail loudly instead.
        if not np.all(np.isfinite(weights)):
            raise FloatingPointError(
                "gradient descent diverged: weights are no longer finite; "
                "try a smaller step_size")

        iterations += 1
    return weights


Execution

In [174]:
### Model 1 parameters: a single-feature model (sqft_living -> price)

features_cols = ['sqft_living']
output_cols = 'price'
# One starting weight per feature-matrix column: [constant, sqft_living]
initial_weights = [-47000., 1.]
step_size = 7e-12   # multiplier applied to each derivative in a weight update
tolerance = 2.5e7   # stop once the gradient magnitude drops below this value

In [172]:
# Build the Model 1 training inputs: a feature matrix whose first column is
# the constant term followed by sqft_living, and the matching price vector.
(feature_matrix, output) = get_numpy_data(train_data, features_cols, output_cols)

print(feature_matrix)
print(output)

[[   1 1180]
 [   1 2570]
 [   1  770]
 ..., 
 [   1 1530]
 [   1 1600]
 [   1 1020]]
[ 221900.  538000.  180000. ...,  360000.  400000.  325000.]


Qn 1

In [175]:
# Fit Model 1 by gradient descent on the training data.
# The printed array is ordered [constant weight, sqft_living weight].
simple_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

print(simple_weights)

[-46999.88716555    281.91211918]


Qn 2

In [176]:
# Build the same Model 1 feature matrix and price vector from the test data.
(test_feature_matrix, test_output) = get_numpy_data(test_data, features_cols, output_cols)

print(test_feature_matrix)
print(test_output)

[[   1 1430]
 [   1 2950]
 [   1 1710]
 ..., 
 [   1 2520]
 [   1 2310]
 [   1 1020]]
[ 310000.  650000.  233000. ...,  610685.  400000.  402101.]


In [179]:
# Predict test-set prices with the fitted Model 1 weights.
test_prediction = predict_outcome(test_feature_matrix, simple_weights)
test_prediction

array([ 356134.443255  ,  784640.86440132,  435069.83662406, ...,
        663418.65315598,  604217.10812919,  240550.47439317])

In [180]:
print(test_prediction[0])   # Model 1 predicted price of the first test house

356134.443255


In [202]:
print(test_output[0])       # actual price of the first test house

310000.0


Model 1 - RSS

In [190]:
# Residual sum of squares of Model 1 on the test set, divided by 10**14 so
# the displayed value is in units of 1e14.
np.sum((test_prediction - test_output)**2)/10**14

2.7540004490212833

Qn 3

In [193]:
### Model 2 parameters: a two-feature model (sqft_living, sqft_living15 -> price)
### NOTE: initial_weights, step_size and tolerance are rebound here, replacing
### the Model 1 values set earlier in the notebook.

model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
# One starting weight per feature-matrix column: [constant, sqft_living, sqft_living15]
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12   # multiplier applied to each derivative in a weight update
tolerance = 1e9     # stop once the gradient magnitude drops below this value

In [192]:
# Build the Model 2 training inputs (constant, sqft_living, sqft_living15).
# NOTE: feature_matrix and output are rebound, replacing the Model 1 arrays.
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)

print(feature_matrix)
print(output)

[[   1 1180 1340]
 [   1 2570 1690]
 [   1  770 2720]
 ..., 
 [   1 1530 1530]
 [   1 1600 1410]
 [   1 1020 1020]]
[ 221900.  538000.  180000. ...,  360000.  400000.  325000.]


In [194]:
# Fit Model 2 by gradient descent on the training data.
# NOTE: simple_weights is rebound here, so the Model 1 weights computed
# earlier are no longer available under this name.
simple_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

print(simple_weights)

[ -9.99999688e+04   2.45072603e+02   6.52795267e+01]


In [198]:
# Build the Model 2 feature matrix and price vector from the test data.
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)

print(test_feature_matrix)
print(test_output)

[[   1 1430 1780]
 [   1 2950 2140]
 [   1 1710 1030]
 ..., 
 [   1 2520 2520]
 [   1 2310 1830]
 [   1 1020 1020]]
[ 310000.  650000.  233000. ...,  610685.  400000.  402101.]


In [199]:
# Predict test-set prices with the fitted Model 2 weights
# (test_prediction is rebound, replacing the Model 1 predictions).
test_prediction = predict_outcome(test_feature_matrix, simple_weights)
test_prediction

array([ 366651.41162949,  762662.39850726,  386312.09557541, ...,
        682087.39916306,  585579.27901327,  216559.20391786])

In [200]:
print(test_prediction[0])   # Model 2 predicted price of the first test house

366651.411629


In [202]:
print(test_output[0])       # actual price of the first test house

310000.0


Model 2 - RSS

In [201]:
# Residual sum of squares of Model 2 on the test set, divided by 10**14 so
# the displayed value is in units of 1e14.
np.sum((test_prediction - test_output)**2)/10**14

2.7026344362980357

Qn 4 - Model1

Qn 5 - Model2