Importing Libraries

In [1]:
import pandas as pd
import numpy as np

Load Data

In [2]:
# Column name -> dtype mapping handed to pd.read_csv for the house-sales CSV files.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Read the train and test splits, applying the dtype mapping to both files
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

# Preview the first rows of each split
print(train_data.head())
print(test_data.head())

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0       3.0       1.00       1180.0   
1  6414100192  20141209T000000  538000.0       3.0       2.25       2570.0   
2  5631500400  20150225T000000  180000.0       2.0       1.00        770.0   
3  2487200875  20141209T000000  604000.0       4.0       3.00       1960.0   
4  1954400510  20150218T000000  510000.0       3.0       2.00       1680.0   

   sqft_lot floors  waterfront  view     ...      grade  sqft_above  \
0      5650      1           0     0     ...          7        1180   
1      7242      2           0     0     ...          7        2170   
2     10000      1           0     0     ...          6         770   
3      5000      1           0     0     ...          7        1050   
4      8080      1           0     0     ...          8        1680   

   sqft_basement  yr_built  yr_renovated  zipcode      lat     long  \
0              0      1955       

Build Feature Matrices

In [4]:
def get_numpy_data(data_frame, features, output):
    """Build the feature matrix and target array for a regression model.

    A column named 'constant' filled with 1 is placed first in the feature
    matrix, ahead of the requested feature columns.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source table. It is left unmodified.
    features : sequence of str
        Names of the columns to use as features, in the order they should
        appear after the constant column.
    output : str
        Name of the target column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``features_matrix`` with one row per row of ``data_frame`` and
        ``1 + len(features)`` columns, and ``output_array`` with the values
        of the ``output`` column.
    """
    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'constant' column as a side effect of this call.
    with_constant = data_frame.assign(constant=1)

    # list() lets callers pass a tuple (or any sequence) of column names
    columns = ['constant'] + list(features)

    # select the requested columns and convert them to a numpy matrix
    features_matrix = np.array(with_constant.loc[:, columns])

    # convert the target Series into a numpy array
    output_array = np.array(with_constant[output])

    return (features_matrix, output_array)

Predict Function

In [5]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions for every row of a feature matrix.

    Parameters
    ----------
    feature_matrix : array-like
        One row per observation, one column per feature.
    weights : array-like
        One weight per column of ``feature_matrix``.

    Returns
    -------
    The dot product of ``feature_matrix`` and ``weights``.
    """
    return np.dot(feature_matrix, weights)

Derivative Function

In [16]:
def feature_derivative_ridge(errors, feature, weight, l2_penalty, feature_is_constant):
    """Derivative of the ridge cost with respect to a single weight.

    Parameters
    ----------
    errors : numpy.ndarray
        Prediction errors (predictions minus observed output).
    feature : numpy.ndarray
        Values of the feature column that ``weight`` multiplies.
    weight : float
        Current value of the weight.
    l2_penalty : float
        L2 regularisation strength.
    feature_is_constant : bool
        When true, the L2 term is left out of the derivative.

    Returns
    -------
    ``2 * sum(errors * feature)``, plus ``2 * l2_penalty * weight`` unless
    ``feature_is_constant`` is true.
    """
    rss_term = 2*np.sum(errors * feature)

    # the constant feature's weight is not regularised
    if feature_is_constant:
        return rss_term

    return rss_term + 2*l2_penalty*weight

Testing Derivative Function

In [7]:
# Fixture for checking the derivative helper: constant column + sqft_living,
# with hand-picked weights.
example_features, example_output = get_numpy_data(train_data, ['sqft_living'], 'price')
my_weights = np.array([1., 10.])
test_predictions = predict_outcome(example_features, my_weights)
errors = test_predictions - example_output  # prediction errors

In [17]:
# Both prints should show the same value: the helper's result, then the same
# formula written out by hand (20. is 2 * l2_penalty * weight = 2 * 1 * 10.).
print(feature_derivative_ridge(errors, example_features[:, 1], my_weights[1], 1, False))
print(np.sum(errors * example_features[:, 1]) * 2 + 20.)
print('')

-45532892335518.0
-45532892335518.0



In [18]:
# Both prints should show the same value: for the constant column the helper
# leaves out the L2 term, so the derivative is just twice the sum of errors.
print(feature_derivative_ridge(errors, example_features[:, 0], my_weights[0], 1, True))
print(np.sum(errors) * 2.)

-18029479492.0
-18029479492.0


Gradient Descent

In [10]:
def ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations=100):
    """Fit ridge-regression weights by a fixed number of gradient-descent passes.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        One row per observation, one column per weight. Column 0 is treated
        as the constant feature and is not regularised.
    output : numpy.ndarray
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights, one per column. The input is copied, not modified.
    step_size : float
        Multiplier applied to each derivative in the update.
    l2_penalty : float
        L2 regularisation strength.
    max_iterations : int, optional
        Number of full passes over the weights (default 100).

    Returns
    -------
    numpy.ndarray
        Float array with the weights after ``max_iterations`` passes.
    """
    # Force a float copy: with integer initial weights (e.g. [0, 0]) numpy
    # would infer an integer dtype and every in-place update below would be
    # truncated back to an integer.
    weights = np.array(initial_weights, dtype=float)

    for _ in range(max_iterations):
        # errors (predictions - output) are computed once per pass, from the
        # weights as they stood at the start of the pass
        errors = predict_outcome(feature_matrix, weights) - output

        for i in range(len(weights)):
            # index 0 is passed as the constant feature, so it gets no L2 term
            derivative = feature_derivative_ridge(
                errors, feature_matrix[:, i], weights[i], l2_penalty, i == 0
            )
            weights[i] = weights[i] - step_size * derivative

    return weights

Qn 1

In [20]:
# Single-feature model: predict price from sqft_living
simple_features = ['sqft_living']
my_output = 'price'

# Feature matrix / target array for the train and test splits
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)
simple_test_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)

In [29]:
# Gradient-descent settings for the single-feature model (no regularisation)
step_size = 1e-12
max_iterations = 1000
initial_weights = [0., 0.]
l2_penalty = 0

In [30]:
# Fit the single-feature model with the current (zero) L2 penalty
weight_low_penalty = ridge_regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    l2_penalty,
    max_iterations,
)

print(weight_low_penalty)

[-1.63113515e-01  2.63024369e+02]


Qn 2

In [31]:
# Switch to a very large L2 penalty for the next fit
l2_penalty = 1e11

In [32]:
# Refit the single-feature model, now with the large L2 penalty
weight_high_penalty = ridge_regression_gradient_descent(
    simple_feature_matrix,
    output,
    initial_weights,
    step_size,
    l2_penalty,
    max_iterations,
)

print(weight_high_penalty)

[  9.76730382 124.57217567]


Qn 3 - Model with low penalty

Qn 4

In [39]:
# Test-set predictions of the single-feature, high-penalty model.
# The weights are typed in from the printed high-penalty result (rounded to
# the displayed precision), not taken from the weight_high_penalty array.
test_predict1 = predict_outcome(
    simple_test_feature_matrix,
    [9.76730382, 124.57217567],
)
test_predict1

array([178147.97851192, 367497.68553032, 213028.18769952, ...,
       313931.64999222, 287771.49310152, 127073.38648722])

In [40]:
# Test RSS of the single-feature, high-penalty model, shown in units of 1e14
# (the literal 10e13 is the same number as 1e14)
rss = np.sum((test_predict1 - test_output) ** 2)
rss / 1e14

6.946421015148961

In [65]:
# Test RSS of the single-feature, no-penalty model, shown in units of 1e14.
# The weights are the rounded values from the printed no-penalty result.
rss = np.sum(
    (predict_outcome(simple_test_feature_matrix, [-1.63113515e-01, 2.63024369e+02]) - test_output) ** 2
)
rss / 1e14

2.757236321531056

In [67]:
# Test RSS of the all-zero weights (the initial weights).
# NOTE: shown in units of 1e15 (10e14 == 1e15), i.e. ten times the scale used
# for the two fitted models, so the displayed number is not directly comparable.
rss = np.sum(
    (predict_outcome(simple_test_feature_matrix, [0., 0.]) - test_output) ** 2
)
rss / 1e15

1.784273286136298

Qn 5

In [54]:
# Two-feature model: predict price from sqft_living and sqft_living15
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'

# Feature matrix / target array for the train and test splits
# (output and test_output are reassigned here)
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)
test_feature_matrix, test_output = get_numpy_data(test_data, model_features, my_output)

In [56]:
# Gradient-descent settings for the two-feature model (no regularisation)
initial_weights = [0., 0., 0.]
step_size = 1e-12
max_iterations = 1000
l2_penalty = 0.0

In [57]:
# Fit the two-feature model with the current (zero) L2 penalty
weight_low_penalty = ridge_regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    l2_penalty,
    max_iterations,
)

print(weight_low_penalty)

[ -0.35743483 243.05416982  22.41481497]


Qn 6

In [58]:
# Switch to a very large L2 penalty for the next fit
l2_penalty = 1e+11

In [59]:
# Refit the two-feature model, now with the large L2 penalty
weight_high_penalty = ridge_regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    l2_penalty,
    max_iterations,
)

print(weight_high_penalty)

[ 6.74296579 91.48927365 78.43658766]


Qn 7

In [60]:
# Test-set predictions of the two-feature, high-penalty model.
# The weights are typed in from the printed high-penalty result (rounded to
# the displayed precision), not taken from the weight_high_penalty array.
test_predict2 = predict_outcome(
    test_feature_matrix,
    [6.74296579, 91.48927365, 78.43658766],
)
test_predict2

array([270453.53032009, 437754.39782569, 237243.08619709, ...,
       428219.91346699, 354885.92051509, 173331.12150199])

In [61]:
# Test RSS of the two-feature, high-penalty model, shown in units of 1e14
# (the literal 10e13 is the same number as 1e14)
rss = np.sum((test_predict2 - test_output) ** 2)
rss / 1e14

5.00404800501054

In [62]:
# Test RSS of the two-feature, no-penalty model, shown in units of 1e14.
# The weights are the rounded values from the printed no-penalty result.
rss = np.sum(
    (predict_outcome(test_feature_matrix, [-0.35743483, 243.05416982, 22.41481497]) - test_output) ** 2
)
rss / 1e14

2.740676159186628

In [64]:
# Test RSS of the all-zero weights (the initial weights).
# NOTE: shown in units of 1e15 (10e14 == 1e15), i.e. ten times the scale used
# for the two fitted models, so the displayed number is not directly comparable.
rss = np.sum(
    (predict_outcome(test_feature_matrix, [0., 0., 0.]) - test_output) ** 2
)
rss / 1e15

1.784273286136298

Qn 8

In [46]:
# Prediction for the first test house using hand-typed weights.
# NOTE(review): [0, 172, 96] does not match either two-feature weight vector
# printed earlier in this notebook - confirm where these values come from.
test_predict3 = predict_outcome(test_feature_matrix, [0, 172, 96])
test_predict3[0]

416840.0

In [47]:
# High-penalty prediction for the first test house.
# NOTE(review): the recorded output of this cell (267540.0) differs from the
# first element displayed where test_predict2 is computed (270453.53...), so
# this cell was probably run against an earlier value - re-run to confirm.
test_predict2[0]

267540.0

In [48]:
# Actual price of the first test house.
# Per the recorded outputs, the high-penalty prediction is the closer of the two.
test_output[0] ### Model with high penalty predicts better

310000.0