In [16]:
import pandas as pd
import numpy as np
from math import sqrt

In [5]:
# Explicit column dtypes shared by both CSV reads below.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}
# The train and test files are parsed with the same schema.
Train = pd.read_csv(r'kc_house_train_data.csv.zip', dtype=dtype_dict)
Test = pd.read_csv(r'kc_house_test_data.csv.zip', dtype=dtype_dict)

In [9]:
def get_numpy_data(data, features, output):
    """Build the design matrix and target vector for a linear regression.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is not modified by this function.
    features : list of str
        Names of the columns to use as regressors.
    output : str
        Name of the target column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``features_matrix``, whose first column is the constant 1 (the
        intercept term) followed by the requested feature columns in the
        given order, and ``output_array`` holding the values of
        ``data[output]``.
    """
    # Work on a copy of the selected columns so the intercept column is not
    # written back into the caller's DataFrame as a hidden side effect.
    feat_data = data[list(features)].copy()
    feat_data.insert(0, 'constant', 1)
    features_matrix = feat_data.values
    output_array = data[output].values
    return features_matrix, output_array

In [10]:
def predict_outcome(feature_matrix, weights):
    """Return the dot product of ``feature_matrix`` and ``weights``.

    For a 2-D feature matrix and a 1-D weight vector this is one linear-model
    prediction per row.
    """
    return np.dot(feature_matrix, weights)

In [11]:
# Testing the first function.
feat_mat, out = get_numpy_data(data=Train, features=['sqft_living'], output='price')

In [12]:
# Two unit weights: one for the constant column, one for the single feature.
weights = np.ones(shape=2)

In [13]:
# With unit weights each prediction is simply 1 + sqft_living.
predict_outcome(feature_matrix=feat_mat, weights=weights)

array([1181., 2571.,  771., ..., 1531., 1601., 1021.])

In [14]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    This is the partial derivative of the residual sum of squares with
    respect to the weight attached to ``feature``.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [15]:
# Quick check of feature_derivative(): 2 * (10*3 + 50*5 + 70*8) = 1680.
errors = [10, 50, 70]
feature = [3, 5, 8]
feature_derivative(errors=errors, feature=feature)

1680

In [48]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D design matrix; column ``i`` is paired with ``weights[i]``.
    output : array-like
        Observed target values, one per row of ``feature_matrix``.
    initial_weights : array-like
        Starting weights. They are copied, so the caller's object is never
        modified.
    step_size : float
        Multiplier applied to each partial derivative when updating a weight.
    tolerance : float
        Iteration stops once the gradient magnitude falls below this value.
    max_iterations : int, optional
        Upper bound on the number of update passes. ``None`` (the default)
        keeps iterating until the tolerance criterion is met.

    Returns
    -------
    numpy.ndarray
        The float weight vector after the final update.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes NaN or infinite, which means the
        iteration has diverged and could never satisfy ``tolerance``.
    """
    # Force a float copy: with integer initial weights the in-place update
    # below would otherwise be truncated back to integers on every step.
    weights = np.array(initial_weights, dtype=float)
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        predictions = predict_outcome(feature_matrix=feature_matrix, weights=weights)
        errors = predictions - output
        gradient_sum_squares = 0
        # `errors` is fixed for the whole pass, so every weight is updated
        # from the same gradient evaluation.
        for i in range(len(weights)):
            derv = feature_derivative(errors=errors, feature=feature_matrix[:, i])
            weights[i] -= step_size * derv
            gradient_sum_squares += derv ** 2
        grad_mag = sqrt(gradient_sum_squares)
        if not np.isfinite(grad_mag):
            # A NaN/inf gradient never compares below `tolerance`; fail loudly
            # instead of spinning forever.
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size"
            )
        if grad_mag < tolerance:
            break
        iteration += 1
    return weights

In [49]:
# Testing the regression_gradient_descent() function on a one-feature model.
features = ['sqft_living']
output = 'price'
feat_mat, out = get_numpy_data(data=Train, features=features, output=output)

# Starting point ordered as [constant, sqft_living].
ini_weights = np.array([-47000., 1.])
step_size = 7e-12
# Gradient-magnitude threshold below which the descent stops.
tolerance = 2.5e7

In [50]:
# Fit the single-feature model from the starting weights defined above.
sim_weights = regression_gradient_descent(
    feature_matrix=feat_mat,
    output=out,
    initial_weights=ini_weights,
    step_size=step_size,
    tolerance=tolerance,
)

In [51]:
# Display the fitted weights, ordered as [constant, sqft_living].
sim_weights

array([-46999.88716555,    281.91211918])

In [54]:
# For testing the regression_gradient_descent() model on the test split.
test_feat_mat, out = get_numpy_data(data=Test, features=features, output=output)
# Predicted price for the 1st house is 356134.4 $.
test_pred = predict_outcome(test_feat_mat, sim_weights)

In [56]:
def RSS(predictions, output):
    """Return the residual sum of squares between predictions and output."""
    residuals = predictions - output
    squared_residuals = residuals * residuals
    return squared_residuals.sum()

In [57]:
# Test-set RSS of the single-feature model.
RSS(predictions=test_pred, output=Test['price'])

275400044902128.3

In [63]:
# For multiple features.
features = ['sqft_living', 'sqft_living15']
# Starting point ordered as [constant, sqft_living, sqft_living15].
weights = np.array([-100000., 1., 1.])
feat_mat, out = get_numpy_data(Train, features, output)
new_weights = regression_gradient_descent(
    feature_matrix=feat_mat,
    output=out,
    initial_weights=weights,
    step_size=4e-12,
    tolerance=1e9,
)

In [69]:
test_feat_mat, out = get_numpy_data(data=Test, features=features, output=output)
# Predicted price for the 1st house is 366651.41162949 $.
test_pred = predict_outcome(feature_matrix=test_feat_mat, weights=new_weights)

In [73]:
# Test-set RSS of the two-feature model.
RSS(test_pred, Test['price'])

270263443629803.56