In [None]:
import pandas as pd
import math 
import sklearn
from sklearn import linear_model
import numpy as np

In [None]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
salesDf = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [None]:
def get_numpy_data(data_dframe, features, output):
    data_dframe['constant'] = 1
    features = ['constant'] + features
    features_dframe = data_dframe[features]
    feature_matrix = features_dframe.to_numpy()
    output_darray = data_dframe[output] 
    output_array = output_darray.to_numpy()
    return(feature_matrix, output_array)

In [None]:
(example_features, example_output) = get_numpy_data(salesDf, ['sqft_living'], 'price')
print (example_features[0,:])
print (example_output[0])

 # Example code

In [14]:
my_weights = np.array([1., 1.])
my_features = example_features[0,]
predicted_value = np.dot(my_features, my_weights)
print (predicted_value)

1181.0


# Continue

In [28]:
def predict_output(feature_matrix, weights):
    predicted_value = np.dot(feature_matrix, weights)
    return (predicted_value)

In [29]:
test_predictions = predict_output(example_features, my_weights)
print (test_predictions[0]) # should be 1181.0
print (test_predictions[1]) # should be 2571.0

1181.0
2571.0


In [36]:
def feature_derivative(errors, feature):
    derivative = 2 * errors * feature
    return(sum(derivative))

In [50]:
(example_features, example_output) = get_numpy_data(salesDf, ['sqft_living'], 'price') 
my_weights = np.array([0., 0.]) # this makes all the predictions 0
test_predictions = predict_output(example_features, my_weights) 
errors = test_predictions - example_output 
feature = example_features[:,0] 
derivative = feature_derivative(errors, feature)
print (derivative)
print (-np.sum(example_output)*2)
example_features[:,0]


-23345850016.0
-23345850016.0


array([1., 1., 1., ..., 1., 1., 1.])

# gradient Descent

In [39]:
from math import sqrt

In [61]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False 
    weights = np.array(initial_weights)
    while not converged:
        predictions = predict_output(feature_matrix,weights)
        errors = predictions - output
        gradient_sum_squares = 0
        for i in range(len(weights)):
            derivative = feature_derivative(errors,feature_matrix[:, i])
            gradient_sum_squares = gradient_sum_squares + (derivative**2)
            weights[i] = weights[i] - (step_size * derivative)
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)    

In [55]:
train_data = salesDf.sample(frac=0.80, random_state=0)
test_data = salesDf.drop(train_set.index)

# simple regression

In [62]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

KeyboardInterrupt: 