In [24]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
sales = pd.read_csv('kc_house_data.csv')
#sales.dtypes

In [3]:
sales['id'] = sales['id'].astype(str)
sales['date'] = sales['date'].astype(str)
sales['bedrooms'] = sales['bedrooms'].astype(float)
#sales['sqft_living'] = sales['sqft_living'].astype(float)
sales['floors'] = sales['floors'].astype(str)
sales['zipcode'] = sales['zipcode'].astype(str)
sales['sqft_living15'] = sales['sqft_living15'].astype(float)
sales['sqft_lot15'] = sales['sqft_lot15'].astype(float)

In [4]:
sales.isnull().values.any()

False

# Exploring the data

### Split the data

In [5]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [6]:
train_data,test_data = train_test_split(sales, test_size =.2)

In [7]:
prices = sales['price']
sum_prices = sum(prices)
num_prices = len(prices)
avr_prices1 = sum_prices/num_prices
avr_prices2 = prices.mean()
print("average price via method 1: "+str(avr_prices1))
print("average price via method 2: "+str(avr_prices2))

average price via method 1: 540088.1417665294
average price via method 2: 540088.1417665294


# Simple Linear Regression using a closed form solution

In [56]:
#Calculating the slope and the intercept for the linear regression function
def SLNF(x,y):
    n = len(x)
    numerator_1 = (n*sum(x*y))-(sum(x)*sum(y))
    denominator_1 = (n*sum(x^2))-(sum(x))^2
    slope = numerator_1/denominator_1
    #intercept = y.mean()-(slope*x.mean())
    numerator_2 = (sum(y)*sum(x^2))- (sum(x)*sum(x*y))
    intercept = numerator_2/denominator_1
    return (slope,intercept)

In [57]:
(train_data_slope, train_data_intercept) = SLNF(train_data['sqft_living'], train_data['price'])
print('Slope: '+str(train_data_slope))
print('Intercept: '+str(train_data_intercept))

Slope: 114354.4789671526
Intercept: -1361123122.2051513


In [63]:
def slnf(x,y):
    n = len(x)
    s_x = sum(x)
    s_y = sum(y)
    product = x*y
    s_product = sum(product)
    sq_x = x^2
    s_sq_x = sum(sq_x)
    sq_y = y*y
    s_sq_y = sum(sq_y)
    slope = (n*s_product - (s_x*s_y))/(n*s_sq_x - s_x^2)
    intercept = ((s_y*s_sq_x)- (s_x*s_product)) / (n*s_sq_x - s_x^2)
    #intercept = y.mean() - (slope*(x.mean()))
    return (slope, intercept)

In [64]:
(data_slope, data_intercept) = slnf(train_data['sqft_living'], train_data['price'])
print('Slope: '+str(data_slope))
print('Intercept: '+str(data_intercept))

Slope: 114354.4789671526
Intercept: -1361123122.2051513


## Predicting new values

In [58]:
def get_regression_predictions(x, intercept, slope):
    predicted_values = intercept + (slope*x)
    return predicted_values

In [59]:
predicted_values = get_regression_predictions(test_data['sqft_living'], train_data_intercept, train_data_slope)

In [60]:
predicted_values, test_data['price']

(10967   -1.084385e+09
 18286   -1.130127e+09
 6942    -1.072950e+09
 14350   -1.091247e+09
 11293   -1.172438e+09
 6120    -1.278788e+09
 10055   -1.147280e+09
 14636   -1.161003e+09
 14037   -1.019203e+09
 1237    -1.172438e+09
 10780   -1.096964e+09
 11286   -1.198740e+09
 12387   -8.884961e+08
 15991   -1.189591e+09
 21500   -1.083242e+09
 4829    -1.269640e+09
 17798   -1.225041e+09
 7186    -1.239907e+09
 17868   -1.245625e+09
 182     -1.082098e+09
 4731    -1.126696e+09
 399     -1.186161e+09
 11433   -1.191878e+09
 6326    -1.106113e+09
 7069    -1.074093e+09
 14777   -1.092390e+09
 2924    -1.218180e+09
 1028    -1.190735e+09
 1591    -1.198740e+09
 4865    -1.204457e+09
              ...     
 11498   -9.254326e+08
 2239    -1.195309e+09
 17991   -1.215893e+09
 4587    -1.107256e+09
 13377   -1.130127e+09
 19857   -1.006624e+09
 11312   -1.245625e+09
 13167   -1.138132e+09
 19137   -1.156429e+09
 637     -1.205601e+09
 5005    -9.929017e+08
 2883    -1.227328e+09
 8837    -1

In [22]:
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, train_data_intercept, train_data_slope)
print ("The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $65669091.28


##  Compute Residual Sum of Squares

In [43]:
def rss(y, x, intercept, slope):
    predicted_values = intercept + (slope*x)
    R = (y-predicted_values)
    RS = R*R
    RSS = sum(RS)
    return RSS   

In [44]:
RSS = rss(test_data['price'], test_data['sqft_living'], train_data_intercept, train_data_slope)
print('RSS: '+str(RSS))

RSS: 1394373032.4196465
