In [1]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
sales = pd.read_csv('kc_house_data.csv')
#sales.dtypes

In [3]:
sales['id'] = sales['id'].astype(str)
sales['date'] = sales['date'].astype(str)
sales['bedrooms'] = sales['bedrooms'].astype(float)
#sales['sqft_living'] = sales['sqft_living'].astype(float)
sales['floors'] = sales['floors'].astype(str)
sales['zipcode'] = sales['zipcode'].astype(str)
sales['sqft_living15'] = sales['sqft_living15'].astype(float)
sales['sqft_lot15'] = sales['sqft_lot15'].astype(float)

In [4]:
sales.isnull().values.any()

False

# Exploring the data

### Split the data

In [5]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [6]:
train_data,test_data = train_test_split(sales, test_size =.2)

In [7]:
prices = sales['price']
sum_prices = sum(prices)
num_prices = len(prices)
avr_prices1 = sum_prices/num_prices
avr_prices2 = prices.mean()
print("average price via method 1: "+str(avr_prices1))
print("average price via method 2: "+str(avr_prices2))

average price via method 1: 540088.1417665294
average price via method 2: 540088.1417665294


# Simple Linear Regression using a closed form solution

In [21]:
#Calculating the slope and the intercept for the linear regression function
#def SLNF(x,y):
   # n = len(x)
   # numerator_1 = (n*sum(x*y))-(sum(x)*sum(y))
   # denominator_1 = (n*sum(x^2))-(sum(x))^2
    #slope = numerator_1/denominator_1
    #intercept = y.mean()-(slope*x.mean())
   # numerator_2 = (sum(y)*sum(x^2))- (sum(x)*sum(x*y))
   # intercept = numerator_2/denominator_1
   # return (slope,intercept)

In [26]:
#Calculating the slope and the intercept for the linear regression function
def SLNF(x,y):
    n = len(x)
    sum_y = sum(y)
    sum_x = sum(x)
    sum_product = sum(x*y)
    sum_x_s = sum(x*x)
    #slope
    numerator = sum_product - ((1/n)*(sum_x*sum_y))
    denominator = sum_x_s - ((1/n)*(sum_x*sum_x))
    slope = numerator/denominator
    #intercept
    intercept = y.mean() - (slope*x.mean())
    return (slope,intercept)

In [27]:
(train_data_slope, train_data_intercept) = SLNF(train_data['sqft_living'], train_data['price'])
print('Slope: '+str(train_data_slope))
print('Intercept: '+str(train_data_intercept))

Slope: 278.00974590283397
Intercept: -39495.83906857821


In [24]:
#def slnf(x,y):
    #n = len(x)
   # s_x = sum(x)
    #s_y = sum(y)
   # product = x*y
    #s_product = sum(product)
    #sq_x = x^2
    #s_sq_x = sum(sq_x)
    #sq_y = y*y
    #s_sq_y = sum(sq_y)
   # slope = (n*s_product - (s_x*s_y))/(n*s_sq_x - s_x^2)
    #intercept = ((s_y*s_sq_x)- (s_x*s_product)) / (n*s_sq_x - s_x^2)
    #intercept = y.mean() - (slope*(x.mean()))
    #return (slope, intercept)

In [25]:
#(data_slope, data_intercept) = slnf(train_data['sqft_living'], train_data['price'])
#print('Slope: '+str(data_slope))
#print('Intercept: '+str(data_intercept))

## Predicting new values

In [28]:
def get_regression_predictions(x, intercept, slope):
    predicted_values = intercept + (slope*x)
    return predicted_values

In [29]:
predicted_values = get_regression_predictions(test_data['sqft_living'], train_data_intercept, train_data_slope)

In [30]:
predicted_values, test_data['price']

(16930    480382.385770
 16051    274655.173802
 10651    335817.317900
 19694    605486.771426
 15379    485942.580688
 21481    447021.216261
 4722     452581.411179
 9982     772292.618968
 13550    919637.784296
 3577     524863.945114
 5129     905737.297001
 9416     396979.461999
 12751    449801.313720
 10875    483162.483229
 7394     869596.030034
 14536    352497.902654
 15942    502623.165442
 14135    366398.389950
 13111    711130.474869
 10394    647188.233311
 8264     282995.466179
 12347    452581.411179
 2481     555445.017163
 294      508183.360360
 7411     533204.237491
 16570    294115.856015
 4571     366398.389950
 7826     380298.877245
 460      919637.784296
 15055    544324.627327
              ...      
 8076     583245.991754
 1062     750051.839295
 17570    997480.513149
 20988    841795.055443
 15212    260754.686506
 2472     380298.877245
 17867    227393.516998
 15525    472042.093393
 18898    466481.898475
 16519    410879.949294
 5646     399759

In [31]:
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, train_data_intercept, train_data_slope)
print ("The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $697229.99


##  Compute Residual Sum of Squares

In [32]:
def rss(y, x, intercept, slope):
    predicted_values = intercept + (slope*x)
    R = (y-predicted_values)
    RS = R*R
    RSS = sum(RS)
    return RSS   

In [33]:
RSS = rss(test_data['price'], test_data['sqft_living'], train_data_intercept, train_data_slope)
print('RSS: '+str(RSS))

RSS: 307250339646915.2


2359124448.0

2359124448.0