In [1]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
sales = pd.read_csv('kc_house_data.csv')
#sales.dtypes

In [3]:
sales['id'] = sales['id'].astype(str)
sales['date'] = sales['date'].astype(str)
#sales['bedrooms'] = sales['bedrooms'].astype(float)
#sales['sqft_living'] = sales['sqft_living'].astype(float)
sales['floors'] = sales['floors'].astype(str)
sales['zipcode'] = sales['zipcode'].astype(str)
sales['sqft_living15'] = sales['sqft_living15'].astype(float)
sales['sqft_lot15'] = sales['sqft_lot15'].astype(float)

In [4]:
sales.isnull().values.any()

False

### Split the data

In [5]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [6]:
train_data,test_data = train_test_split(sales, test_size =.2)

In [7]:
prices = sales['price']
sum_prices = sum(prices)
num_prices = len(prices)
avr_prices1 = sum_prices/num_prices
avr_prices2 = prices.mean()
print("average price via method 1: "+str(avr_prices1))
print("average price via method 2: "+str(avr_prices2))

average price via method 1: 540088.1417665294
average price via method 2: 540088.1417665294


# Simple Linear Regression using a closed form solution

In [8]:
#Calculating the slope and the intercept for the linear regression function
def SLNF(x,y):
    n = len(x)
    sum_y = sum(y)
    sum_x = sum(x)
    sum_product = sum(x*y)
    sum_x_s = sum(x*x)
    #slope
    numerator = sum_product - ((1/n)*(sum_x*sum_y))
    denominator = sum_x_s - ((1/n)*(sum_x*sum_x))
    slope = numerator/denominator
    #intercept
    intercept = y.mean() - (slope*x.mean())
    return (slope,intercept)

In [9]:
(train_data_slope, train_data_intercept) = SLNF(train_data['sqft_living'], train_data['price'])
print('Slope: '+str(train_data_slope))
print('Intercept: '+str(train_data_intercept))

Slope: 279.027782150836
Intercept: -40362.34649145976


In [10]:
#Test the SLNF function
test_input = np.array(range(10))
test_output = np.array(1+(1*test_input))
(test_slope, test_intercept) = SLNF(test_input, test_output)
print('Slope: '+str(test_slope))
print('Intercept: '+str(test_intercept))
    

Slope: 1.0
Intercept: 1.0


## Predicting new values

In [11]:
def get_regression_predictions(x, intercept, slope):
    predicted_values = intercept + slope*x
    return predicted_values

In [12]:
predicted_values = get_regression_predictions(test_data['sqft_living'], train_data_intercept, train_data_slope)

In [13]:
#predicted_values, test_data['price']

In [14]:
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, train_data_intercept, train_data_slope)
print ("The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $699061.28


##  Compute Residual Sum of Squares

In [15]:
def rss(y, x):
    (slope, intercept) = SLNF(x,y)
    predicted_values = get_regression_predictions(x,intercept,slope)
    R = (y-predicted_values)
    RS = R*R
    RSS = sum(RS)
    return RSS

In [16]:
#test rss function
test_rss = rss(test_output, test_input)
test_rss

0.0

In [18]:
RSS_sqft = rss(test_data['price'], test_data['sqft_living'])
print('RSS: '+str(RSS_sqft))

RSS: 303874979545411.44


In [19]:
# Inverse regression function
def inverse_regression(price,x,y):
    (slope, intercept) = SLNF(x,y)
    sqft = (price - intercept)/slope
    return sqft   

In [20]:
print('The stimated square feet for a house costing $800,000 is: '+str(inverse_regression(800000,train_data['sqft_living'],
                                                                                          train_data['price']))+' sqft')

The stimated square feet for a house costing $800,000 is: 3011.7515181236654 sqft


In [21]:
#use number of bedrooms instead of the sqf
RSS_bedrooms = rss(test_data['price'], test_data['bedrooms'])
RSS_bedrooms

555319075123512.75