In [8]:
import numpy as np
import pandas as pd
from math import log

In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [5]:
sales = pd.read_csv('kc_house_train_data.csv', dtype = dtype_dict)
test = pd.read_csv('kc_house_test_data.csv', dtype = dtype_dict)

In [6]:
sales['bedrooms_squared'] = sales['bedrooms']*sales['bedrooms']

In [7]:
sales['bed_bath_rooms'] = sales['bedrooms']*sales['bathrooms']

In [9]:
sales['log_sqft_living'] = sales['sqft_living'].apply(lambda x : log(x))

In [10]:
sales['lat_plus_long'] = sales['lat']+sales['long']

In [11]:
test['bedrooms_squared'] = test['bedrooms']*test['bedrooms']
test['bed_bath_rooms'] = test['bedrooms']*test['bathrooms']
test['log_sqft_living'] = test['sqft_living'].apply(lambda x : log(x))
test['lat_plus_long'] = test['lat']+test['long']

In [12]:
test['bedrooms_squared'].mean()

12.4466777015843

In [13]:
test['bed_bath_rooms'].mean()

7.5039016315913925

In [14]:
test['log_sqft_living'] .mean()

7.550274679645921

In [15]:
test['lat_plus_long'].mean()

-74.65333355403185

In [16]:
model1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

In [17]:
model2_features = ['sqft_living', 'bedrooms', 'bathrooms','lat','long', 'bed_bath_rooms']

In [19]:
model3_features = model2_features + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
model1 = LinearRegression()
model1.fit(sales[model1_features], sales['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
model2 = LinearRegression()
model2.fit(sales[model2_features], sales['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
model3 = LinearRegression()
model3.fit(sales[model3_features], sales['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
model1.coef_[2]

15706.742082734645

In [28]:
model1.coef_

array([  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
         6.58619264e+05,  -3.09374351e+05])

In [29]:
model2.coef_

array([  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
         6.54844630e+05,  -2.94298969e+05,   2.55796520e+04])

In [30]:
model1.residues_



967879963049545.62

In [31]:
model2.residues_



958419635074068.87

In [32]:
model3.residues_



array([], dtype=float64)

In [33]:
model3.coef_

array([  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
         5.34085611e+05,  -4.06750711e+05,  -8.57050439e+03,
        -6.78858667e+03,  -5.61831484e+05,   1.27334900e+05])

In [35]:
def get_residual_sum_of_squares(model, data, outcome):
    
    predictions = model.predict(data)
    
    residuals = outcome - predictions

    RSS = (residuals * residuals).sum()
    return(RSS) 

In [36]:
get_residual_sum_of_squares(model1, test[model1_features], test['price'])

225500469795490.22

In [37]:
get_residual_sum_of_squares(model2, test[model2_features], test['price'])

223377462976467.12

In [38]:
get_residual_sum_of_squares(model3, test[model3_features], test['price'])

259236319207178.66