In [38]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from decimal import Decimal
from sklearn.model_selection import train_test_split
%matplotlib notebook

# LOAD DATA

In [39]:
# Explicit column dtypes for the house-sales CSV. Identifier-like columns
# ('id', 'zipcode', 'date') are declared as text so they are not inferred as numbers.
dtype_dict = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int,
    'sqft_living15': float, 'grade': int, 'yr_renovated': int,
    'price': float, 'bedrooms': float, 'zipcode': str, 'long': float,
    'sqft_lot15': float, 'sqft_living': float, 'floors': str,
    'condition': int, 'lat': float, 'date': str, 'sqft_basement': int,
    'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int,
}
# Apply the mapping when parsing; previously it was defined but never passed
# to read_csv, so pandas inferred every dtype on its own.
data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [40]:
# Preview the first rows of the loaded frame to sanity-check columns and values.
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [43]:
# Names of the engineered columns created in the TRANSFORM DATA cells.
# NOTE(review): this list is not referenced by any later cell shown here.
features = [
    'bedrooms_squared',
    'bed_bath_rooms',
    'log_sqft_living',
    'lat_plus_long',
]
# Predictor columns: every column of `data` except the target, 'price'.
arr = data.columns.tolist()
arr.remove('price')

In [44]:
# Hold out 20% of the rows for testing; random_state=0 pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[arr], data['price'], random_state=0, train_size=.8, test_size=.2
)
# The feature frames get new columns assigned to them in later cells. Take
# explicit copies so those assignments write to independent DataFrames rather
# than to selections of `data` (avoids pandas' SettingWithCopyWarning).
X_train, X_test = X_train.copy(), X_test.copy()

# TRANSFORM DATA

In [45]:
# Add the squared bedroom count to both the training and the test features.
for frame in (X_train, X_test):
    frame['bedrooms_squared'] = frame['bedrooms'].mul(frame['bedrooms'])

In [46]:
# Interaction term: bedrooms multiplied by bathrooms, for both splits.
for frame in (X_train, X_test):
    frame['bed_bath_rooms'] = frame['bedrooms'].mul(frame['bathrooms'])

In [47]:
# Log-transform the living area for both splits.
# Note: this is the base-2 logarithm (np.log2), not the natural log.
for frame in (X_train, X_test):
    frame['log_sqft_living'] = np.log2(frame['sqft_living'])

In [48]:
# Sum of latitude and longitude, added to both splits.
for frame in (X_train, X_test):
    frame['lat_plus_long'] = frame['lat'].add(frame['long'])

# TEST-SET FEATURE MEANS (the train/test split is performed above, before TRANSFORM DATA)

In [9]:
# Disabled duplicate: the train/test split is performed earlier, before the TRANSFORM DATA cells.
#X_train, X_test, y_train, y_test = train_test_split(data[arr], data['price'], random_state=0, train_size=.8, test_size=.2)

In [49]:
# Average of the squared-bedrooms feature on the held-out test rows.
X_test['bedrooms_squared'].mean()

12.210501966227158

In [50]:
# Average of the bedrooms-by-bathrooms interaction on the held-out test rows.
X_test['bed_bath_rooms'].mean()

7.447721489706223

In [51]:
# Average of the log-transformed living area on the held-out test rows.
X_test['log_sqft_living'].mean()

10.892693037711629

In [13]:
# Average of the latitude-plus-longitude feature on the held-out test rows.
X_test['lat_plus_long'].mean()

-74.65426069858893

# CREATING MODEL

In [15]:
# Ordinary least-squares regressor with default settings; fitted in a later cell
# on the five base columns (sqft_living, bedrooms, bathrooms, lat, long).
model1 = linear_model.LinearRegression()

In [16]:
# Ordinary least-squares regressor with default settings; fitted in a later cell
# on the base columns plus the 'bed_bath_rooms' interaction.
model2 = linear_model.LinearRegression()

In [17]:
# Ordinary least-squares regressor with default settings; fitted in a later cell
# on the base columns plus all four engineered features.
model3 = linear_model.LinearRegression()

In [18]:
# Fit the baseline model on the five untransformed predictors.
model1_columns = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model1.fit(X_train[model1_columns], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Fit the second model: baseline predictors plus the bedrooms-by-bathrooms interaction.
model2_columns = [
    'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
    'bed_bath_rooms',
]
model2.fit(X_train[model2_columns], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
# Fit the third model: baseline predictors plus all four engineered features.
model3_columns = [
    'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
    'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
]
model3.fit(X_train[model3_columns], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Fitted coefficients of model1, in the column order passed to fit:
# sqft_living, bedrooms, bathrooms, lat, long.
model1.coef_

array([ 3.12942010e+02, -5.30962691e+04,  1.47770428e+04,  6.53983343e+05,
       -3.25707336e+05])

In [22]:
# Fitted coefficients of model2, in the column order passed to fit:
# sqft_living, bedrooms, bathrooms, lat, long, bed_bath_rooms.
model2.coef_

array([ 3.06819573e+02, -1.04604718e+05, -7.01815289e+04,  6.50590952e+05,
       -3.09965751e+05,  2.49441497e+04])

# RSS TRAINING

In [23]:
# Training RSS for model1: mean squared error times the number of training rows,
# shown in scientific notation with two decimals.
model1_train_pred = model1.predict(X_train[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']])
model1_train_rss = mean_squared_error(y_train, model1_train_pred) * len(y_train)
'%.2E' % Decimal(model1_train_rss)

'9.80E+14'

In [24]:
# Training RSS for model2: mean squared error times the number of training rows,
# shown in scientific notation with two decimals.
model2_train_pred = model2.predict(
    X_train[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']]
)
model2_train_rss = mean_squared_error(y_train, model2_train_pred) * len(y_train)
'%.2E' % Decimal(model2_train_rss)

'9.71E+14'

In [25]:
# Training RSS for model3: mean squared error times the number of training rows,
# shown in scientific notation with two decimals.
model3_train_pred = model3.predict(
    X_train[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
             'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']]
)
model3_train_rss = mean_squared_error(y_train, model3_train_pred) * len(y_train)
'%.2E' % Decimal(model3_train_rss)

'9.14E+14'

# RSS TESTING

In [26]:
# Test RSS for model1: mean squared error times the number of test rows,
# shown in scientific notation with two decimals.
model1_test_pred = model1.predict(X_test[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']])
model1_test_rss = mean_squared_error(y_test, model1_test_pred) * len(y_test)
'%.2E' % Decimal(model1_test_rss)

'2.13E+14'

In [27]:
# Test RSS for model2: mean squared error times the number of test rows,
# shown in scientific notation with two decimals.
model2_test_pred = model2.predict(
    X_test[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']]
)
model2_test_rss = mean_squared_error(y_test, model2_test_pred) * len(y_test)
'%.2E' % Decimal(model2_test_rss)

'2.11E+14'

In [28]:
# Test RSS for model3: mean squared error times the number of test rows,
# shown in scientific notation with two decimals.
model3_test_pred = model3.predict(
    X_test[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
            'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']]
)
model3_test_rss = mean_squared_error(y_test, model3_test_pred) * len(y_test)
'%.2E' % Decimal(model3_test_rss)

'2.04E+14'

In [35]:
# Load the same dataset through GraphLab Create as an SFrame.
# NOTE(review): `graphlab` is not imported in the import cell of this notebook,
# so this raises NameError on a fresh kernel unless it is imported elsewhere.
sales = graphlab.SFrame('kc_house_data.gl/')

In [36]:
# Split the SFrame with fraction .8 and a fixed seed of 0.
# Presumably about 80% of rows land in train_data; confirm against GraphLab's docs.
# NOTE(review): this split is independent of the train_test_split used above,
# so the two pipelines do not necessarily hold the same rows.
train_data,test_data = sales.random_split(.8,seed=0)

In [37]:
# Mean of squared bedrooms on the GraphLab *training* split.
# NOTE(review): the pandas cell above reports this statistic on X_test, so the
# two numbers are not a like-for-like comparison; confirm which one is intended.
t = train_data['bedrooms'] * train_data['bedrooms']
t.mean()

12.174240681086044