In [1]:
import numpy as np
import pandas as pd 
%matplotlib inline 
import matplotlib.pyplot as plt 
import seaborn as sns
# Apply seaborn's default plot styling to subsequent matplotlib figures.
sns.set()

In [2]:
# Column -> dtype mapping handed to pd.read_csv, grouped by target type.
# Identifier-like columns (id, zipcode, date) and `floors` are read as strings.
dtype_dict = {
    **dict.fromkeys(
        ['bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long',
         'sqft_lot15', 'sqft_living', 'lat'],
        float,
    ),
    **dict.fromkeys(
        ['waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition',
         'sqft_basement', 'yr_built', 'sqft_lot', 'view'],
        int,
    ),
    **dict.fromkeys(['zipcode', 'floors', 'date', 'id'], str),
}

In [7]:
# Load the training and test splits, parsing every column with the dtypes
# declared in `dtype_dict`.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

# Learning a multiple regression model

In [34]:
def linear_reg(train_x,train_y,test_x,test_y):
    """Fit a linear regression on the training data and score it on the test data.

    Parameters
    ----------
    train_x, train_y
        Features and target used to fit the model.
    test_x, test_y
        Features and target used to evaluate the fitted model.

    Returns
    -------
    tuple
        ``(model, coef, intercept, m_s_e, r2)``: the fitted estimator, its
        ``coef_`` and ``intercept_`` attributes, and the mean squared error
        and R^2 score of its predictions on the test data.
    """
    model = linear_model.LinearRegression().fit(train_x, train_y)
    predictions = model.predict(test_x)
    m_s_e = mean_squared_error(test_y, predictions)
    r2 = r2_score(test_y, predictions)
    # Return the computed score `r2`; the previous version returned the
    # `r2_score` function object itself by mistake.
    return (model, model.coef_, model.intercept_, m_s_e, r2)

In [4]:
# Predictor columns for the baseline model.
features = [
    'sqft_living',
    'bedrooms',
    'bathrooms',
]

In [6]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# Baseline model: regress price on the columns listed in `features`.
reg = linear_model.LinearRegression()
reg.fit(train_data[features], train_data['price'])
model = reg  # fit() returns the estimator itself, so both names refer to it

In [11]:
# Fitted coefficients of the baseline model, one per column of `features`.
model.coef_

array([   315.40669062, -65081.88711588,   6942.16598637])

In [12]:
# Fitted intercept term of the baseline model.
model.intercept_

87912.86581496493

In [13]:
# Predict prices for the test split with the baseline model.
predictions = model.predict(test_data[features])

In [14]:
# Mean squared error of the baseline model on the test split.
mean_squared_error(y_true=test_data['price'], y_pred=predictions)

64734438539.402626

In [15]:
# R^2 of the baseline model on the test split.
r2_score(y_true=test_data['price'], y_pred=predictions)

0.490304353210425

In [17]:
# Example price prediction for a hypothetical house with
# sqft_living=2500, bedrooms=4, bathrooms=2.
# The row is wrapped in a DataFrame carrying the training column names so the
# value-to-feature mapping is explicit; a bare nested list makes scikit-learn
# warn that "X does not have valid feature names" because the model was
# fitted on a DataFrame.
model.predict(pd.DataFrame([[2500, 4, 2]], columns=features))

array([629986.3758629])

### Create new features

In [19]:
from math import log

In [21]:
def add_derived_features(frame):
    """Add the engineered columns to `frame` in place.

    Columns added (in this order):
      * ``bedrooms_squared``  - ``bedrooms`` squared
      * ``bed_bath_rooms``    - ``bedrooms`` times ``bathrooms``
      * ``log_sqft_living``   - natural log of ``sqft_living``
      * ``lat_plus_long``     - ``lat`` plus ``long``

    All columns are computed with vectorized Series operations instead of a
    per-row ``apply(lambda ...)``. Returns ``None`` so that calling it as the
    last statement of a cell does not display the whole frame.
    """
    frame['bedrooms_squared'] = frame['bedrooms'] ** 2
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # NOTE: np.log yields -inf (with a RuntimeWarning) for a zero input,
    # whereas math.log would raise ValueError.
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    frame['lat_plus_long'] = frame['lat'] + frame['long']


# Apply the identical transformation to both splits so they cannot drift apart.
add_derived_features(train_data)
add_derived_features(test_data)

In [24]:
# Report the test-split mean of each engineered column.
for prefix, column in (
    ('Test data bedrooms squared mean: ', 'bedrooms_squared'),
    ('Test data bed_bath_rooms mean: ', 'bed_bath_rooms'),
    ('Test data log_sqft_living mean: ', 'log_sqft_living'),
    ('Test data lat_plus_long mean: ', 'lat_plus_long'),
):
    print(prefix + str(test_data[column].mean()))

Test data bedrooms squared mean: 12.4466777015843
Test data bed_bath_rooms mean: 7.5039016315913925
Test data log_sqft_living mean: 7.550274679645921
Test data lat_plus_long mean: -74.65333355403185


## Learning Multiple Models

### Model 1: square feet, # bedrooms, # bathrooms, latitude & longitude

In [27]:
# Predictor columns for model 1: the baseline columns plus location.
features_1 = [
    'sqft_living',
    'bedrooms',
    'bathrooms',
    'lat',
    'long',
]

In [51]:
# Model 1: fit on the training split, then report the test MSE and the
# fitted parameters.
reg = linear_model.LinearRegression()
reg.fit(train_data[features_1], train_data['price'])
model_1 = reg  # fit() returns the estimator itself, so both names refer to it
predictions_1 = model_1.predict(test_data[features_1])
print(mean_squared_error(y_true=test_data['price'], y_pred=predictions_1))
print('Coefitionts: ', model_1.coef_, sep='')
print('Intercept: ', model_1.intercept_, sep='')

53322409504.726974
Coefitionts: [ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05]
Intercept: -69075726.7925697


In [56]:
# Training-split MSE for model 1 (to compare with its test MSE).
train_predictions_1 = model_1.predict(train_data[features_1])
mean_squared_error(y_true=train_data['price'], y_pred=train_predictions_1)

55676481997.78795

### Model 2: square feet, # bedrooms, # bathrooms, latitude & longitude, bedrooms*bathrooms

In [36]:
# Model 2 predictors: everything in model 1 plus the bedrooms*bathrooms column.
features_2 = [*features_1, 'bed_bath_rooms']

In [53]:
# Model 2: fit on the training split, then report the test MSE and the
# fitted parameters.
reg = linear_model.LinearRegression()
reg.fit(train_data[features_2], train_data['price'])
model_2 = reg  # fit() returns the estimator itself, so both names refer to it
predictions_2 = model_2.predict(test_data[features_2])
print(mean_squared_error(y_true=test_data['price'], y_pred=predictions_2))
print('Coefitionts: ', model_2.coef_, sep='')
print('Intercept: ', model_2.intercept_, sep='')

52820397960.85768
Coefitionts: [ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
 -2.94298969e+05  2.55796520e+04]
Intercept: -66867968.8710788


In [57]:
# Training-split MSE for model 2 (to compare with its test MSE).
train_predictions_2 = model_2.predict(train_data[features_2])
mean_squared_error(y_true=train_data['price'], y_pred=train_predictions_2)

55132284576.28094

### Model 3: square feet, # bedrooms, # bathrooms, latitude & longitude, bedrooms*bathrooms, bedrooms squared, log square feet, and the (nonsensical) latitude + longitude

In [38]:
# Model 3 predictors: everything in model 2 plus three more engineered columns.
features_3 = [
    *features_2,
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long',
]

In [54]:
# Model 3: fit on the training split, then report the test MSE and the
# fitted parameters.
reg = linear_model.LinearRegression()
reg.fit(train_data[features_3], train_data['price'])
model_3 = reg  # fit() returns the estimator itself, so both names refer to it
predictions_3 = model_3.predict(test_data[features_3])
print(mean_squared_error(y_true=test_data['price'], y_pred=predictions_3))
print('Coefitionts: ', model_3.coef_, sep='')
print('Intercept: ', model_3.intercept_, sep='')

61299673494.249176
Coefitionts: [ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
 -4.06750711e+05 -8.57050439e+03 -6.78858667e+03 -5.61831484e+05
  1.27334900e+05]
Intercept: -62036084.98609828


In [59]:
# Training-split MSE for model 3 (to compare with its test MSE).
train_predictions_3 = model_3.predict(train_data[features_3])
mean_squared_error(y_true=train_data['price'], y_pred=train_predictions_3)

51969423323.19817

In [62]:
# Is model 1's test MSE higher than model 2's?
# Compare the computed metrics rather than numbers pasted from earlier cell
# outputs, so the answer stays correct when the notebook is re-run on
# different data. bool() gives a plain True/False display.
bool(
    mean_squared_error(test_data['price'], predictions_1)
    > mean_squared_error(test_data['price'], predictions_2)
)

True