In [1]:
import pandas as pd

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 
              'sqft_living15':float, 'grade':int, 'yr_renovated':int, 
              'price':float, 'bedrooms':float, 'zipcode':str, 'long':float,
              'sqft_lot15':float, 'sqft_living':float, 'floors':float,
              'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [3]:
from math import log, sqrt

In [4]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [5]:
from sklearn import linear_model

all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True)
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
for feature, coe in filter(lambda key_value: key_value[1] != 0,zip(all_features, model_all.coef_)):
    print("feature: %s, coe: %f" % (feature, coe))

feature: sqft_living, coe: 134.439314
feature: view, coe: 24750.004586
feature: grade, coe: 61749.103091


In [32]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [40]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [41]:
import numpy as np

In [42]:
validation_rss = {}

for l1_penalty in np.logspace(1, 7, num=13):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    
    predictions = model.predict(validation[all_features])
    residuals = validation["price"] - predictions
    rss = sum(residuals ** 2)
    validation_rss[l1_penalty] = rss

print(min(validation_rss.items(), key=lambda x: x[1]))

(10.0, 398213327300134.44)


In [43]:
model = linear_model.Lasso(alpha=10, normalize=True)
model.fit(training[all_features], training["price"])
predictions = model.predict(testing[all_features])
residuals = testing["price"] - predictions
rss = sum(residuals ** 2)
print(rss)

9.84674025527e+13


In [44]:
print(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))

15


In [45]:
max_nonzeros = 7

In [47]:
l1_penalty_values = np.logspace(8, 10, num=20)
print(l1_penalty_values)

[  1.00000000e+08   1.27427499e+08   1.62377674e+08   2.06913808e+08
   2.63665090e+08   3.35981829e+08   4.28133240e+08   5.45559478e+08
   6.95192796e+08   8.85866790e+08   1.12883789e+09   1.43844989e+09
   1.83298071e+09   2.33572147e+09   2.97635144e+09   3.79269019e+09
   4.83293024e+09   6.15848211e+09   7.84759970e+09   1.00000000e+10]


In [53]:
coef_dict = {}
for l1_penalty in l1_penalty_values:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training["price"])
    coef_dict[l1_penalty] = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

In [54]:
import pprint 
pprint.pprint(coef_dict)

{100000000.0: 1,
 127427498.57031322: 1,
 162377673.91887242: 1,
 206913808.11147901: 1,
 263665089.87303555: 1,
 335981828.62837881: 1,
 428133239.8719396: 1,
 545559478.11685145: 1,
 695192796.17755914: 1,
 885866790.41008317: 1,
 1128837891.6846883: 1,
 1438449888.2876658: 1,
 1832980710.8324375: 1,
 2335721469.0901213: 1,
 2976351441.6313133: 1,
 3792690190.7322536: 1,
 4832930238.5717525: 1,
 6158482110.6602545: 1,
 7847599703.5146227: 1,
 10000000000.0: 1}
