Importing Libraries

In [35]:
import pandas as pd
import numpy as np
from math import log, sqrt
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

Import Data

In [2]:
# Column -> dtype mapping passed to pd.read_csv so that every house-data CSV is
# parsed with the same types; id, zipcode and date are kept as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [4]:
# Load the full house-sales table using the column types in dtype_dict.
# The path is written without trailing whitespace so it matches the file name
# exactly on every platform.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [6]:
# Derived features: square roots of the two area columns and squares of the
# bedroom and floor counts. np.sqrt works on the whole column at once instead
# of calling math.sqrt once per row.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']
sales['floors_square'] = sales['floors'] * sales['floors']

sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqft_living_sqrt,sqft_lot_sqrt,bedrooms_square,floors_square
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,34.351128,75.166482,9.0,1.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,50.695167,85.099941,9.0,4.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,27.748874,100.0,4.0,1.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,44.271887,70.710678,16.0,1.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,40.987803,89.88882,9.0,1.0


Model

In [23]:
# Input columns for every model in this notebook.
# Order matters: fitted coefficients are matched back to these names by position.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [8]:
# Lasso fit on the complete sales table with a fixed L1 penalty of 500.
# NOTE(review): the `normalize=` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell needs an older scikit-learn release to run.
model_all = Lasso(alpha=500.0, normalize=True)
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

Qn 1

In [27]:
# Fitted weights of model_all, one per entry of all_features and in the same order.
model_all.coef_ 

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

In [28]:
# Names of the features that model_all kept (nonzero weight), read from the
# fitted coefficients instead of hand-copied positions so the answer stays
# correct if the fit or the feature list changes.
tuple(name for name, weight in zip(all_features, model_all.coef_) if weight != 0)

('sqft_living', 'view', 'grade')

Qn 2

In [29]:
# Read the pre-made test / train / validation splits with the shared dtype mapping.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [30]:
def add_engineered_features(frame):
    """Add the square-root and squared feature columns to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'sqft_living', 'sqft_lot', 'bedrooms' and
        'floors'.

    Returns
    -------
    pandas.DataFrame
        The same object, now also holding 'sqft_living_sqrt', 'sqft_lot_sqrt',
        'bedrooms_square' and 'floors_square'.
    """
    # np.sqrt is applied to the whole column at once rather than row by row.
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One shared transformation for all three splits, so their feature columns
# cannot drift apart through copy-paste edits.
for split in (testing, training, validation):
    add_engineered_features(split)

In [37]:
# Coarse sweep of the L1 penalty over 13 log-spaced values from 1e1 to 1e7:
# fit on the training split, score on the validation split.
# Despite its name, `rss` collects mean squared errors. MSE is the residual sum
# of squares divided by the number of validation rows, so the ranking of
# penalties is the same either way.
# NOTE(review): `normalize=` was removed from Lasso in scikit-learn 1.2.
X_train, y_train = training[all_features], training['price']
X_valid, y_valid = validation[all_features], validation['price']

rss = []
for l1_penalty in np.logspace(1, 7, num=13):
    model = Lasso(alpha=l1_penalty, normalize=True).fit(X_train, y_train)
    predict = model.predict(X_valid)
    rss.append(mean_squared_error(y_valid, predict))

rss

[41329873098.093918,
 41415869253.071808,
 44607327874.681847,
 48130755687.090935,
 67036713402.574013,
 126881874356.73721,
 126881874356.73721,
 126881874356.73721,
 126881874356.73721,
 126881874356.73721,
 126881874356.73721,
 126881874356.73721,
 126881874356.73721]

In [40]:
# Position (in the penalty grid) of the lowest validation error.
lowest_error = min(rss)
rss.index(lowest_error)

0

In [43]:
# Pick the penalty whose validation error is lowest. The index is looked up in
# `rss` rather than hard-coded, so it follows the data if the sweep is re-run.
# The grid must match the one used to fill `rss` (13 log-spaced values, 1e1..1e7).
penalty_grid = np.logspace(1, 7, num=13)
best_penalty = penalty_grid[rss.index(min(rss))]

Qn 3

In [44]:
# Refit on the training split with the selected penalty and show the weights
# (same order as all_features).
model = Lasso(alpha=best_penalty, normalize=True).fit(
    training[all_features], training['price']
)
model.coef_

array([ -1.61445628e+04,   3.73245384e+02,   5.08412433e+04,
         6.17853560e+02,  -4.44113549e+04,   7.85623065e-01,
        -7.01194765e+02,  -0.00000000e+00,   5.01420046e+03,
         6.19488752e+05,   3.80418557e+04,   2.49987718e+04,
         1.28716235e+05,   0.00000000e+00,   0.00000000e+00,
        -3.29383118e+03,   1.00573209e+01])

In [45]:
# Intercept of the model fitted with best_penalty.
model.intercept_

6630155.6686283601

Qn 4

In [87]:
# Sparsity target for Qn 4 and Qn 5.
# NOTE(review): presumably the number of nonzero weights wanted in the final
# model — confirm against the assignment text.
max_nonzeros = 7

In [88]:
# For 20 log-spaced penalties between 1e1 and 1e4, record how many fitted
# parameters are nonzero. The count includes the intercept as well as the
# feature weights.
non_zero = []
for l1_penalty in np.logspace(1, 4, num=20):
    model = Lasso(alpha=l1_penalty, normalize=True).fit(
        training[all_features], training['price']
    )
    n_weights = np.count_nonzero(model.coef_)
    n_intercept = np.count_nonzero(model.intercept_)
    non_zero.append(n_weights + n_intercept)

non_zero

[15, 15, 15, 15, 13, 12, 11, 10, 7, 6, 6, 6, 5, 3, 3, 2, 1, 1, 1, 1]

In [91]:
# Grid positions that bracket the sparsity target: the last penalty that still
# leaves more than max_nonzeros nonzeros, and the first that leaves fewer.
# Comparing against max_nonzeros avoids hard-coding the neighbouring counts.
too_dense = [i for i, count in enumerate(non_zero) if count > max_nonzeros]
too_sparse = [i for i, count in enumerate(non_zero) if count < max_nonzeros]

print(max(too_dense))
print(min(too_sparse))

7
9


In [96]:
# Narrow the search range around the sparsity target:
#   l1_penalty_min - largest penalty that still gives more than max_nonzeros nonzeros
#   l1_penalty_max - smallest penalty that gives fewer than max_nonzeros nonzeros
# Both are derived from `non_zero` rather than hard-coded grid positions.
# `non_zero` must still hold the counts from the 1e1..1e4 log-grid sweep; re-run
# that sweep first if a later cell has overwritten it.
coarse_penalties = np.logspace(1, 4, num=20)
l1_penalty_min = max(
    penalty for penalty, count in zip(coarse_penalties, non_zero) if count > max_nonzeros
)
l1_penalty_max = min(
    penalty for penalty, count in zip(coarse_penalties, non_zero) if count < max_nonzeros
)

print(l1_penalty_min)
print(l1_penalty_max)

127.42749857
263.665089873


Qn 5

In [98]:
# Fine sweep: 20 evenly spaced penalties between l1_penalty_min and
# l1_penalty_max. For each one, record the validation error and the number of
# nonzero parameters (feature weights plus intercept).
# As in the coarse sweep, `rss` holds mean squared errors.
non_zero, rss = [], []
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, num=20):
    model = Lasso(alpha=l1_penalty, normalize=True).fit(
        training[all_features], training['price']
    )
    predict = model.predict(validation[all_features])
    rss.append(mean_squared_error(validation['price'], predict))
    non_zero.append(
        np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    )

print(non_zero)
print(rss)

[10, 10, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6]
[45186785376.510712, 45356432706.224319, 45483770460.499443, 45579547254.764923, 45670717723.229538, 45747533953.461884, 45829444534.524124, 45916597113.509727, 46009000194.531929, 46106879141.270531, 46209729096.275475, 46317477619.58551, 46405077263.584915, 46496957742.767509, 46593119530.176964, 46693562625.813293, 46798287366.696754, 46907361354.954544, 47020646016.356964, 47138211693.060226]


In [105]:
# Position of the lowest validation error among the fits whose nonzero count
# equals max_nonzeros. The candidates are selected by their count rather than
# by a hand-written slice, and the result is always one of those candidates.
matching = [i for i, count in enumerate(non_zero) if count == max_nonzeros]
min(matching, key=lambda i: rss[i])

4

In [109]:
# Penalty of the best fit with exactly max_nonzeros nonzeros, looked up from
# the fine-sweep results instead of a hard-coded grid position.
# The grid must match the one used to fill `rss` / `non_zero` in the fine sweep.
fine_penalties = np.linspace(l1_penalty_min, l1_penalty_max, num=20)
matching = [i for i, count in enumerate(non_zero) if count == max_nonzeros]
best_l1_penalty = fine_penalties[min(matching, key=lambda i: rss[i])]

In [110]:
# Final model: Lasso refit on the training split with best_l1_penalty.
model = Lasso(alpha=best_l1_penalty, normalize=True)
model.fit(X=training[all_features], y=training['price'])

Lasso(alpha=156.10909673930755, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [111]:
# Weights of the model fitted with best_l1_penalty, in all_features order.
model.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   1.06108903e+04,
         1.63380252e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.06451687e+05,   4.19600436e+04,   0.00000000e+00,
         1.16253554e+05,   0.00000000e+00,   0.00000000e+00,
        -2.61223488e+03,   0.00000000e+00])

In [113]:
# Intercept of the model fitted with best_l1_penalty.
model.intercept_

4422190.2791203512

In [114]:
# Names of the features with a nonzero weight in the final model, read from the
# fitted coefficients instead of hand-copied positions.
tuple(name for name, weight in zip(all_features, model.coef_) if weight != 0)

('bathrooms', 'sqft_living', 'waterfront', 'view', 'grade', 'yr_built')