In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from google.colab import drive
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

kc_house_data = pd.read_csv('gdrive/My Drive/uwml/kc_house_data.csv', dtype=dtype_dict)
testing = pd.read_csv('gdrive/My Drive/uwml/wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('gdrive/My Drive/uwml/wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('gdrive/My Drive/uwml/wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [4]:
sales = kc_house_data.copy()
sales

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180.0,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770.0,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960.0,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680.0,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,0263000018,20140521T000000,360000.0,3.0,2.50,1530.0,1131,3.0,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530.0,1509.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310.0,5813,2.0,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830.0,7200.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020.0,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020.0,2007.0
21611,0291310100,20150116T000000,400000.0,3.0,2.50,1600.0,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410.0,1287.0


In [5]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [6]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [7]:
# Step 3 / Q. 1
pd.DataFrame({'feature': ['intercept']+all_features, 'coefficients': np.insert(model_all.coef_, 0, model_all.intercept_)})

Unnamed: 0,feature,coefficients
0,intercept,-218136.214035
1,bedrooms,0.0
2,bedrooms_square,0.0
3,bathrooms,0.0
4,sqft_living,134.439314
5,sqft_living_sqrt,0.0
6,sqft_lot,0.0
7,sqft_lot_sqrt,0.0
8,floors,0.0
9,floors_square,0.0


In [8]:
# Step 4
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [9]:
# step 5
l1_penalty_list = np.logspace(1, 7, num=13)
l1_penalty_list

array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07])

In [10]:
# Create & fit models in loop
counter = 1
model_dict = {}
for penalty in l1_penalty_list:
  model_dict["model_{0}_fit".format(counter)] = linear_model.Lasso(alpha=penalty, normalize=True).fit(training[all_features], training['price'])
  counter+=1

In [11]:
# Predict on validation data and return RSS
counter = 1
rss_dict = {}
for i in range(1,14):
  yhat = model_dict["model_{0}_fit".format(counter)].predict(validation[all_features])
  y = validation['price']
  rss_dict["model_{0}_rss".format(counter)] = sum((yhat-y)**2)/1e12
  counter+=1

In [12]:
# Step 6 / Q. 2
rss_dict

{'model_10_rss': 1222.5068594271568,
 'model_11_rss': 1222.5068594271568,
 'model_12_rss': 1222.5068594271568,
 'model_13_rss': 1222.5068594271568,
 'model_1_rss': 398.21332730013444,
 'model_2_rss': 399.0419002533484,
 'model_3_rss': 429.7916040725582,
 'model_4_rss': 463.7398310451194,
 'model_5_rss': 645.8987336338032,
 'model_6_rss': 1222.5068594271568,
 'model_7_rss': 1222.5068594271568,
 'model_8_rss': 1222.5068594271568,
 'model_9_rss': 1222.5068594271568}

In [13]:
# Step 8 / Q. 3
np.count_nonzero(model_dict['model_1_fit'].coef_) + np.count_nonzero(model_dict['model_1_fit'].intercept_)

15

In [14]:
# Step 9
long_l1_penalty_list = np.logspace(1, 4, num=20)
long_l1_penalty_list

array([   10.        ,    14.38449888,    20.69138081,    29.76351442,
          42.81332399,    61.58482111,    88.58667904,   127.42749857,
         183.29807108,   263.66508987,   379.26901907,   545.55947812,
         784.75997035,  1128.83789168,  1623.77673919,  2335.72146909,
        3359.81828628,  4832.93023857,  6951.92796178, 10000.        ])

In [15]:
# Create & fit models in loop
counter = 1
long_model_dict = {}
for penalty in long_l1_penalty_list:
  long_model_dict["model_{0}_fit".format(counter)] = linear_model.Lasso(alpha=penalty, normalize=True).fit(training[all_features], training['price'])
  counter+=1

In [16]:
for key, value in long_model_dict.items():
  print(key, ': number of non zero coefficients : ', np.count_nonzero(value.coef_) + np.count_nonzero(value.intercept_))

model_1_fit : number of non zero coefficients :  15
model_2_fit : number of non zero coefficients :  15
model_3_fit : number of non zero coefficients :  15
model_4_fit : number of non zero coefficients :  15
model_5_fit : number of non zero coefficients :  13
model_6_fit : number of non zero coefficients :  12
model_7_fit : number of non zero coefficients :  11
model_8_fit : number of non zero coefficients :  10
model_9_fit : number of non zero coefficients :  7
model_10_fit : number of non zero coefficients :  6
model_11_fit : number of non zero coefficients :  6
model_12_fit : number of non zero coefficients :  6
model_13_fit : number of non zero coefficients :  5
model_14_fit : number of non zero coefficients :  3
model_15_fit : number of non zero coefficients :  3
model_16_fit : number of non zero coefficients :  2
model_17_fit : number of non zero coefficients :  1
model_18_fit : number of non zero coefficients :  1
model_19_fit : number of non zero coefficients :  1
model_20_fit 

In [17]:
max_non_zeros = 7
l1_penalty_min = long_l1_penalty_list[7]
l1_penalty_max = long_l1_penalty_list[9]

In [18]:
# Step 13 / Q. 4
print(l1_penalty_min)
print(l1_penalty_max)

127.42749857031335
263.6650898730358


In [19]:
#Step 14
sevens_l1_penalty_list = np.linspace(l1_penalty_min, l1_penalty_max, num=20)
sevens_l1_penalty_list

array([127.42749857, 134.59789811, 141.76829765, 148.9386972 ,
       156.10909674, 163.27949628, 170.44989582, 177.62029537,
       184.79069491, 191.96109445, 199.13149399, 206.30189354,
       213.47229308, 220.64269262, 227.81309216, 234.9834917 ,
       242.15389125, 249.32429079, 256.49469033, 263.66508987])

In [20]:
# Create & fit models in loop
counter = 1
sevens_model_dict = {}
for penalty in sevens_l1_penalty_list:
  sevens_model_dict["sevens_model_{0}_fit".format(counter)] = linear_model.Lasso(alpha=penalty, normalize=True).fit(training[all_features], training['price'])
  counter+=1

In [21]:
# Predict on validation data and return RSS
counter = 1
sevens_rss_dict = {}
for i in range(1,14):
  yhat = sevens_model_dict["sevens_model_{0}_fit".format(counter)].predict(validation[all_features])
  y = validation['price']
  sevens_rss_dict["sevens_model_{0}_rss".format(counter)] = sum((yhat-y)**2)/1e12
  counter+=1

In [22]:
# Step 15 / Q. 5
sevens_rss_dict

{'sevens_model_10_rss': 444.2397805261406,
 'sevens_model_11_rss': 445.23073984261407,
 'sevens_model_12_rss': 446.26889686470537,
 'sevens_model_13_rss': 447.11291943464187,
 'sevens_model_1_rss': 435.37467710267896,
 'sevens_model_2_rss': 437.0092291244734,
 'sevens_model_3_rss': 438.23612838691145,
 'sevens_model_4_rss': 439.15893779965984,
 'sevens_model_5_rss': 440.0373652633169,
 'sevens_model_6_rss': 440.7774896416052,
 'sevens_model_7_rss': 441.5666980901388,
 'sevens_model_8_rss': 442.4064131886657,
 'sevens_model_9_rss': 443.2967168743129}

In [23]:
for key, value in sevens_model_dict.items():
  print(key, ': number of non zero coefficients : ', np.count_nonzero(value.coef_) + np.count_nonzero(value.intercept_))

sevens_model_1_fit : number of non zero coefficients :  10
sevens_model_2_fit : number of non zero coefficients :  10
sevens_model_3_fit : number of non zero coefficients :  8
sevens_model_4_fit : number of non zero coefficients :  8
sevens_model_5_fit : number of non zero coefficients :  7
sevens_model_6_fit : number of non zero coefficients :  7
sevens_model_7_fit : number of non zero coefficients :  7
sevens_model_8_fit : number of non zero coefficients :  7
sevens_model_9_fit : number of non zero coefficients :  7
sevens_model_10_fit : number of non zero coefficients :  7
sevens_model_11_fit : number of non zero coefficients :  7
sevens_model_12_fit : number of non zero coefficients :  6
sevens_model_13_fit : number of non zero coefficients :  6
sevens_model_14_fit : number of non zero coefficients :  6
sevens_model_15_fit : number of non zero coefficients :  6
sevens_model_16_fit : number of non zero coefficients :  6
sevens_model_17_fit : number of non zero coefficients :  6
seve

Answer is model 5

In [24]:
# Step 16 / Q. 6
pd.DataFrame({'feature': ['intercept']+all_features, 'coefficients': np.insert(sevens_model_dict["sevens_model_5_fit"].coef_, 0, sevens_model_dict["sevens_model_5_fit"].intercept_)})

Unnamed: 0,feature,coefficients
0,intercept,4422190.0
1,bedrooms,-0.0
2,bedrooms_square,-0.0
3,bathrooms,10610.89
4,sqft_living,163.3803
5,sqft_living_sqrt,0.0
6,sqft_lot,-0.0
7,sqft_lot_sqrt,-0.0
8,floors,0.0
9,floors_square,0.0


In [25]:
sevens_model_dict["sevens_model_5_fit"].coef_

array([-0.00000000e+00, -0.00000000e+00,  1.06108903e+04,  1.63380252e+02,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  5.06451687e+05,  4.19600436e+04,  0.00000000e+00,
        1.16253554e+05,  0.00000000e+00,  0.00000000e+00, -2.61223488e+03,
        0.00000000e+00])