Importing Libraries

In [59]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Column name -> dtype mapping handed to pd.read_csv for every CSV loaded in this notebook.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

Importing Data

In [5]:
# Read the full data set and order the rows by living area, breaking ties by price.
sales = (
    pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
    .sort_values(['sqft_living', 'price'])
)

print(sales.head())

               id             date     price  bedrooms  bathrooms  \
19452  3980300371  20140926T000000  142000.0       0.0       0.00   
15381  2856101479  20140701T000000  276000.0       1.0       0.75   
860    1723049033  20140620T000000  245000.0       1.0       0.75   
18379  1222029077  20141029T000000  265000.0       0.0       0.75   
4868   6896300380  20141002T000000  228000.0       0.0       1.00   

       sqft_living  sqft_lot  floors  waterfront  view     ...      grade  \
19452        290.0     20875     1.0           0     0     ...          1   
15381        370.0      1801     1.0           0     0     ...          5   
860          380.0     15000     1.0           0     0     ...          5   
18379        384.0    213444     1.0           0     0     ...          4   
4868         390.0      5900     1.0           0     0     ...          4   

       sqft_above  sqft_basement  yr_built  yr_renovated  zipcode      lat  \
19452         290              0      1963  

Function to Add Polynomial Features

In [6]:
def polynomial_sframe(feature, degree):
    """Build a DataFrame holding successive integer powers of one feature.

    Parameters
    ----------
    feature : pandas.Series or array-like
        Values to raise to the powers 1..degree.
    degree : int
        Highest power to generate. A value below 2 yields only ``power_1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``power_1`` ... ``power_<degree>`` where ``power_i`` holds
        ``feature ** i``. ``power_1`` is ``feature`` itself; when ``feature``
        is a Series its index is carried over.
    """
    feature_df = pd.DataFrame()
    feature_df['power_1'] = feature

    # Higher powers are computed on a float copy with a vectorised `**`:
    # this avoids a Python-level call per row and keeps integer input from
    # overflowing int64 at high degrees.
    base = feature_df['power_1'].astype(float)
    for power in range(2, degree + 1):
        feature_df['power_' + str(power)] = base ** power

    return feature_df
    

In [7]:
# Single input column (living area) and the regression target (price).
feature = sales['sqft_living']
output = sales['price']

print(feature.head())
print(output.head())

19452    290.0
15381    370.0
860      380.0
18379    384.0
4868     390.0
Name: sqft_living, dtype: float64
19452    142000.0
15381    276000.0
860      245000.0
18379    265000.0
4868     228000.0
Name: price, dtype: float64


Qn 1

In [8]:
# Expand the living-area feature into the columns power_1 ... power_15.
feature_15 = polynomial_sframe(feature, 15)

print(feature_15.head())

       power_1   power_2     power_3       power_4       power_5  \
19452    290.0   84100.0  24389000.0  7.072810e+09  2.051115e+12   
15381    370.0  136900.0  50653000.0  1.874161e+10  6.934396e+12   
860      380.0  144400.0  54872000.0  2.085136e+10  7.923517e+12   
18379    384.0  147456.0  56623104.0  2.174327e+10  8.349416e+12   
4868     390.0  152100.0  59319000.0  2.313441e+10  9.022420e+12   

            power_6       power_7       power_8       power_9      power_10  \
19452  5.948233e+14  1.724988e+17  5.002464e+19  1.450715e+22  4.207072e+24   
15381  2.565726e+15  9.493188e+17  3.512479e+20  1.299617e+23  4.808584e+25   
860    3.010936e+15  1.144156e+18  4.347792e+20  1.652161e+23  6.278212e+25   
18379  3.206176e+15  1.231172e+18  4.727699e+20  1.815436e+23  6.971275e+25   
4868   3.518744e+15  1.372310e+18  5.352009e+20  2.087284e+23  8.140406e+25   

           power_11      power_12      power_13      power_14      power_15  
19452  1.220051e+27  3.538148e+29  1.0

Ridge Regression

In [9]:
# L2 penalty (Ridge `alpha`) used for the first fit on the full data set.
l2_small_penalty = 1.5e-5

In [10]:
# Ridge estimator with the small penalty and feature normalisation switched on.
# NOTE(review): recent scikit-learn releases no longer accept `normalize` — confirm the installed version.
model = Ridge(alpha=l2_small_penalty, normalize=True)

In [11]:
# Fit on the 15 polynomial columns; the price Series is wrapped in a one-column DataFrame.
model.fit(feature_15, pd.DataFrame(output))

Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [12]:
# Display the fitted coefficients (one per power_i column).
model.coef_

array([[ 1.24873306e+02, -4.77376011e-02,  3.01446238e-05,
        -2.44419942e-09, -1.94153675e-13,  8.54085686e-18,
         1.51142121e-21,  8.27979094e-26,  6.52603100e-31,
        -3.27895017e-34, -3.87962315e-38, -2.72437650e-42,
        -1.07790800e-46,  3.78242694e-51,  1.39790296e-54]])

Questions 2 and 3: degree-15 fits with a very small L2 penalty on four data subsets

In [14]:
# Load the four data subsets with the shared column dtypes.
set_1, set_2, set_3, set_4 = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_set_1_data.csv',
        'wk3_kc_house_set_2_data.csv',
        'wk3_kc_house_set_3_data.csv',
        'wk3_kc_house_set_4_data.csv',
    )
]

In [15]:
# Penalty for the per-subset fits; this rebinds the name that was first set to 1.5e-5.
l2_small_penalty=1e-9

In [16]:
# Degree-15 polynomial features of living area, one frame per subset.
set_1_15, set_2_15, set_3_15, set_4_15 = [
    polynomial_sframe(subset['sqft_living'], 15)
    for subset in (set_1, set_2, set_3, set_4)
]

In [17]:
# New Ridge estimator using the 1e-9 penalty; the following cells refit this same object on each subset.
# NOTE(review): recent scikit-learn releases no longer accept `normalize` — confirm the installed version.
model = Ridge(alpha=l2_small_penalty, normalize=True)

In [18]:
### Model1: refit on subset 1 and show the coefficients

model.fit(set_1_15, set_1[['price']])
model.coef_

array([[ 5.44669399e+02, -3.55447605e-01,  1.22446380e-04,
        -1.17175307e-08, -3.90512698e-13, -1.39075910e-17,
         1.47860283e-20,  6.87491630e-25, -7.57203971e-29,
        -1.04097276e-32, -3.71844269e-37,  3.39989255e-41,
         5.56592051e-45,  2.53761389e-49, -3.35152915e-53]])

In [19]:
### Model2: refit on subset 2 and show the coefficients

model.fit(set_2_15, set_2[['price']])
model.coef_

array([[ 8.59362651e+02, -8.18118278e-01,  4.28879983e-04,
        -9.12770660e-08, -2.69604404e-12,  3.73980300e-15,
        -1.42711882e-19, -6.30794703e-23, -1.44559628e-27,
         7.44321353e-31,  9.25865883e-35,  3.28010472e-41,
        -1.29543508e-42, -1.38781255e-46,  1.66546444e-50]])

In [20]:
### Model3: refit on subset 3 and show the coefficients

model.fit(set_3_15, set_3[['price']])
model.coef_

array([[-7.55395963e+02,  9.75579541e-01, -4.58946006e-04,
         7.77958112e-08,  7.15013417e-12, -2.88602002e-15,
        -2.13677720e-20,  3.38085194e-23,  2.19178226e-27,
        -1.97067793e-31, -4.15993202e-35, -1.80196143e-39,
         3.19071198e-43,  5.08456859e-47, -3.93304243e-51]])

In [21]:
### Model4: refit on subset 4 and show the coefficients

model.fit(set_4_15, set_4[['price']])
model.coef_

array([[ 1.11944571e+03, -9.83760212e-01,  3.38770897e-04,
         3.60377210e-08, -4.37814017e-11,  5.77191690e-15,
         7.66795221e-19, -9.49297664e-23, -1.96030821e-26,
        -2.10880284e-32,  3.31005065e-34,  3.47733891e-38,
        -2.43039323e-42, -8.79553219e-46,  6.44569659e-50]])

Questions 4 and 5: the same four fits with a large L2 penalty

In [22]:
# Much larger L2 penalty (123) used for the next four fits.
l2_large_penalty=1.23e2

In [23]:
# New Ridge estimator with the large penalty; the following cells refit this same object on each subset.
# NOTE(review): recent scikit-learn releases no longer accept `normalize` — confirm the installed version.
model = Ridge(alpha=l2_large_penalty, normalize=True)

In [24]:
### Model1: refit on subset 1 with the large penalty and show the coefficients

model.fit(set_1_15, set_1[['price']])
model.coef_

array([[2.32806803e+00, 3.53621608e-04, 3.31969692e-08, 2.00082477e-12,
        1.11492559e-16, 6.57786122e-21, 4.12939525e-25, 2.70393755e-29,
        1.81614763e-33, 1.23824277e-37, 8.51872481e-42, 5.89455598e-46,
        4.09542560e-50, 2.85464889e-54, 1.99547476e-58]])

In [25]:
### Model2: refit on subset 2 with the large penalty and show the coefficients

model.fit(set_2_15, set_2[['price']])
model.coef_

array([[2.09756903e+00, 3.90817483e-04, 6.67189944e-08, 8.90002997e-12,
        9.72639877e-16, 9.69733682e-20, 9.50564475e-24, 9.44491031e-28,
        9.57191338e-32, 9.86945155e-36, 1.03101115e-39, 1.08729784e-43,
        1.15453748e-47, 1.23211305e-51, 1.31986696e-55]])

In [26]:
### Model3: refit on subset 3 with the large penalty and show the coefficients

model.fit(set_3_15, set_3[['price']])
model.coef_

array([[2.28906258e+00, 4.12472190e-04, 6.08835345e-08, 6.58572163e-12,
        6.15278155e-16, 5.64446634e-20, 5.28834396e-24, 5.07091402e-28,
        4.94657273e-32, 4.88043809e-36, 4.85009106e-40, 4.84161534e-44,
        4.84635021e-48, 4.85883628e-52, 4.87558469e-56]])

In [27]:
### Model4: refit on subset 4 with the large penalty and show the coefficients

model.fit(set_4_15, set_4[['price']])
model.coef_

array([[2.08596194e+00, 4.05035772e-04, 7.46864647e-08, 1.13096608e-11,
        1.45864442e-15, 1.73561251e-19, 2.01609632e-23, 2.34605255e-27,
        2.75636073e-31, 3.27043069e-35, 3.91046855e-39, 4.70118041e-43,
        5.67212304e-47, 6.85958087e-51, 8.30843630e-55]])

Question 6: choosing the L2 penalty with 10-fold cross-validation

In [28]:
# Rows used for cross-validation (training + validation) and the held-out test rows.
train_valid_shuffled = pd.read_csv(
    'wk3_kc_house_train_valid_shuffled.csv',
    dtype=dtype_dict,
)
test = pd.read_csv(
    'wk3_kc_house_test_data.csv',
    dtype=dtype_dict,
)

print(train_valid_shuffled.head())
print(test.head())

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  2780400035  20140505T000000  665000.0       4.0       2.50       2800.0   
1  1703050500  20150321T000000  645000.0       3.0       2.50       2490.0   
2  5700002325  20140605T000000  640000.0       3.0       1.75       2340.0   
3  0475000510  20141118T000000  594000.0       3.0       1.00       1320.0   
4  0844001052  20150128T000000  365000.0       4.0       2.50       1904.0   

   sqft_lot  floors  waterfront  view     ...      grade  sqft_above  \
0      5900     1.0           0     0     ...          8        1660   
1      5978     2.0           0     0     ...          9        2490   
2      4206     1.0           0     0     ...          7        1170   
3      5000     1.0           0     0     ...          7        1090   
4      8200     2.0           0     0     ...          7        1904   

   sqft_basement  yr_built  yr_renovated  zipcode      lat     long  \
0           1140      1963 

In [32]:
# Preview the row ranges that make up each validation fold.
n = len(train_valid_shuffled)
k = 10 # 10-fold cross-validation

for i in range(k):
    # Fold i spans rows start..end, both ends inclusive; consecutive folds are contiguous.
    start = int((n*i)/k)
    end = int((n*(i+1))/k-1)
    print (i, (start, end))

0 (0, 1938)
1 (1939, 3878)
2 (3879, 5817)
3 (5818, 7757)
4 (7758, 9697)
5 (9698, 11636)
6 (11637, 13576)
7 (13577, 15515)
8 (15516, 17455)
9 (17456, 19395)


In [63]:
def k_fold_cross_validation(k, l2_penalty, data, output):
    """Estimate the validation error of a Ridge model by k-fold cross-validation.

    The rows of ``data`` are split, in their existing order, into ``k``
    contiguous folds. Each fold is held out once while a Ridge model is
    fitted on the remaining rows, and the held-out mean squared error is
    recorded.

    Parameters
    ----------
    k : int
        Number of folds; must satisfy ``2 <= k <= len(data)``.
    l2_penalty : float
        Value passed to ``Ridge`` as ``alpha``.
    data : pandas.DataFrame
        Feature matrix, one row per observation.
    output : pandas.Series or pandas.DataFrame
        Target values, row-aligned with ``data`` by position.

    Returns
    -------
    float
        Mean over all ``k`` folds of the validation mean squared error.

    Raises
    ------
    ValueError
        If ``k`` is out of range or ``output`` has a different number of
        rows than ``data``.
    """
    # Fold boundaries come from the data actually passed in, never from a
    # notebook-level variable that other cells may rebind.
    n_rows = len(data)
    if not 2 <= k <= n_rows:
        raise ValueError("k must be between 2 and the number of rows in data")

    output = pd.DataFrame(output)
    if len(output) != n_rows:
        raise ValueError("data and output must have the same number of rows")

    fold_errors = []
    for i in range(k):  # every fold, including the last one
        start = (n_rows * i) // k
        stop = (n_rows * (i + 1)) // k  # exclusive upper bound of fold i

        # Positional, half-open slices: the validation rows never appear in
        # the training set, and the index labels of `data` do not matter.
        valid = data.iloc[start:stop]
        valid_out = output.iloc[start:stop]
        train = pd.concat([data.iloc[:start], data.iloc[stop:]])
        train_out = pd.concat([output.iloc[:start], output.iloc[stop:]])

        # NOTE(review): recent scikit-learn releases no longer accept
        # `normalize` — confirm the installed version.
        model = Ridge(alpha=l2_penalty, normalize=True)
        model.fit(train, train_out)

        fold_errors.append(mean_squared_error(valid_out, model.predict(valid)))

    return np.mean(fold_errors)

In [31]:
# Degree-15 polynomial features of living area for the cross-validation rows.
train_valid_shuffled_15 = polynomial_sframe(train_valid_shuffled['sqft_living'], 15)

print(train_valid_shuffled_15.head())

   power_1    power_2       power_3       power_4       power_5       power_6  \
0   2800.0  7840000.0  2.195200e+10  6.146560e+13  1.721037e+17  4.818903e+20   
1   2490.0  6200100.0  1.543825e+10  3.844124e+13  9.571869e+16  2.383395e+20   
2   2340.0  5475600.0  1.281290e+10  2.998220e+13  7.015834e+16  1.641705e+20   
3   1320.0  1742400.0  2.299968e+09  3.035958e+12  4.007464e+15  5.289853e+18   
4   1904.0  3625216.0  6.902411e+09  1.314219e+13  2.502273e+16  4.764328e+19   

        power_7       power_8       power_9      power_10      power_11  \
0  1.349293e+24  3.778020e+27  1.057846e+31  2.961968e+34  8.293509e+37   
1  5.934654e+23  1.477729e+27  3.679545e+30  9.162067e+33  2.281355e+37   
2  3.841590e+23  8.989320e+26  2.103501e+30  4.922192e+33  1.151793e+37   
3  6.982606e+21  9.217040e+24  1.216649e+28  1.605977e+31  2.119890e+34   
4  9.071281e+22  1.727172e+26  3.288535e+29  6.261371e+32  1.192165e+36   

       power_12      power_13      power_14      power_15  
0 

In [94]:
# Candidate penalties: 13 log-spaced values from 1e3 to 1e9.
penalty = np.logspace(3, 9, num=13)

# Cross-validated error for each candidate, in the same order as `penalty`.
rss_penalty = [
    k_fold_cross_validation(10, item, train_valid_shuffled_15, train_valid_shuffled['price'])
    for item in penalty
]

In [101]:
# Position of the smallest cross-validated error (first one on ties).
index = min(range(len(rss_penalty)), key=rss_penalty.__getitem__)
print(penalty[index])

1000.0


Question 7: test-set RSS for the model trained with the selected L2 penalty

In [75]:
# Degree-15 polynomial features of living area for the test rows.
test_15 = polynomial_sframe(test['sqft_living'], 15)

print(test_15.head())

   power_1    power_2       power_3       power_4       power_5       power_6  \
0   1890.0  3572100.0  6.751269e+09  1.275990e+13  2.411621e+16  4.557963e+19   
1   1810.0  3276100.0  5.929741e+09  1.073283e+13  1.942642e+16  3.516183e+19   
2   1200.0  1440000.0  1.728000e+09  2.073600e+12  2.488320e+15  2.985984e+18   
3   2330.0  5428900.0  1.264934e+10  2.947296e+13  6.867199e+16  1.600057e+20   
4   1220.0  1488400.0  1.815848e+09  2.215335e+12  2.702708e+15  3.297304e+18   

        power_7       power_8       power_9      power_10      power_11  \
0  8.614551e+22  1.628150e+26  3.077204e+29  5.815915e+32  1.099208e+36   
1  6.364291e+22  1.151937e+26  2.085005e+29  3.773860e+32  6.830686e+35   
2  3.583181e+21  4.299817e+24  5.159780e+27  6.191736e+30  7.430084e+33   
3  3.728133e+23  8.686551e+26  2.023966e+30  4.715842e+33  1.098791e+37   
4  4.022711e+21  4.907707e+24  5.987403e+27  7.304631e+30  8.911650e+33   

       power_12      power_13      power_14      power_15  
0 

In [87]:
# Use the penalty chosen by cross-validation instead of a hard-coded copy of
# its value, so this cell stays correct if the selection changes.
# NOTE(review): recent scikit-learn releases no longer accept `normalize` — confirm the installed version.
model = Ridge(alpha=penalty[index], normalize=True)

In [88]:
# Fit on all training+validation rows; the price column is wrapped in a one-column DataFrame.
model.fit(train_valid_shuffled_15, pd.DataFrame(train_valid_shuffled['price']))

Ridge(alpha=1000, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [89]:
# Predicted prices for the test rows.
predict = model.predict(test_15)

predict

array([[538912.62911772],
       [538869.40381503],
       [538576.80910253],
       ...,
       [539536.98628839],
       [538551.27510185],
       [538581.11747518]])

In [90]:
# Number of test predictions.
# NOTE: this rebinds the notebook-level `n`, which an earlier cell set to len(train_valid_shuffled).
n = len(predict)

n

2217

In [91]:
# Residual sum of squares on the test set: mean squared error times the number
# of test predictions. The count is read from `predict` itself so the result
# does not depend on whichever cell last assigned the notebook-level `n`.
rss = mean_squared_error(test['price'], predict) * len(predict)

# Report in units of 1e14 (the former divisor `10e13` is the same number).
rss/1e14

2.8385686122415086