In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_numpy_data(data, features, output):
    """Build the design matrix and target vector for a regression problem.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the feature columns and the target column.
    features : sequence of str
        Names of the feature columns, in the order they should appear.
    output : str
        Name of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features_matrix, output_array)``. ``features_matrix`` has one row
        per row of ``data``; column 0 is the constant 1 (intercept term) and
        the remaining columns are ``features`` in the given order.
        ``output_array`` is the 1-D array of values from column ``output``.

    Notes
    -----
    ``data`` is not modified: the intercept column is added to a temporary
    frame rather than written into the caller's DataFrame.
    """
    columns = ['constant'] + list(features)
    # `assign` returns a new frame, so the intercept column never leaks into `data`;
    # re-selecting with `columns` puts the constant first.
    design_frame = data[list(features)].assign(constant=1)[columns]
    # `DataFrame.as_matrix` was removed in pandas 1.0; `to_numpy` is its replacement.
    features_matrix = design_frame.to_numpy()
    output_array = data[output].to_numpy()
    return (features_matrix, output_array)

In [3]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions ``feature_matrix . weights``.

    ``feature_matrix`` holds one observation per row and ``weights`` one
    coefficient per column; the result has one prediction per row.
    """
    return np.dot(feature_matrix, weights)

In [5]:
# Explicit column dtypes so that identifiers (id, zipcode, date) stay strings
# and numeric columns are parsed consistently across the three CSV files.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set plus the train / test splits stored as separate files.
sales = pd.read_csv('kc_house_data.csv.zip', dtype=dtype_dict)
train = pd.read_csv('kc_house_train_data.csv.zip', dtype=dtype_dict)
test = pd.read_csv('kc_house_test_data.csv.zip', dtype=dtype_dict)

In [6]:
def normalize_features(features):
    """Scale every column of ``features`` by its Euclidean (2-) norm.

    Returns a tuple ``(normalized_features, norms)`` where ``norms`` holds the
    per-column norms that were divided out, in column order.
    """
    column_norms = np.linalg.norm(features, axis=0)
    return (features / column_norms, column_norms)

In [12]:
# Quick check on a tiny matrix: column norms are 5, 10 and 15.
normalize_features(np.array([[3, 6, 9],
                             [4, 8, 12]]))

(array([[0.6, 0.6, 0.6],
        [0.8, 0.8, 0.8]]), array([ 5., 10., 15.]))

In [17]:
# Evaluate ro[i] for every coordinate (intercept, sqft_living, bedrooms)
# at a fixed set of starting weights, using the full `sales` data.
feats = ['sqft_living', 'bedrooms']
output = 'price'
feat_mat, out = get_numpy_data(sales, feats, output)
feat_mat_normalized, norms = normalize_features(features=feat_mat)

init_weights = [1., 4., 1.]
init_pred = predict_outcome(feat_mat_normalized, init_weights)

# ro[j] = feature_j . (output - prediction + w_j * feature_j)
P = [
    np.dot(
        feat_mat_normalized[:, j],
        (out - init_pred + (init_weights[j] * feat_mat_normalized[:, j])),
    )
    for j in range(len(feats) + 1)
]
print(P)

[79400300.01452288, 87939470.82325178, 80966698.66623947]


In [20]:
# Twice the ro values of 'bedrooms' (index 2) and 'sqft_living' (index 1):
# the coordinate step zeroes a weight when |ro| <= l1_penalty / 2.
range_l1penalty = [2 * P[idx] for idx in (2, 1)]
range_l1penalty

[161933397.33247894, 175878941.64650357]

In [22]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated value of weight ``i`` for one LASSO coordinate step.

    The update is a soft-thresholding of
    ``ro_i = feature_i . (output - prediction + weights[i] * feature_i)``
    at ``l1_penalty / 2``. The weight at index 0 is treated as the intercept
    and is returned un-thresholded. ``weights`` is only read, never modified.

    NOTE(review): the update uses ``ro_i`` directly, which presumes the columns
    of ``feature_matrix`` have unit norm — confirm at call sites.
    """
    feature_i = feature_matrix[:, i]
    # Residual with feature i's own contribution added back in.
    partial_residual = output - predict_outcome(feature_matrix, weights) + weights[i] * feature_i
    ro_i = np.dot(feature_i, partial_residual)

    if i == 0:
        # intercept -- do not regularize
        return ro_i

    threshold = l1_penalty / 2.
    if ro_i < -threshold:
        return ro_i + threshold
    if ro_i > threshold:
        return ro_i - threshold
    return 0.

In [24]:
# Sanity check on a small hand-computable example.
# should print 0.425558846691
import math
print(lasso_coordinate_descent_step(
    i=1,
    feature_matrix=np.array([[3. / math.sqrt(13), 1. / math.sqrt(10)],
                             [2. / math.sqrt(13), 3. / math.sqrt(10)]]),
    output=np.array([1., 1.]),
    weights=np.array([1., 4.]),
    l1_penalty=0.1,
))

0.4255588466910251


In [40]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit LASSO weights by cyclical coordinate descent.

    Sweeps over the coordinates in index order, replacing each weight with the
    value returned by ``lasso_coordinate_descent_step``, and stops after the
    first full sweep in which no weight changed by more than ``tolerance``.

    Parameters
    ----------
    feature_matrix : 2-D array, one column per weight.
    output : 1-D array-like of target values.
    initial_weights : array-like of starting weights (one per column).
        It is copied, so the caller's object is never modified.
    l1_penalty : L1 penalty passed through to the coordinate step.
    tolerance : convergence threshold on the largest single-weight change
        within one sweep.

    Returns
    -------
    numpy.ndarray
        A new float array holding the fitted weights.
    """
    # Work on a private float copy. Binding `weights` directly to
    # `initial_weights` would overwrite the caller's starting point in place and
    # return that very object, so a second run reusing the same array would both
    # start from the previous solution and clobber the previously returned result.
    weights = np.array(initial_weights, dtype=float)

    max_change = tolerance + 1  # guarantees at least one sweep
    while max_change > tolerance:
        max_change = 0
        for i in range(len(weights)):
            new_weight = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            change = np.abs(new_weight - weights[i])
            if change > max_change:
                max_change = change
            # Update immediately so later coordinates in this sweep see it.
            weights[i] = new_weight
    return weights

In [99]:
# Fit the two-feature model (intercept, sqft_living, bedrooms) on the full data.
L1_penalty = 1e7
Tolerance = 1.0
initial_weights = np.zeros(3)
out = sales['price']
weights = lasso_cyclical_coordinate_descent(
    feature_matrix=feat_mat_normalized,
    output=out,
    initial_weights=initial_weights,
    l1_penalty=L1_penalty,
    tolerance=Tolerance,
)
print(weights)

[21624997.95951901 63157247.20788965        0.        ]


In [42]:
def RSS(out, pred):
    """Residual sum of squares between observed ``out`` and predicted ``pred``."""
    residuals = out - pred
    return np.sum(residuals ** 2)

In [44]:
# RSS of the two-feature model on the normalized `sales` features.
RSS(out, predict_outcome(feat_mat_normalized, weights))

1630492476715385.5

In [45]:
# The 13 predictors used for the larger model (an intercept is prepended later).
features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [68]:
# Training design matrix and target, then columns scaled to unit norm.
# `norms` is kept so the fitted weights can be rescaled afterwards.
output = 'price'
train_feat_mat, out = get_numpy_data(data=train, features=features, output=output)
train_feat_norm, norms = normalize_features(features=train_feat_mat)

In [76]:
# Settings for the l1_penalty = 1e7 run: 14 weights = intercept + 13 features.
initialize_weights = np.zeros(14)
l1_penalty = 1e7
tolerance = 1.0

In [77]:
weights1e7 = lasso_cyclical_coordinate_descent(
    feature_matrix=train_feat_norm,
    output=out,
    initial_weights=initialize_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)

In [78]:
# Label each fitted weight with its feature name for display.
pd.Series(data=weights1e7, index=['intercept'] + features)

intercept        2.442960e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      4.838917e+07
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       3.317511e+06
view             7.329962e+06
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [88]:
# Settings for the l1_penalty = 1e8 run: 14 weights = intercept + 13 features.
initialize_weights = np.zeros(14)
l1_penalty = 1e8
tolerance = 1.0

In [89]:
weights1e8 = lasso_cyclical_coordinate_descent(
    feature_matrix=train_feat_norm,
    output=out,
    initial_weights=initialize_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)

In [81]:
# Label each fitted weight with its feature name for display.
pd.Series(data=weights1e8, index=['intercept'] + features)

intercept        7.111463e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      0.000000e+00
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       0.000000e+00
view             0.000000e+00
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [82]:
# Settings and fit for the l1_penalty = 1e4 run.
l1_penalty = 1e4
tolerance = 5e5
# Re-create the starting weights here, as the 1e7 and 1e8 cells do, so this run
# starts from zeros and does not depend on (or share an array with) whatever a
# previous solver call left in `initialize_weights`.
initialize_weights = np.zeros(14)
weights1e4 = lasso_cyclical_coordinate_descent(train_feat_norm, out, initialize_weights, l1_penalty, tolerance)

In [83]:
# Label each fitted weight with its feature name for display.
pd.Series(data=weights1e4, index=['intercept'] + features)

intercept        7.856474e+07
bedrooms        -2.209740e+07
bathrooms        1.279107e+07
sqft_living      9.380809e+07
sqft_lot        -2.013173e+06
floors          -4.219185e+06
waterfront       6.482843e+06
view             7.127409e+06
condition        5.001665e+06
grade            1.432752e+07
sqft_above      -1.577096e+07
sqft_basement   -5.159591e+06
yr_built        -8.449534e+07
yr_renovated     2.824439e+06
dtype: float64

In [92]:
# The models were fitted on features divided by `norms`; dividing the weights
# by the same norms gives weights that apply to the un-normalized features.
weights1e4_normalized = weights1e4 / norms
weights1e7_normalized = weights1e7 / norms
weights1e8_normalized = weights1e8 / norms

# Index 3 is 'sqft_living' (index 0 is the intercept).
weights1e7_normalized[3]

161.3174576461177

In [94]:
# Un-normalized design matrix and target for the held-out test split.
test_feature_matrix, test_output = get_numpy_data(data=test, features=features, output='price')

In [96]:
# Test-set RSS for the l1_penalty = 1e4 model. Uses the notebook's RSS helper
# (vectorised np.sum) rather than the builtin sum, which loops over the array
# element by element in Python; the last printed digits may differ slightly
# because the summation order is different.
print(RSS(test_output, predict_outcome(test_feature_matrix, weights1e4_normalized)))

228459958971392.7


In [97]:
# Test-set RSS for the l1_penalty = 1e7 and 1e8 models. Uses the notebook's RSS
# helper (vectorised np.sum) rather than the builtin sum, which loops over the
# array element by element in Python; the last printed digits may differ
# slightly because the summation order is different.
print(RSS(test_output, predict_outcome(test_feature_matrix, weights1e7_normalized)))
print(RSS(test_output, predict_outcome(test_feature_matrix, weights1e8_normalized)))

275962075920366.56
537166151497321.94
