In [1]:
import graphlab

In [2]:
# Loading the data

In [3]:
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1478213833.log


This non-commercial license of GraphLab Create for academic use is assigned to emechebe@ohsu.edu and will expire on June 11, 2017.


In [4]:
# We now select features that we want to use
# In this example, we decided to do some transformation on the inputs of the data and use them 
# as features
# We use the sqrt of the sqft_living
# Sqrt of sqft_lot
# Square of bedroom

In [5]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']

In [6]:
# In the dataset, 'floors' was defined with type string

In [7]:
sales['floors']

dtype: str
Rows: 21613
['1', '2', '1', '1', '1', '1', '2', '1', '1', '2', '1', '1', '1.5', '1', '1.5', '2', '2', '1.5', '1', '1', '1', '1', '2', '1', '2', '1.5', '2', '1.5', '1', '2', '2', '3', '1.5', '1.5', '1', '1.5', '1', '2', '1', '2', '2', '1', '2', '2', '1', '2', '1', '2', '1', '1', '1', '1', '1', '1', '1.5', '2', '2', '2', '2', '2', '1', '1', '2', '3', '1', '1.5', '2', '2', '1', '1', '1', '1', '1.5', '2', '1', '2', '1', '1', '1', '2', '2', '2', '1', '1', '1', '2', '1.5', '2', '2', '2', '2', '2', '1', '1', '1', '1', '1', '1', '1', '2', ... ]

In [8]:
# so we'll convert them to float, before creating a new feature

In [9]:
sales['floors'] = sales['floors'].astype(float) 

In [10]:
sales['floors']

dtype: float
Rows: 21613
[1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.5, 1.0, 1.5, 2.0, 2.0, 1.5, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.5, 2.0, 1.5, 1.0, 2.0, 2.0, 3.0, 1.5, 1.5, 1.0, 1.5, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.5, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0, 3.0, 1.0, 1.5, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.5, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.5, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, ... ]

In [11]:
# Now square them

In [12]:
sales['floors_square'] = sales['floors']*sales['floors']

In [13]:
# Learn regression weights with L1 penalty

In [14]:
# Let us fit a model with all the features available, plus the features we just created above.

In [15]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

In [16]:
# So now lets generate a simple regression model with all the features
# We are going to use the in-built regression algorithm 
# However, we are going to use the L1 penalty (for lasso) instead of L2 penalty (for Ridge)

In [17]:
model_all = graphlab.linear_regression.create(sales, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=1e10)

In [18]:
# Find what features had non-zero weight.

In [19]:
print(model_all.coefficients.print_rows(num_rows=model_all.num_coefficients))

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None |  274873.05595 |  None  |
|     bedrooms     |  None |      0.0      |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 8468.53108691 |  None  |
|   sqft_living    |  None | 24.4207209824 |  None  |
| sqft_living_sqrt |  None | 350.060553386 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None | 842.068034898 |  None  |
|    sqft_above    |  None | 20.0247224171 |  None  |
|  sqft_basement   |  None |

In [20]:
# 6 features have non-zero weights
# bathrooms, sqft_living, sqft_living_sqrt,grade,sqft_above, intercept

In [21]:
#Note that a majority of the weights have been set to zero. So by setting an L1 penalty that's large enough, we are performing a subset selection.
#QUIZ QUESTION: According to this list of weights, which of the features have been chosen?

In [22]:
#Selecting an L1 penalty
# So how do we select a good L1 penalty to use

In [23]:
# For this we will explore multiple values of L1 and see which one give us the best
# We are going to split our data three way: train, validation and test set

In [24]:
# First split into train and test

In [25]:
(training_and_validation, testing) = sales.random_split(.9,seed=1)

In [26]:
# Now split training_validation into training and validation

In [27]:
(training, validation) = training_and_validation.random_split(0.5, seed=1)

In [28]:
# Now we write a simple for loop that will go through all given values of L1 and:
# 1) Fit a model with the current L1 on our training dataset
# 2) Use that model to make predictions for our validation data set
# 3) Compute the RSS from the predictions of the validation data set
# 4) Repeat until you exhaust all the given values of L1
# 5) Return the L1 that minimizes the RSS
# 6) Return the min RSS
# 7) Note our function, findingbest_l1 takes 2 data set as input: training and validation

In [29]:
import numpy as np

In [30]:
def findingbest_l1(training, validation):
    L1_values=np.logspace(1, 7, num=13) # Getting a list of possible L1 values
    ListofRSS = []  # Initializing an empty list for computed RSS values
    ListofL1 = []   # Initializing an empty list for used L1 values
    for L1 in L1_values:   # Looping through the L1 values and using each to fit a model on the
                           # training data set
        model_all = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=L1, verbose = False)
        
        Predictions = model_all.predict(validation) # Using the model to predict the prices
                                                    # in our validation data set
        Error = Predictions - validation['price']  # Computing the Errors
        SquaredError = Error * Error               # Getting the square of the errors
        RSS = SquaredError.sum()                   # Getting the RSS
        ListofRSS.append(RSS)                      # Compiling a list of the computed RSS
        ListofL1.append(L1)                        # List of corresponding L1 values
    IndexofminRSS= ListofRSS.index(min(ListofRSS)) # Getting the index of the min RSS in RSS list
    print ListofL1[IndexofminRSS]                  # Using that index to get the L1 value
    return min(ListofRSS)                          # Returning the min RSS
    
    
    

In [31]:
findingbest_l1(training,validation)

10.0


625766285142459.9

In [32]:
# The best L1 penalty is 10 and the RSS associated with it using the validation set 
# is 625766285142459.9

In [33]:
# Now that we have selected the best L1 value, we can now use it to generate a model
# and then use that test it on  our test data set to see how it performs

In [34]:
# Getting the model using the L1 selected (10)

In [35]:
model_all = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=10, verbose = False)

In [36]:
# Now using this model to predict the prices from our test data

In [37]:
test_data_predictions=model_all.predict(testing)

In [38]:
# Whats the difference between what we predicted and whats the actual

In [39]:
Error = test_data_predictions - testing['price']

In [40]:
# Get the RSS

In [41]:
SquaredError = Error * Error

In [42]:
RSS = SquaredError.sum()

In [43]:
RSS

156983602381664.2

In [44]:
# What is the RSS on TEST data of the model with the best l1_penalty?
# 156983602381664.2

In [45]:
# QUIZ QUESTION Also, using this value of L1 penalty, how many nonzero weights do you have?

In [46]:
print(model_all.coefficients.print_rows(num_rows=model_all.num_coefficients))

+------------------+-------+------------------+--------+
|       name       | index |      value       | stderr |
+------------------+-------+------------------+--------+
|   (intercept)    |  None |  18993.4272128   |  None  |
|     bedrooms     |  None |  7936.96767903   |  None  |
| bedrooms_square  |  None |  936.993368193   |  None  |
|    bathrooms     |  None |  25409.5889341   |  None  |
|   sqft_living    |  None |  39.1151363797   |  None  |
| sqft_living_sqrt |  None |  1124.65021281   |  None  |
|     sqft_lot     |  None | 0.00348361822299 |  None  |
|  sqft_lot_sqrt   |  None |  148.258391011   |  None  |
|      floors      |  None |   21204.335467   |  None  |
|  floors_square   |  None |  12915.5243361   |  None  |
|    waterfront    |  None |  601905.594545   |  None  |
|       view       |  None |  93312.8573119   |  None  |
|    condition     |  None |  6609.03571245   |  None  |
|      grade       |  None |  6206.93999188   |  None  |
|    sqft_above    |  None |  4

In [47]:
# All 18 of them are non-zeroes

In [48]:
# In the above example, we allowed the algorithm to choose the best possible l1 value and it chose the 
# one that incorporated all features
# What of if we just want a specific number of features to be included in the model and not all features
# We can set a variable for the number of non-zero features that we want 
# And write a program that will take either all the models that contained less features than what we stipulated
# or contains more features than what we stipulated. 
# We just store these two different outcomes in 2 different lists
# Then for the one with the less than number of desired features, we return the minimal
# For the one with greater than number of desired feature, we return the maximal
# These two values now can serve as a range (Min_Max range) that we can now use to rerun our model , however,
# before we validate , we test if the number of non-zero feature in that model is equal to desired number of features
# If it is, thats when we use the model to predict and calculate error on validation set.
# Else, we ignore that model train another

In [49]:
# Ok now getting the min and max ranges for our L1 values so as to get
# a narrow range of L1 values that we can use for training

In [50]:
def finding_best_range_l1(training, validation):
    max_nonzeros = 7 # This is the number of desired features. This is user specific
    
    L1_values=np.logspace(8, 10, num=20) # Getting a list of possible L1 values
    
    
    ListofMaxL1 = []   # Initializing an empty list for Maximum L1 values
    ListofMaxCoefficient = [] # Initializing an empty list for Coefficients of Maximum  L1 values
    ListofMinL1 = []  # Initializing an empty list for Minimum L1 values
    ListofMinCoefficient = [] # Initializing an empty list for Coefficients of Minimum  L1 values
    
    for L1 in L1_values:   # Looping through the L1 values and using each to fit a model on the
                           # training data set
        model_all = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=L1, verbose = False)
        
        Coefficients=model_all['coefficients']['value']   # This will get all the values of the coefficients of current model
        Non_zero_Coefficients=model_all['coefficients']['value'].nnz() # This will get the number of models that are non zero
       
       
        if Non_zero_Coefficients > max_nonzeros:     # If that number is greater than what we desired, do this
            ListofMaxL1.append(L1)
            ListofMaxCoefficient.append(Non_zero_Coefficients)
        elif Non_zero_Coefficients < max_nonzeros:  # else if it is less than what we desired, do this
            ListofMinL1.append(L1)
            ListofMinCoefficient.append(Non_zero_Coefficients)
    l1_penalty_min = min(ListofMinL1)       # Save the appropriate min in this variable
    l1_penalty_max = max(ListofMaxL1)       # Save the appropriate max in this variable
    return l1_penalty_min, l1_penalty_max   # Return both 
    
    
      

In [51]:
l1_penalty_min,l1_penalty_max=finding_best_range_l1(training, validation)

In [52]:
l1_penalty_min

3792690190.7322536

In [53]:
l1_penalty_max

2976351441.6313133

In [54]:
# Now we can use those two numbers as range delimiters for a narrow range of L1 values

In [55]:
l1_penalty_values = np.linspace(l1_penalty_min,l1_penalty_max,20)

In [56]:
l1_penalty_values

array([  3.79269019e+09,   3.74972499e+09,   3.70675980e+09,
         3.66379460e+09,   3.62082940e+09,   3.57786420e+09,
         3.53489901e+09,   3.49193381e+09,   3.44896861e+09,
         3.40600341e+09,   3.36303822e+09,   3.32007302e+09,
         3.27710782e+09,   3.23414263e+09,   3.19117743e+09,
         3.14821223e+09,   3.10524703e+09,   3.06228184e+09,
         3.01931664e+09,   2.97635144e+09])

In [57]:
# Now all we have to do is to modify our previous findingbest_l1
# so that it basically takes these values of L1's, loops through them
# fits a model with the training data
# However, before even bothering to use the model to validate, test if the model has the appropriate number of features
# that we desire. Once that is passed, everything stays the same
# Some additions include getting the optimal L1 that has the number of desired features and using that
# to retrain our train data and then return the coefficients of that model
# Lets call this findingbest_l1_1

In [58]:
def findingbest_l1_1(training, validation):
    L1_values=np.linspace(l1_penalty_min,l1_penalty_max,20) # Getting a list of possible L1 values
    ListofRSS = []  # Initializing an empty list for computed RSS values
    ListofL1 = []   # Initializing an empty list for used L1 values
    max_nonzeros = 7
    for L1 in L1_values:   # Looping through the L1 values and using each to fit a model on the
                           # training data set
        model_all = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=L1, verbose = False)
        
        Coefficients=model_all['coefficients']['value'] # Getting coefficient of current model
        Non_zero_Coefficients=model_all['coefficients']['value'].nnz() # Getting the number that non-zero
        
        if Non_zero_Coefficients == max_nonzeros:  # Using just models with specified number of non zero features to
                                                   # validate
        
            Predictions = model_all.predict(validation) # Using the model to predict the prices
                                                    # in our validation data set
            Error = Predictions - validation['price']  # Computing the Errors
            SquaredError = Error * Error               # Getting the square of the errors
            RSS = SquaredError.sum()                   # Getting the RSS
            ListofRSS.append(RSS)                      # Compiling a list of the computed RSS
            ListofL1.append(L1)                        # List of corresponding L1 values
    IndexofminRSS= ListofRSS.index(min(ListofRSS)) # Getting the index of the min RSS in RSS list
    L1_optimal= ListofL1[IndexofminRSS]                  # Using that index to get the L1 value
    
    # Using the selected optimal L1 value to train the training data set
    OptimalModel = graphlab.linear_regression.create(training, target='price', features=all_features,
                                              validation_set=None, 
                                              l2_penalty=0., l1_penalty=L1_optimal, verbose = False)
    
    # Print all the coefficients and confirm that only 7 features here have non-zero values
    print(OptimalModel.coefficients.print_rows(num_rows=model_all.num_coefficients))
    return  L1_optimal, min(ListofRSS)                          # Returning the L1_optimal, min RSS

In [59]:
findingbest_l1_1(training, validation)

+------------------+-------+---------------+--------+
|       name       | index |     value     | stderr |
+------------------+-------+---------------+--------+
|   (intercept)    |  None | 222253.192544 |  None  |
|     bedrooms     |  None | 661.722717782 |  None  |
| bedrooms_square  |  None |      0.0      |  None  |
|    bathrooms     |  None | 15873.9572593 |  None  |
|   sqft_living    |  None | 32.4102214513 |  None  |
| sqft_living_sqrt |  None | 690.114773313 |  None  |
|     sqft_lot     |  None |      0.0      |  None  |
|  sqft_lot_sqrt   |  None |      0.0      |  None  |
|      floors      |  None |      0.0      |  None  |
|  floors_square   |  None |      0.0      |  None  |
|    waterfront    |  None |      0.0      |  None  |
|       view       |  None |      0.0      |  None  |
|    condition     |  None |      0.0      |  None  |
|      grade       |  None | 2899.42026975 |  None  |
|    sqft_above    |  None | 30.0115753022 |  None  |
|  sqft_basement   |  None |

(3448968612.1634364, 1046937488751711.1)