In [72]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [73]:
housing = pd.read_csv("housing.csv")

housing = housing.iloc[:5000,:] # only use the first 5000 observations

### Attribute Combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# housing.head()

## Training and testing sets

In [74]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## Preprocessing

In [75]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)



## Training X and y variables

In [76]:
train_X = train_set.drop("median_house_value", axis=1)
train_y = train_set["median_house_value"].copy()

train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [77]:
test_X = test_set.drop("median_house_value", axis=1)
test_y = test_set["median_house_value"].copy()

# transform X in testing set
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [78]:
# 1. fit linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(train_X_prepared, train_y)

In [79]:
# 2. calculate testing rmse
from sklearn.metrics import mean_squared_error
lin_pred_y = lin_reg.predict(test_X_prepared)
lin_test_rmse = mean_squared_error(test_y, lin_pred_y, squared=False) # squared = false to get rmse
lin_test_rmse

72129.57449006113

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

In [80]:
# 1. fit ridge and calculated rmse
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

ridge_reg = Ridge(alpha=10) # assume auto as not specified
ridge_reg.fit(train_X_prepared,train_y)

ridge_pred_y = ridge_reg.predict(test_X_prepared)
ridge_test_rmse = mean_squared_error(test_y, ridge_pred_y, squared=False)
ridge_test_rmse

72195.34017238923

In [81]:
# 2. grid search
from sklearn.model_selection import GridSearchCV

# initial parameters
ridge_params = {'alpha': list(range(0,12,2)) + list(range(50,1000, 50))} 

gridge_model = GridSearchCV(Ridge(), ridge_params, cv=5)

gridge_model.fit(train_X_prepared, train_y) # grid search
gridge_pred_y = gridge_model.predict(test_X_prepared)
gridge_test_rmse = mean_squared_error(test_y, gridge_pred_y, squared=False)

print(gridge_test_rmse)
print(gridge_model.best_params_)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


72985.52915710278
{'alpha': 150}


In [82]:
# 2. grid search - zoom in
from sklearn.model_selection import GridSearchCV

# initial parameters
ridge_params = {'alpha': list(range(125,176, 1))} 

gridge_model = GridSearchCV(Ridge(), ridge_params, cv=5)

gridge_model.fit(train_X_prepared, train_y) # grid search
gridge_pred_y = gridge_model.predict(test_X_prepared)
gridge_test_rmse = mean_squared_error(test_y, gridge_pred_y, squared=False)

print(gridge_test_rmse)
print(gridge_model.best_params_)

73019.89560523248
{'alpha': 157}


# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $alpha$ value to calculate the testing RMSE.

In [83]:
# 1. fit lasso and calculate rmse
# import lasso
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# fit and calculate testing rmse
lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(train_X_prepared, train_y) 
lasso_pred_y = lasso_reg.predict(test_X_prepared)
lasso_rmse = mean_squared_error(test_y, lasso_pred_y, squared=False)
print(lasso_rmse)

# print out the coefficients - should have zeroes
coeffs = lasso_reg.coef_  
coeffs

73037.08337454066


array([-30702.12932419, -30289.65620361,  11821.78108691,   1654.25936136,
         2972.74269704, -37595.83916317,  37094.10561768,  76322.82540894,
         5080.61308451,  12635.5139211 ,      0.        ,     -0.        ,
       -45716.73979158,      0.        ,   5807.27519117])

In [90]:
# 2. grid search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

lasso_params = {'alpha': list(np.array(range(0,10))/10)+ list(range(1,12,2)) + list(range(50,1000, 50))}
glasso_model = GridSearchCV(Lasso(), lasso_params, cv=5)

glasso_model.fit(train_X_prepared, train_y)
glasso_pred_y = glasso_model.predict(test_X_prepared)
glasso_mse = mean_squared_error(test_y, glasso_pred_y, squared=False)
print(glasso_mse)
print(glasso_model.best_params_)

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordi

72361.96997717893
{'alpha': 300}


In [69]:
# 2. grid search - zoom in
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

lasso_params = {'alpha':  list(range(250,351, 1))}
glasso_model = GridSearchCV(Lasso(), lasso_params, cv=5)

glasso_model.fit(train_X_prepared, train_y)
glasso_pred_y = glasso_model.predict(test_X_prepared)
glasso_mse = mean_squared_error(test_y, glasso_pred_y, squared=False)
print(glasso_mse)
print(glasso_model.best_params_)

72336.72147259413
{'alpha': 257}


# Elastic Net

**Task 4:**
1. Fit an Elastic Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best settings for $alpha$ and `l1_ratio` value (You may need multiple grid searches). Use the elastic net model with the best settings to calculate the testing RMSE.

In [85]:
# 1. fit elastic net and calculated rmse
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

elastic_net = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net.fit(train_X_prepared, train_y)

elastic_pred = elastic_net.predict(test_X_prepared)
elastic_rmse = mean_squared_error(test_y, elastic_pred, squared=False)

elastic_rmse

110947.06543161202

In [89]:
# 2. grid search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

elastic_params = {'alpha': list(np.array(range(0,10))/10) + list(range(1,11,1)) + list(range(50,1000, 50)), 
                'l1_ratio': np.array(range(0,10))/10 }
print(elastic_params)
gelastic_model = GridSearchCV(ElasticNet(), elastic_params, cv=5, scoring='neg_root_mean_squared_error')

gelastic_model.fit(train_X_prepared, train_y)
gelastic_pred_y = gelastic_model.predict(test_X_prepared)
gelastic_mse = mean_squared_error(test_y, gelastic_pred_y, squared=False)
print(gelastic_mse)
print(gelastic_model.best_params_)

{'alpha': [0, 0.1, 0.5, 0.75, 1, 3, 5, 7, 9, 11, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950], 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])}


  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_f

73230.21667124746
{'alpha': 0.5, 'l1_ratio': 0.9}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [35]:


# zoom in 

elastic_params = {'alpha': [0.05,0.06, 0.1,0.11, 0.12, 0.13, 0.14, 0.15], 
                'l1_ratio' : [0.3, 0.4, 0.5, 0.6, 0.7]}

gelastic_model = GridSearchCV(ElasticNet(), elastic_params, cv=5, scoring='neg_root_mean_squared_error')

gelastic_model.fit(train_X_prepared, train_y)
gelastic_pred_y = gelastic_model.predict(test_X_prepared)
gelastic_mse = mean_squared_error(test_y, gelastic_pred_y, squared=False)
print(gelastic_mse)
print(gelastic_model.best_params_)

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_f

73229.8979235774
{'alpha': 0.1, 'l1_ratio': 0.5}
73190.88232489413
{'alpha': 0.12, 'l1_ratio': 0.6}
