In [172]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [173]:
housing = pd.read_csv("housing.csv")

housing = housing.iloc[:5000,:] # only use the first 5000 observations

### Attribute Combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


## Training and testing sets

In [174]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## Preprocessing

In [176]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)



## Training X and y variables

In [177]:
train_X = train_set.drop("median_house_value", axis=1)
train_y = train_set["median_house_value"].copy()

train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [178]:
test_X = test_set.drop("median_house_value", axis=1)
test_y = test_set["median_house_value"].copy()

# transform X in testing set
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [179]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

linreg.fit(train_X_prepared,train_y)


ypred = linreg.predict(test_X_prepared)



In [183]:
from sklearn import metrics

print('test rmse is ',metrics.mean_squared_error(test_y,ypred,squared=False))

test rmse is  72129.57449006115


Above is the printed rmse from using regular regression model as the instructions asked. 

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.


In [171]:
#task 2 part 1 
from sklearn.linear_model import Ridge
from sklearn import metrics


ridge_reg = Ridge(alpha=10, solver="auto")
ridge_reg.fit(train_X_prepared,train_y)


ypredridge = ridge_reg.predict(test_X_prepared)

print('coef are',ridge_reg.coef_)

print('test mse is ',metrics.mean_squared_error(test_y,ypredridge,squared=False))

coef are [-51021.0584814  -51342.28114018  12529.81108502  10373.05663217
  -6000.43262837 -50369.76902366  50295.04765218  75466.48197152
   8479.31032996  16562.39342402   2421.95039148   8026.69436144
 -28741.94149235   3863.74040758  16851.50672341]
test mse is  72195.3401723892


After using grid search and multiple attempts of different parameter for cv and alpha it appears that adding any value for alpha for ridge will decrease the test mse compared to regular regression model without any regularization. 

In [181]:
#task 2 part 2 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

#first used larger list then narrowed down . for some reason the larger alpha i put the model is selecting the larger alpha values even if it causes increase in rmse. linear regression withou regularization seems to work better then with ridge or any penalty added. 
#seems like adding any number for alpha will decrease rmse compared to regular regression model. 
#tried higher values for alpha but higher alpha seems to decrease rmse grid search with cv  is always picking the highest alpha value even if it causes increase in rmse. i believe this is due to adding a ridge penalty to regression using this data is not the appropriate method. 

paramsa = {'alpha': [0.5,0.1,0.4,1,2]}



gridcv_ridge_model = GridSearchCV(Ridge(solver="auto"), paramsa, cv=10)

gridcv_ridge_model.fit(train_X_prepared, train_y)


best_ridge_model = gridcv_ridge_model.best_estimator_


ypredridgegrid = best_ridge_model.predict(test_X_prepared)



print('Test Mse is ',metrics.mean_squared_error(test_y,ypredridgegrid,squared=False))



print(gridcv_ridge_model.best_params_)
gridcv_ridge_model.best_estimator_

print('Coef',best_ridge_model.coef_)


Test Mse is  72142.19992651166
{'alpha': 2}
Coef [-53774.4846853  -54287.56031675  12492.2994815   11123.65136433
  -9862.06947465 -51995.60132913  54911.19856456  75347.13761821
   9184.72089449  17113.62014983   2650.0360935    7814.02402624
 -27353.83428224   3246.02622329  16293.78403305]


# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $alpha$ value to calculate the testing RMSE.

In [168]:
#task 3 part 1 
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(train_X_prepared, train_y)


ypredlasso = lasso_reg.predict(test_X_prepared)



print('Test mse for Lasso with alpha = 1000 is ',metrics.mean_squared_error(test_y,ypredlasso,squared=False))

print('coef for lasso model',lasso_reg.coef_)

#Lasso can help select variables we see that some of the variables were forced to 0 

Test mse for Lasso with alpha = 1000 is  73037.08337454066
coef for lasso model [-30702.12932419 -30289.65620361  11821.78108691   1654.25936136
   2972.74269704 -37595.83916317  37094.10561768  76322.82540894
   5080.61308451  12635.5139211       0.             -0.
 -45716.73979158      0.           5807.27519117]


In [157]:
#task 3 part 2 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn import metrics


paramsz = {'alpha': [900,1000,1100,1200,1300,1400,800,700,600,500,400,300,200,290,310,301,209,300.1,299.9,299.8,299.7,299.6,299.5,299.4,1,10,100,200,300]}
#used large range of alpha then narrowed down to near these numbers



gridcv_lasso_model = GridSearchCV(Lasso(), paramsz, cv=10, scoring='neg_root_mean_squared_error')


gridcv_lasso_model.fit(train_X_prepared, train_y)


best_lasso_model = gridcv_lasso_model.best_estimator_


ypredlassogrid = best_lasso_model.predict(test_X_prepared)


print(metrics.mean_squared_error(test_y, ypredlassogrid, squared=False))
print(gridcv_lasso_model.best_params_['alpha'])

best_lasso_model.coef_


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


72361.6670797703
299.5


array([-46962.77853726, -47642.21096033,  12093.67660402,   5793.48166443,
            0.        , -47036.44563031,  45402.04004868,  75958.23366642,
         7410.45172125,  14968.47559843,   1673.1765204 ,      0.        ,
       -37424.8406446 ,     -0.        ,   8294.86455954])

From lasso we can see that some of the coef were forced to become zero but this did not increase our test rmse compared to base regressino model. Helps with feature selection but is not resulting in higher test mse most likely as we need more data. 

# Elastic Net

**Task 4:**
1. Fit an Elastic Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the Elastic net model help select variables?
2. Use grid search to choose the best settings for $alpha$ and `l1_ratio` value (You may need multiple grid searches). Use the elastic net model with the best settings to calculate the testing RMSE.

In [182]:
#task 4 part 1 
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net.fit(train_X_prepared, train_y)

ypredelastic = elastic_net.predict(test_X_prepared)



print('test rmse is ',metrics.mean_squared_error(test_y,ypredelastic,squared=False))

test rmse is  110947.06543161202


In [170]:
#task 4 part 2 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

paramss = {'alpha': [0,0.1,0.9,0.8,0.7,0.6,0.09,0.11],
          'l1_ratio': [0, 0.1, 0.5,0.4,0.3,0.6,0.51,0.49,0.52] }
          #tried large range of values for alpha and l1 ratio but narrowed down to this 

gridcv_elastic_model = GridSearchCV(ElasticNet(), paramss, cv=5)

gridcv_elastic_model.fit(train_X_prepared, train_y)

ypredelasticgrid = gridcv_elastic_model.predict(test_X_prepared)



print('test mse is ',metrics.mean_squared_error(test_y,ypredelasticgrid,squared=False))

print('best parameter is ',gridcv_elastic_model.best_params_)

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_f

test mse is  73210.37910303682
best parameter is  {'alpha': 0.1, 'l1_ratio': 0.51}


Even with elastic net we can see that the test mse is not higher then the base regression model. I have tried many ranges for grid search parameters and different lengths of cv but elastics net, ridge, and lasso all result in higher test rmse compared to base regression model. 