In [322]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [323]:
housing = pd.read_csv("housing.csv")

housing = housing.iloc[:5000,:] # only use the first 5000 observations

### Attribute Combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# housing.head()

## Training and testing sets

In [324]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## Preprocessing

In [325]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)



## Training X and y variables

In [326]:
train_X = train_set.drop("median_house_value", axis=1)
train_y = train_set["median_house_value"].copy()

train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [327]:
test_X = test_set.drop("median_house_value", axis=1)
test_y = test_set["median_house_value"].copy()

# transform X in testing set
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

# Ans 1. Fitting Linear Regression Model

In [328]:
lin_reg = LinearRegression()
lin_reg.fit(train_X_prepared, train_y)

# Ans 2. Calculating Testing RMSE

In [329]:
test_predictions = lin_reg.predict(test_X_prepared)

test_rmse = np.sqrt(mean_squared_error(test_y, test_predictions))
print("Testing RMSE:", round(test_rmse, 2))

Testing RMSE: 72129.57


# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

# Ans 1. Fitting Ridge Regression Model and Calculating Testing RMSE

In [330]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=10)
ridge_reg.fit(train_X_prepared, train_y)

test_predictions_ridge = ridge_reg.predict(test_X_prepared)

test_rmse_ridge = np.sqrt(mean_squared_error(test_y, test_predictions_ridge))
print("Testing RMSE for Ridge Regression with alpha = 10:", round(test_rmse_ridge, 2))

Testing RMSE for Ridge Regression with alpha = 10: 72195.34


# Ans 1 (a). I decided to conduct a broad search first to determine the range where the best alpha lies. I chose to test alpha values from 0 to 1000 in increments of 5.

In [331]:
from sklearn.model_selection import GridSearchCV

# Range of alpha values
alpha_range_ridge_broad = np.arange(0, 1000, 5)

# Parameter grid
params_ridge_broad = {'alpha': alpha_range_ridge_broad}

# Ridge estimator
ridge_estimator = Ridge()

# Grid search
gridcv_ridge_model_broad = GridSearchCV(ridge_estimator, params_ridge_broad, cv=5)

# Fit the grid search model
gridcv_ridge_model_broad.fit(train_X_prepared, train_y)

best_alpha_ridge_broad = gridcv_ridge_model_broad.best_params_['alpha']

# Round the best alpha value to 2 decimal places
best_alpha_ridge_broad_rounded = round(best_alpha_ridge_broad, 2)

# Printthe best alpha value
print("Best alpha value:", best_alpha_ridge_broad_rounded)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Best alpha value: 155


# Ans 1 (b). "After narrowing down the best alpha value to around 155, I conducted another grid search to test alpha values from 150 to 160 in increments of 0.1 to find the most accurate alpha value possible."

In [332]:
# Narrower range of alpha values
alpha_range_ridge_narrow = np.arange(150, 160, 0.1)

# Parameter grid
params_ridge_narrow = {'alpha': alpha_range_ridge_narrow}

# Ridge estimator
ridge_estimator_narrow = Ridge()

# Grid search
gridcv_ridge_model_narrow = GridSearchCV(ridge_estimator_narrow, params_ridge_narrow, cv=5)

# Fit the grid search model
gridcv_ridge_model_narrow.fit(train_X_prepared, train_y)

best_alpha_ridge_narrow = gridcv_ridge_model_narrow.best_params_['alpha']

# Round the best alpha value to 2 decimal places
best_alpha_ridge_narrow_rounded = round(best_alpha_ridge_narrow, 2)

# Print the best alpha value
print("Best alpha value:", best_alpha_ridge_narrow_rounded)

# Predict using the best model
test_predictions_ridge_narrow = gridcv_ridge_model_narrow.predict(test_X_prepared)

# Calculate RMSE
test_rmse_ridge_narrow = np.sqrt(mean_squared_error(test_y, test_predictions_ridge_narrow))

# Print RMSE
print("Testing RMSE for Ridge Regression:", round(test_rmse_ridge_narrow, 2))

Best alpha value: 157.2
Testing RMSE for Ridge Regression: 73020.88


# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $alpha$ value to calculate the testing RMSE.

# Ans 1(a). Fitting LASSO Regression Model with $\alpha=1000$, and Calculating RMSE
## Ans 1(b). Yes, the LASSO model can help narrow down variables as it acts like a filter that reduces noise (reducing the absolute size of coefficients) and helps focus on the most influential variables for making predictions.

In [333]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(train_X_prepared, train_y)

# Predict
test_predictions_lasso = lasso_reg.predict(test_X_prepared)

# Calculate RMSE
test_rmse_lasso = np.sqrt(mean_squared_error(test_y, test_predictions_lasso))
print("Testing RMSE for LASSO Regression with alpha = 1000:", round(test_rmse_lasso,2))

Testing RMSE for LASSO Regression with alpha = 1000: 73037.08


# Ans 2(a). Broad search

In [334]:
# Range of alpha values
alpha_range_lasso_broad = np.arange(0, 1000, 5)

# Parameter grid
params_lasso_broad = {'alpha': alpha_range_lasso_broad}

# Lasso estimator
lasso_estimator = Lasso()

# Grid search
gridcv_lasso_model_broad = GridSearchCV(lasso_estimator, params_lasso_broad, cv=5)

# Fit the grid search model
gridcv_lasso_model_broad.fit(train_X_prepared, train_y)

best_alpha_lasso_broad = gridcv_lasso_model_broad.best_params_['alpha']

# Print the best alpha value
print("Best alpha value:", round(best_alpha_lasso_broad,2))

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best alpha value: 260


# Ans 2(b). Narrow search

In [335]:
# Narrower range of alpha values
alpha_range_lasso_narrow = np.arange(250, 270, 0.1)

# Parameter grid
params_lasso_narrow = {'alpha': alpha_range_lasso_narrow}

# Lasso estimator
lasso_estimator_narrow = Lasso()

# Grid search
gridcv_lasso_model_narrow = GridSearchCV(lasso_estimator_narrow, params_lasso_narrow, cv=5)

# Fit the grid search model
gridcv_lasso_model_narrow.fit(train_X_prepared, train_y)

best_alpha_lasso_narrow = gridcv_lasso_model_narrow.best_params_['alpha']

# Print the best alpha value
print("Best alpha value:", round(best_alpha_lasso_narrow, 2))

# Predict using the best model
test_predictions_lasso_narrow = gridcv_lasso_model_narrow.predict(test_X_prepared)

# Calculate RMSE
test_rmse_lasso_narrow = np.sqrt(mean_squared_error(test_y, test_predictions_lasso_narrow))

# Print RMSE
print("Testing RMSE for Lasso Regression:", round(test_rmse_lasso_narrow, 2))

Best alpha value: 256.1
Testing RMSE for Lasso Regression: 72336.21


# Elastic Net

**Task 4:**
1. Fit an Elastic Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best settings for $alpha$ and `l1_ratio` value (You may need multiple grid searches). Use the elastic net model with the best settings to calculate the testing RMSE.

# Ans 1 (a). Fitting Elastic Regression Model
## Ans 2 (b). The elastic regression model operates similarly to the LASSO model in terms of variable selection. Both limit the impact of coefficients. Therefore, it can be used to select variables.

In [336]:
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net.fit(train_X_prepared, train_y)

# Predictions
test_predictions_elastic_net = elastic_net.predict(test_X_prepared)

# Calculate RMSE
test_rmse_elastic_net = np.sqrt(mean_squared_error(test_y, test_predictions_elastic_net))

print("Testing RMSE for Elastic Net Regression:", round(test_rmse_elastic_net,2))

Testing RMSE for Elastic Net Regression: 110947.07


In [337]:
alpha_range_en_broad = np.arange(0, 10, 1)
l1_ratio_range_en_broad = np.arange(0,1,0.1)

params_en_broad = {'alpha': alpha_range_en_broad, 'l1_ratio': l1_ratio_range_en_broad}

# ElasticNet estimator
elastic_net_estimator_en_broad = ElasticNet()

# Perform grid search
gridcv_elastic_net_model_en_broad = GridSearchCV(elastic_net_estimator_en_broad, params_en_broad, cv=5)

# Fit the grid search model
gridcv_elastic_net_model_en_broad.fit(train_X_prepared, train_y)

# Best parameters
best_params_en_broad = gridcv_elastic_net_model_en_broad.best_params_

best_alpha_rounded_broad = round(best_params_en_broad['alpha'], 2)
best_l1_ratio_rounded_broad = round(best_params_en_broad['l1_ratio'], 2)

# Print the best parameters
print("Best parameters: {'alpha':", best_alpha_rounded_broad, ", 'l1_ratio':", best_l1_ratio_rounded_broad, "}")

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_f

Best parameters: {'alpha': 1 , 'l1_ratio': 0.9 }


  model = cd_fast.enet_coordinate_descent(


In [338]:
alpha_range_en = np.arange(0.2, 1, 0.1)
l1_ratio_range_en = np.arange(0.8, 1, 0.01)

params_en = {'alpha': alpha_range_en, 'l1_ratio': l1_ratio_range_en}

# ElasticNet estimator
elastic_net_estimator_en = ElasticNet()

# Grid search
gridcv_elastic_net_model_en = GridSearchCV(elastic_net_estimator_en, params_en, cv=5)

# Fit the grid search model
gridcv_elastic_net_model_en.fit(train_X_prepared, train_y)

# Best parameters
best_params_en = gridcv_elastic_net_model_en.best_params_

best_alpha_rounded = round(best_params_en['alpha'], 2)
best_l1_ratio_rounded = round(best_params_en['l1_ratio'], 2)

# Print the best parameters
print("Best parameters: {'alpha':", best_alpha_rounded, ", 'l1_ratio':", best_l1_ratio_rounded, "}")

# Predict using the best model
test_predictions_en = gridcv_elastic_net_model_en.predict(test_X_prepared)

# Calculate RMSE
test_rmse_en = np.sqrt(mean_squared_error(test_y, test_predictions_en))

# Print RMSE
print("Testing RMSE for Elastic Net Regression:", round(test_rmse_en, 2))

Best parameters: {'alpha': 0.7 , 'l1_ratio': 0.93 }
Testing RMSE for Elastic Net Regression: 73210.86
