In [1]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [2]:
# Read the raw housing table and keep only the first 5000 observations.
housing = pd.read_csv("housing.csv").head(5000)

### Attribute Combinations
# Append three ratio features built from the raw count columns.
housing = housing.assign(
    rooms_per_household=lambda df: df["total_rooms"] / df["households"],
    bedrooms_per_room=lambda df: df["total_bedrooms"] / df["total_rooms"],
    population_per_household=lambda df: df["population"] / df["households"],
)

# housing.head()

## Training and testing sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

In [4]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill gaps with the column median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Object-dtype columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps transform() from raising on categories not seen during fit.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column to one of the two pipelines based on its dtype.
numeric_columns = make_column_selector(dtype_include=np.number)
categorical_columns = make_column_selector(dtype_include=object)
preprocessing = make_column_transformer(
    (num_pipeline, numeric_columns),
    (cat_pipeline, categorical_columns),
)



## Training X and y variables

In [5]:
# Separate the response column from the predictors of the training split.
train_y = train_set["median_house_value"].copy()
train_X = train_set.drop(columns="median_house_value")

# Fit the preprocessing transformer on the training predictors and transform them.
train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [8]:
# Separate the response column from the predictors of the testing split.
test_y = test_set["median_house_value"].copy()
test_X = test_set.drop(columns="median_house_value")

# transform X in testing set
# Only transform() is called here, so nothing is re-fitted on the testing data.
test_X_prepared = preprocessing.transform(test_X)


pandas.core.frame.DataFrame

# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [9]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the prepared training predictors and response.
lin_reg = LinearRegression()
lin_reg.fit(X=train_X_prepared, y=train_y)

In [13]:
from sklearn.metrics import mean_squared_error

# Testing RMSE of the linear model: square root of the mean squared prediction error.
test_pred = lin_reg.predict(test_X_prepared)
test_mse = mean_squared_error(y_true=test_y, y_pred=test_pred)
test_rmse = np.sqrt(test_mse)

# Display the testing RMSE as the cell output.
test_rmse

72129.57449006113

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

In [14]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Task 2.1: Ridge with the fixed tuning parameter alpha=10, scored on the testing set.
ridge_reg_alpha_10 = Ridge(alpha=10)
ridge_reg_alpha_10.fit(train_X_prepared, train_y)
test_pred_ridge_alpha_10 = ridge_reg_alpha_10.predict(test_X_prepared)
test_rmse_ridge_alpha_10 = np.sqrt(mean_squared_error(test_y, test_pred_ridge_alpha_10))

# Task 2.2 setup: base estimator and candidate alpha values for the grid search.
ridge_reg = Ridge()
param_grid = {'alpha': [0.1, 1, 10, 100]}

# Display the testing RMSE of the alpha=10 model as the cell output.
test_rmse_ridge_alpha_10

In [15]:
# 5-fold cross-validated grid search over alpha. Scores are negated MSE, so the
# largest score corresponds to the smallest cross-validation error.
grid_search = GridSearchCV(ridge_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_X_prepared, train_y)

best_alpha = grid_search.best_params_['alpha']
# NOTE(review): if best_alpha sits on an edge of param_grid, the optimum may lie
# outside the searched range -- rerun the search with a grid extended past that edge.

# GridSearchCV (refit=True by default) has already refitted a Ridge model with the
# best alpha on the full training data, so reuse it instead of fitting a second time.
ridge_reg_best_alpha = grid_search.best_estimator_
ridge_reg_best_alpha

In [16]:
# Testing RMSE of the Ridge model selected by the grid search.
test_pred_ridge = ridge_reg_best_alpha.predict(test_X_prepared)
test_mse_ridge = mean_squared_error(y_true=test_y, y_pred=test_pred_ridge)
test_rmse_ridge = np.sqrt(test_mse_ridge)

# Show the chosen alpha next to its testing RMSE.
(best_alpha, test_rmse_ridge)

(100, 72734.07761290054)

# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $\alpha$ value to calculate the testing RMSE.

In [17]:
from sklearn.linear_model import Lasso

# LASSO fit with the fixed tuning parameter alpha=1000.
lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(X=train_X_prepared, y=train_y)

In [18]:
# Testing RMSE of the alpha=1000 LASSO model.
test_pred_lasso = lasso_reg.predict(test_X_prepared)
test_mse_lasso = mean_squared_error(y_true=test_y, y_pred=test_pred_lasso)
test_rmse_lasso = np.sqrt(test_mse_lasso)
print("Testing RMSE for LASSO (alpha=1000):", test_rmse_lasso)

Testing RMSE for LASSO (alpha=1000): 73037.08337454066


In [19]:
# Candidate alpha values for the LASSO grid search.
param_grid_lasso = {'alpha': [0.1, 1, 10, 100, 1000]}

# With the default iteration cap, several fits in this search stopped before the
# coordinate-descent solver converged (convergence warnings in the cell output).
# Allow more iterations; raise the cap further if the warnings persist.
lasso_reg_grid = Lasso(max_iter=10000)

# 5-fold cross-validated grid search; scores are negated MSE (larger is better).
grid_search_lasso = GridSearchCV(lasso_reg_grid, param_grid_lasso, cv=5, scoring='neg_mean_squared_error')
grid_search_lasso.fit(train_X_prepared, train_y)

best_alpha_lasso = grid_search_lasso.best_params_['alpha']
# NOTE(review): if best_alpha_lasso sits on an edge of param_grid_lasso, the optimum
# may lie outside the searched range -- rerun with a grid extended past that edge.

# GridSearchCV (refit=True by default) has already refitted a LASSO model with the
# best alpha on the full training data, so reuse it instead of fitting a second time.
lasso_reg_best_alpha = grid_search_lasso.best_estimator_

# Testing RMSE of the selected LASSO model.
test_pred_lasso_best_alpha = lasso_reg_best_alpha.predict(test_X_prepared)
test_mse_lasso_best_alpha = mean_squared_error(test_y, test_pred_lasso_best_alpha)
test_rmse_lasso_best_alpha = np.sqrt(test_mse_lasso_best_alpha)

best_alpha_lasso, test_rmse_lasso_best_alpha

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


(1000, 73037.08337454066)

# Elastic Net

**Task 4:**
1. Fit an Elastic Net Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the Elastic Net model help select variables?
2. Use grid search to choose the best settings for the $\alpha$ and `l1_ratio` values (You may need multiple grid searches). Use the Elastic Net model with the best settings to calculate the testing RMSE.

In [21]:
from sklearn.linear_model import ElasticNet

# Elastic Net with the fixed settings alpha=10 and l1_ratio=0.1.
elastic_net_reg = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net_reg.fit(X=train_X_prepared, y=train_y)

# Testing RMSE of that fixed-setting model.
test_pred_elastic_net = elastic_net_reg.predict(test_X_prepared)
test_mse_elastic_net = mean_squared_error(y_true=test_y, y_pred=test_pred_elastic_net)
test_rmse_elastic_net = np.sqrt(test_mse_elastic_net)

print("Testing RMSE for Elastic Net (alpha=10, l1_ratio=0.1):", test_rmse_elastic_net)

Testing RMSE for Elastic Net (alpha=10, l1_ratio=0.1): 110947.06543161202


In [22]:
# Candidate settings for the Elastic Net grid search; the ranges can be extended as needed.
param_grid_elastic_net = {'alpha': [0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]}

elastic_net_reg_grid = ElasticNet()

# 5-fold cross-validated search over every (alpha, l1_ratio) pair; scores are
# negated MSE (larger is better).
grid_search_elastic_net = GridSearchCV(elastic_net_reg_grid, param_grid_elastic_net, cv=5, scoring='neg_mean_squared_error')
grid_search_elastic_net.fit(train_X_prepared, train_y)

best_alpha_elastic_net = grid_search_elastic_net.best_params_['alpha']
best_l1_ratio_elastic_net = grid_search_elastic_net.best_params_['l1_ratio']
# NOTE(review): if either selected value sits on an edge of its grid, the optimum may
# lie outside the searched range -- rerun with a grid extended past that edge.

# GridSearchCV (refit=True by default) has already refitted an Elastic Net with the
# best settings on the full training data, so reuse it instead of fitting again.
elastic_net_reg_best_params = grid_search_elastic_net.best_estimator_

# Testing RMSE of the selected Elastic Net model.
test_pred_elastic_net_best_params = elastic_net_reg_best_params.predict(test_X_prepared)
test_mse_elastic_net_best_params = mean_squared_error(test_y, test_pred_elastic_net_best_params)
test_rmse_elastic_net_best_params = np.sqrt(test_mse_elastic_net_best_params)

best_alpha_elastic_net, best_l1_ratio_elastic_net, test_rmse_elastic_net_best_params

(0.1, 0.5, 73229.8979235774)