In [4]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [5]:
# Load the data, keep only the first 5000 observations, and derive the
# attribute combinations in a single chain so `housing` is bound exactly once.
housing = (
    pd.read_csv("housing.csv")
    .head(5000)  # only use the first 5000 observations
    ### Attribute Combinations
    .assign(
        rooms_per_household=lambda df: df["total_rooms"] / df["households"],
        bedrooms_per_room=lambda df: df["total_bedrooms"] / df["total_rooms"],
        population_per_household=lambda df: df["population"] / df["households"],
    )
)

# housing.head()

## Training and testing sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical across re-runs.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

In [7]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: impute missing values with the median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: impute missing values with the most frequent value,
# then one-hot encode (categories unseen during fit are ignored at transform).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column to a pipeline by dtype:
# numeric dtypes -> num_pipeline, object dtype -> cat_pipeline.
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)



## Training X and y variables

In [8]:
# Response: an independent copy of the target column.
train_y = train_set["median_house_value"].copy()
# Predictors: every column except the target.
train_X = train_set.drop(columns="median_house_value")

# Fit the preprocessing transformer on the training predictors and transform them.
train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [9]:
# Response: an independent copy of the target column.
test_y = test_set["median_house_value"].copy()
# Predictors: every column except the target.
test_X = test_set.drop(columns="median_house_value")

# Transform X in the testing set: only `transform` is called here, so the
# transformer is not re-fit on the test predictors.
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $\alpha$ value to calculate the testing RMSE.

# Elastic Net

**Task 4:**
1. Fit an Elastic Net regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the Elastic Net model help select variables?
2. Use grid search to choose the best settings for the $\alpha$ and `l1_ratio` values (You may need multiple grid searches). Use the elastic net model with the best settings to calculate the testing RMSE.

In [10]:
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [11]:
# 1. Fit an Elastic Net regression model with alpha=10 and l1_ratio=0.1
elastic_net = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net.fit(train_X_prepared, train_y)

# Calculate testing RMSE.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this works on every scikit-learn version.
test_predictions = elastic_net.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_predictions))
print("Testing RMSE for Elastic Regression (alpha=10, l1_ratio=0.1):", test_rmse)

# Inspect the coefficients: any coefficient that is exactly zero corresponds
# to a variable the Elastic Net's L1 penalty has dropped from the model.
print("Coefficients:", elastic_net.coef_)

Testing RMSE for Elastic Regression (alpha=10, l1_ratio=0.1): 110947.06543161202
Coefficients: [ -807.74294326 -1431.75759049  1315.93686909  1243.2773355
   347.25345708  -555.86136191   588.43545429  7473.96744829
  1797.74205621 -2578.10771589  -151.08733463  1338.40959097
 -2588.77731299   644.15304073   605.99246111]


In [19]:
# 2. Grid search over alpha and l1_ratio with 5-fold cross-validation.
# The scorer is the negated RMSE, so the highest score is the lowest error.
param_grid = {'alpha': [0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]}
grid_search = GridSearchCV(ElasticNet(), param_grid, scoring='neg_root_mean_squared_error', cv=5)
grid_search.fit(train_X_prepared, train_y)

# Get best settings
best_alpha = grid_search.best_params_['alpha']
best_l1_ratio = grid_search.best_params_['l1_ratio']
print("Best alpha:", best_alpha)
print("Best l1_ratio:", best_l1_ratio)
# NOTE(review): if the selected alpha is the smallest (or largest) value in
# param_grid, the optimum may lie outside the grid -- rerun the search with
# the grid extended in that direction.

# Use Elastic Net model with best settings.
# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so reuse that fitted model instead of fitting again.
best_elastic_net = grid_search.best_estimator_

# Calculate testing RMSE with best Elastic Net model.
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6,
# so take the square root of the MSE explicitly.
best_test_predictions = best_elastic_net.predict(test_X_prepared)
best_test_rmse = np.sqrt(mean_squared_error(test_y, best_test_predictions))
print("Testing RMSE for best Elastic Net model:", best_test_rmse)


Best alpha: 0.1
Best l1_ratio: 0.5
Testing RMSE for best Elastic Net model: 73229.8979235774
