In [None]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Data

In [None]:
# Load the housing data and keep only the first 5000 observations.
# The explicit .copy() makes the subset an independent DataFrame, so the
# column assignments below write to it directly and cannot trigger pandas'
# SettingWithCopyWarning (assigning into a slice of another frame).
housing = pd.read_csv("housing.csv").iloc[:5000, :].copy()

### Attribute Combinations
# Ratio features built from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

# housing.head()

## Training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: fill missing values with the column median, then
# standardize each feature.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not present when the encoder was fitted.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)


from sklearn.compose import make_column_selector, make_column_transformer

# Route columns to the matching pipeline by dtype: numeric dtypes go through
# num_pipeline, object (string) dtypes go through cat_pipeline.
numeric_columns = make_column_selector(dtype_include=np.number)
categorical_columns = make_column_selector(dtype_include=object)

preprocessing = make_column_transformer(
    (num_pipeline, numeric_columns),
    (cat_pipeline, categorical_columns),
)



## Training X and y variables

In [None]:
# Response: an independent copy of the target column.
train_y = train_set["median_house_value"].copy()
# Predictors: every column except the target.
train_X = train_set.drop(columns=["median_house_value"])

# Fit the preprocessing steps on the training predictors and transform them
# in the same call.
train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [None]:
# Response: an independent copy of the target column.
test_y = test_set["median_house_value"].copy()
# Predictors: every column except the target.
test_X = test_set.drop(columns=["median_house_value"])

# Only transform here (no fitting): the test predictors reuse the
# preprocessing already fitted on the training set.
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression of train_y on the preprocessed training predictors.
# The fit call stays as the last expression so the cell still displays the
# fitted estimator.
model = LinearRegression()
model.fit(X=train_X_prepared, y=train_y)

In [None]:
from sklearn.metrics import mean_squared_error
# Predict on the held-out test set with the fitted linear regression model.
test_pred = model.predict(test_X_prepared)

# Testing RMSE = sqrt(MSE). Taking the square root explicitly avoids
# mean_squared_error's `squared=False` argument, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))

test_rmse

72129.57449006115

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with a fixed penalty strength alpha=10, solved in closed
# form with the Cholesky solver.
ridge_reg = Ridge(alpha=10, solver="cholesky")
ridge_reg.fit(train_X_prepared, train_y)

# Testing RMSE = sqrt(MSE). Taking the square root explicitly avoids
# mean_squared_error's `squared=False` argument, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
test_pred = ridge_reg.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
test_rmse


72195.3401723892

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths: every integer alpha from 1 to 200.
params = {'alpha': list(range(1, 201))}

# Repeat the grid search with 3- to 10-fold cross-validation to see how much
# the selected alpha depends on the number of folds.
for i in range(3, 11):
    gridcv_ridge_model = GridSearchCV(
        Ridge(), params, cv=i, scoring='neg_root_mean_squared_error'
    )
    gridcv_ridge_model.fit(train_X_prepared, train_y)

    print(f"{i} :", gridcv_ridge_model.best_params_['alpha'])
    print(gridcv_ridge_model.best_estimator_)

    # predict() delegates to the best estimator, which GridSearchCV refits on
    # the full training data (refit=True is the default).
    test_pred = gridcv_ridge_model.predict(test_X_prepared)
    # Testing RMSE = sqrt(MSE); avoids the `squared=False` argument that was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
    print(test_rmse)

# Scratch notes kept from the original cell (presumably alpha values that were
# also evaluated by hand -- TODO confirm):
#   75-95
#   122, 123, 166, 157, 165, 63, 176, 61, 186

3 : 125
Ridge(alpha=125)
72861.5404759224
4 : 168
Ridge(alpha=168)
73073.72379473266
5 : 156
Ridge(alpha=156)
73014.99242886041
6 : 168
Ridge(alpha=168)
73073.72379473266
7 : 63
Ridge(alpha=63)
72532.67654606629
8 : 176
Ridge(alpha=176)
73112.78529897328
9 : 61
Ridge(alpha=61)
72521.13672345363
10 : 183
Ridge(alpha=183)
73146.93363745426


0.1 72130.19582574743
10 72195.3401723892
122 72846.4739551109
123 72851.50180248583
166 73063.94913957632
157 73019.89560523246
165 73059.06005418775
63 72532.67654606629
176 73112.78529897328
61 72521.13672345363
186 73161.5649612211
75 72600.29515074384
76 72605.81508546752
77 72611.31879394961
78 72616.80657834343
79 72622.27873764264
80 72627.73556759395
81 72633.17736062066
82 72638.60440575697
83 72644.01698859165
84 72649.41539122051
85 72654.79989220692
86 72660.17076655004
87 72665.52828565943
88 72670.87271733645
89 72676.20432576108
90 72681.52337148422
91 72686.83011142476
92 72692.12479887118
93 72697.40768348728
94 72702.67901132129
95 72707.93902481889
96 72713.18796283899
97 72718.42606067234
98 72723.65355006288
99 72728.87065923127
100 72734.07761290052
101 72739.27463232366
102 72744.46193531287
103 72749.6397362704
104 72754.80824622072
72130.19582574743


# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $\alpha$ value to calculate the testing RMSE.

In [None]:
from sklearn.linear_model import Lasso

# LASSO regression with a fixed penalty strength alpha=1000.
lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(train_X_prepared, train_y)

# Testing RMSE = sqrt(MSE). Taking the square root explicitly avoids
# mean_squared_error's `squared=False` argument, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
test_pred = lasso_reg.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
test_rmse

# Yes, the LASSO model can help select variables: its L1 penalty can shrink
# coefficients to exactly zero, so features with a zero coefficient are dropped
# from the model and only those with non-zero coefficients are kept
# (inspect lasso_reg.coef_ to see which).

73037.08337454066

In [None]:


# Candidate penalty strengths: every integer alpha from 200 to 999.
params = {'alpha': list(range(200, 1000))}

# Repeat the grid search with 3- to 10-fold cross-validation to see how much
# the selected alpha depends on the number of folds.
for i in range(3, 11):
    # Named for the model it actually searches over (Lasso).
    gridcv_lasso_model = GridSearchCV(
        Lasso(), params, cv=i, scoring='neg_root_mean_squared_error'
    )
    gridcv_lasso_model.fit(train_X_prepared, train_y)

    print(f"{i} :", gridcv_lasso_model.best_params_['alpha'])
    print(gridcv_lasso_model.best_estimator_)

    # predict() delegates to the best estimator, which GridSearchCV refits on
    # the full training data (refit=True is the default).
    test_pred = gridcv_lasso_model.predict(test_X_prepared)
    # Testing RMSE = sqrt(MSE); avoids the `squared=False` argument that was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))
    print(test_rmse)
  

3 : 242
Lasso(alpha=242)
72328.36827505077
4 : 350
Lasso(alpha=350)
72393.58738970847
5 : 257
Lasso(alpha=257)
72336.72147259413
6 : 297
Lasso(alpha=297)
72360.15623711773
7 : 766
Lasso(alpha=766)
72749.20088555175
8 : 300
Lasso(alpha=300)
72361.96997717892
9 : 758
Lasso(alpha=758)
72740.59501510796
10 : 300
Lasso(alpha=300)
72361.96997717892


# Elastic Net

**Task 4:**
1. Fit an Elastic Net regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the Elastic Net model help select variables?
2. Use grid search to choose the best settings for $\alpha$ and `l1_ratio` (you may need multiple grid searches). Use the Elastic Net model with the best settings to calculate the testing RMSE.

In [None]:
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
# 1. Fit an Elastic Net model with alpha=10 and l1_ratio=0.1
#    (l1_ratio sets the mix between the L1 and L2 penalties).
elastic_net = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net.fit(train_X_prepared, train_y)

# Testing RMSE = sqrt(MSE). Taking the square root explicitly avoids
# mean_squared_error's `squared=False` argument, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
test_predictions = elastic_net.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_predictions))
print("Testing RMSE for Elastic Regression (alpha=10, l1_ratio=0.1):", test_rmse)

# Print the coefficients to check whether the Elastic Net model performs
# variable selection: any coefficient that is exactly zero means that feature
# was dropped.
print("Coefficients:", elastic_net.coef_)

In [None]:
# Search jointly over the penalty strength (alpha) and the L1/L2 mix
# (l1_ratio) with 5-fold cross-validation, scored by negative RMSE.
param_grid = {'alpha': [0.1, 1, 10, 100], 'l1_ratio': [0.1, 0.5, 0.9]}
grid_search = GridSearchCV(ElasticNet(), param_grid, scoring='neg_root_mean_squared_error', cv=5)
grid_search.fit(train_X_prepared, train_y)

# Get best settings
best_alpha = grid_search.best_params_['alpha']
best_l1_ratio = grid_search.best_params_['l1_ratio']
print("Best alpha:", best_alpha)
print("Best l1_ratio:", best_l1_ratio)

# GridSearchCV (refit=True by default) has already refitted an ElasticNet with
# the best settings on the full training data, so reuse that estimator instead
# of constructing and fitting the same model a second time.
best_elastic_net = grid_search.best_estimator_

# Testing RMSE = sqrt(MSE). Taking the square root explicitly avoids
# mean_squared_error's `squared=False` argument, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
best_test_predictions = best_elastic_net.predict(test_X_prepared)
best_test_rmse = np.sqrt(mean_squared_error(test_y, best_test_predictions))
print("Testing RMSE for best Elastic Net model:", best_test_rmse)
