In [1]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Data

In [2]:
# Load the data, keep only the first 5000 observations, and append the
# combined attributes (ratios of the raw count columns) in a single chain.
housing = (
    pd.read_csv("housing.csv")
    .head(5000)
    .assign(
        ### Attribute Combinations
        rooms_per_household=lambda df: df["total_rooms"] / df["households"],
        bedrooms_per_room=lambda df: df["total_bedrooms"] / df["total_rooms"],
        population_per_household=lambda df: df["population"] / df["households"],
    )
)

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


## Training and testing sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

In [4]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: impute missing values with the median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: impute with the most frequent value, then one-hot
# encode (handle_unknown="ignore" is passed so unknown categories do not raise).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Columns are routed to a pipeline by dtype rather than by name.
numeric_columns = make_column_selector(dtype_include=np.number)
object_columns = make_column_selector(dtype_include=object)

preprocessing = make_column_transformer(
    (num_pipeline, numeric_columns),
    (cat_pipeline, object_columns),
)



## Training X and y variables

In [5]:
# Response is the median house value; every other column is a predictor.
train_y = train_set["median_house_value"].copy()
train_X = train_set.drop(columns="median_house_value")

# Fit the preprocessing transformer on the training predictors and apply it.
train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [6]:
# Same response/predictor separation as for the training split.
test_y = test_set["median_house_value"].copy()
test_X = test_set.drop(columns="median_house_value")

# transform X in testing set (transform only: the transformer is not refit here)
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [7]:
from sklearn.linear_model import LinearRegression

# fit linear regression model (fit() returns the estimator, so the
# construction and the fit are chained):
lin_reg = LinearRegression().fit(train_X_prepared, train_y)

lin_reg

In [8]:
from sklearn.metrics import mean_squared_error

# get testing RMSE: predict on the prepared test predictors, then take the
# square root of the mean squared error against the true responses
test_fitted = lin_reg.predict(test_X_prepared)
test_rmse = np.sqrt(
    mean_squared_error(y_true=test_y, y_pred=test_fitted)
)

test_rmse

72129.57449006115

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

In [9]:
from sklearn.linear_model import Ridge

# Ridge regression with the penalty fixed at alpha=10, fitted on the training data.
ridge_reg = Ridge(alpha=10, solver="cholesky").fit(train_X_prepared, train_y)

# testing rmse:
test_fitted = ridge_reg.predict(test_X_prepared)
test_rmse = np.sqrt(
    mean_squared_error(y_true=test_y, y_pred=test_fitted)
)

test_rmse

72195.3401723892

In [10]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# candidate alpha values (we zoomed in until the grid search found a value)
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 150, 159, 200, 500, 1000]}

# 5-fold cross-validated grid search over the candidates, starting from a
# default Ridge Regression model
ridge_reg_model = Ridge()
ridge_search = GridSearchCV(ridge_reg_model, params, cv=5).fit(
    train_X_prepared, train_y
)

# report the winning alpha
best_alpha = ridge_search.best_params_['alpha']
print(f"Best alpha value: {best_alpha}")

Best alpha value: 159


In [11]:
from sklearn.linear_model import Ridge

# Refit Ridge with the alpha selected by the grid search. The value is read
# from `ridge_search` rather than hard-coded from its printed output, so this
# cell cannot drift out of sync when the candidate grid or the data change.
ridge_reg = Ridge(alpha=ridge_search.best_params_["alpha"], solver="cholesky")

ridge_reg.fit(train_X_prepared, train_y)

# testing RMSE of the tuned model
test_fitted = ridge_reg.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_fitted))

test_rmse

73029.69640331468

# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (you may need multiple grid searches). Use the LASSO model with the best $\alpha$ value to calculate the testing RMSE.

In [29]:
from sklearn.linear_model import Lasso

# LASSO regression with the penalty fixed at alpha=1000.
lasso_reg = Lasso(alpha=1000).fit(train_X_prepared, train_y)

# testing RMSE on the held-out set
test_fitted = lasso_reg.predict(test_X_prepared)
test_rmse = np.sqrt(
    mean_squared_error(y_true=test_y, y_pred=test_fitted)
)

test_rmse


73037.08337454066

In [30]:
lasso_reg.coef_
# Yes, the LASSO model can help select variables.
# With alpha = 1000, some of the coefficients are driven exactly to zero (see the output below),
# which drops the corresponding features from the model.
# The retained variables are the ones whose coefficients are not equal to zero.

array([-30702.12932419, -30289.65620361,  11821.78108691,   1654.25936136,
         2972.74269704, -37595.83916317,  37094.10561768,  76322.82540894,
         5080.61308451,  12635.5139211 ,      0.        ,     -0.        ,
       -45716.73979158,      0.        ,   5807.27519117])

In [14]:
from sklearn.model_selection import GridSearchCV

# create a LASSO Regression model
# The iteration cap is raised above the estimator's default because the
# search previously emitted coordinate-descent convergence warnings;
# raise it further if the warnings persist.
lasso_reg_model = Lasso(max_iter=10_000)

# define a range of alpha values to search through
params = {'alpha': [0.001, 0.01, 0.1, 0.5, 1, 100, 200, 257, 300, 500, 1000]}

# use GridSearchCV (5-fold) to find the best alpha value; the model configured
# above is passed in, instead of building a second, unrelated Lasso()
lasso_search = GridSearchCV(lasso_reg_model, params, cv=5)
lasso_search.fit(train_X_prepared, train_y)

# get the best alpha value
best_alpha = lasso_search.best_params_['alpha']
print(f"Best alpha value: {best_alpha}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha value: 257


In [15]:
from sklearn.linear_model import Lasso

# Refit LASSO with the alpha selected by the grid search. Reading it from
# `lasso_search` (instead of copying the printed value by hand) keeps this
# cell consistent with whatever the search actually selected.
lasso_reg = Lasso(alpha=lasso_search.best_params_["alpha"])
lasso_reg.fit(train_X_prepared, train_y)

# testing RMSE of the tuned model
test_fitted = lasso_reg.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_fitted))

test_rmse

72336.72147259413

# Elastic Net

**Task 4:**
1. Fit an Elastic Net Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the Elastic Net model help select variables?
2. Use grid search to choose the best settings for `alpha` and `l1_ratio` (you may need multiple grid searches). Use the Elastic Net model with the best settings to calculate the testing RMSE.

In [16]:
from sklearn.linear_model import ElasticNet

# Elastic Net with fixed settings: alpha=10 and l1_ratio=0.1.
elastic_reg = ElasticNet(alpha=10, l1_ratio=0.1).fit(train_X_prepared, train_y)

# testing RMSE on the held-out set
test_fitted = elastic_reg.predict(test_X_prepared)
test_rmse = np.sqrt(
    mean_squared_error(y_true=test_y, y_pred=test_fitted)
)

test_rmse

110947.06543161202

In [17]:
from sklearn.model_selection import GridSearchCV

# create an Elastic Net Regression model
# The iteration cap is raised above the estimator's default because the
# search previously emitted coordinate-descent convergence warnings;
# raise it further if the warnings persist.
elastic_reg_model = ElasticNet(max_iter=10_000)

# define a range of alpha values and l1 ratios to search through
params = {
    'alpha': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 1, 10],
    'l1_ratio': [0, 0.1, 0.5, 0.8, 1],
}

# 5-fold cross-validated search over every (alpha, l1_ratio) combination
elastic_search = GridSearchCV(elastic_reg_model, params, cv=5)
elastic_search.fit(train_X_prepared, train_y)

# get the best alpha/l1_ratio
best_alpha = elastic_search.best_params_['alpha']
best_l1 = elastic_search.best_params_['l1_ratio']

print(f"Best alpha: {best_alpha} and Best l1 ratio: {best_l1}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha: 0.1 and Best l1 ratio: 0.5


  model = cd_fast.enet_coordinate_descent(


In [18]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Refit Elastic Net with the settings selected by the grid search. Both values
# are read from `elastic_search` rather than copied by hand from its printed
# output, so this cell always matches what the search actually selected.
elastic_reg = ElasticNet(
    alpha=elastic_search.best_params_["alpha"],
    l1_ratio=elastic_search.best_params_["l1_ratio"],
)
elastic_reg.fit(train_X_prepared, train_y)

# testing RMSE of the tuned model
test_fitted = elastic_reg.predict(test_X_prepared)
test_rmse = np.sqrt(mean_squared_error(test_y, test_fitted))

test_rmse

73229.8979235774