In [3]:
import sys
import os
 
import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Data

In [4]:
housing = pd.read_csv("housing.csv")

housing = housing.iloc[:5000,:] # only use the first 5000 observations

### Attribute Combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# housing.head()

## Training and testing sets

In [5]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## Preprocessing

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)



## Training X and y variables

In [7]:
train_X = train_set.drop("median_house_value", axis=1)
train_y = train_set["median_house_value"].copy()

train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [8]:
test_X = test_set.drop("median_house_value", axis=1)
test_y = test_set["median_house_value"].copy()

# transform X in testing set
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [89]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



In [90]:
model = LinearRegression()
model.fit(train_X_prepared, train_y)

lintest_pred = model.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, lintest_pred)
RMSE = MeanSE**.5
RMSE

72163.38641324975

# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

In [37]:
from sklearn.linear_model import Ridge


In [79]:
ridge_reg = Ridge(alpha=10, solver="cholesky")
ridge_reg.fit(train_X_prepared,train_y)

ridgetest_pred = ridge_reg.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, ridgetest_pred)
RMSE = MeanSE**.5
RMSE


72195.3401723892

In [59]:
from sklearn.model_selection import GridSearchCV
n = 200
a = list(range(1,n))
params = {'alpha': a}

gridcv = GridSearchCV(Ridge(), params)
gridcv.fit(train_X_prepared,train_y)
gridcv.best_params_



{'alpha': 157}

In [80]:
ridge_reg = Ridge(alpha = 157,solver = "cholesky")
ridge_reg.fit(train_X_prepared,train_y)

ridgetest_pred = ridge_reg.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, ridgetest_pred)
RMSE = MeanSE**.5
RMSE

73019.89560523246

# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $alpha$ value to calculate the testing RMSE.

In [81]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(train_X_prepared,train_y)
lassotest_pred = lasso_reg.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, lassotest_pred)
RMSE = MeanSE**.5
RMSE

73037.08337454067

In [66]:
from sklearn.model_selection import GridSearchCV
a = np.linspace(start=230, stop=300, num =12)
params = {'alpha': a}

gridcv = GridSearchCV(Lasso(), params)
gridcv.fit(train_X_prepared,train_y)
gridcv.best_params_

{'alpha': 261.8181818181818}

In [82]:
lasso_reg = Lasso(alpha=262)
lasso_reg.fit(train_X_prepared,train_y)
lassotest_pred = lasso_reg.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, lassotest_pred)
RMSE = MeanSE**.5
RMSE

72339.59733542392

# Elastic Net

**Task 4:**
1. Fit an Elastic Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best settings for $alpha$ and `l1_ratio` value (You may need multiple grid searches). Use the elastic net model with the best settings to calculate the testing RMSE.

In [95]:
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=10, l1_ratio=0.1)
elastic_net.fit(train_X_prepared,train_y)
elastictest_pred = elastic_net.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, elastictest_pred)
RMSE = MeanSE**.5
RMSE



260244.40788606717

In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

a = np.linspace(start=250, stop=300, num =12)

params = {'alpha': a,
          'l1_ratio': [.9,.95,1] }
          

gridcv_elastic_model = GridSearchCV(ElasticNet(), params)

gridcv_elastic_model.fit(train_X_prepared,train_y)

gridcv_elastic_model.best_params_

{'alpha': 259.09090909090907, 'l1_ratio': 1}

In [88]:
elastic_net = ElasticNet(alpha=259, l1_ratio=1)
elastic_net.fit(train_X_prepared,train_y)
elastictest_pred = elastic_net.predict(test_X_prepared)
MeanSE = mean_squared_error(test_y, elastictest_pred)
RMSE = MeanSE**.5
RMSE

72337.89453189308