In [14]:
import sys
import os

import sklearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data

In [15]:
housing = pd.read_csv("housing.csv")

housing = housing.iloc[:5000,:] # only use the first 5000 observations

### Attribute Combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# housing.head()

## Training and testing sets

In [16]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## Preprocessing

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)



## Training X and y variables

In [18]:
train_X = train_set.drop("median_house_value", axis=1)
train_y = train_set["median_house_value"].copy()

train_X_prepared = preprocessing.fit_transform(train_X)

## Testing X and y variables

In [19]:
test_X = test_set.drop("median_house_value", axis=1)
test_y = test_set["median_house_value"].copy()

# transform X in testing set
test_X_prepared = preprocessing.transform(test_X)


# Linear Regression Model

**Task 1:** 

1. Fit a Linear Regression model using `train_X_prepared` as predictor, and `train_y` as response.
2. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.

In [20]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()



In [21]:
from sklearn.metrics import mean_squared_error

model.fit(train_X_prepared, train_y)
    
#RMSE

y_val_pred = model.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)            

In [22]:
print(np.sqrt(val_mse))

72129.57449006113


# Ridge

**Task 2:**
1. Fit a Ridge Regression model with tuning parameter $\alpha=10$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE using `test_X_prepared` as predictor, and `test_y` as response.
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Then use the Ridge model with the best alpha value to calculate the testing RMSE.

In [23]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [33]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_reg = Ridge(alpha=10, solver="cholesky")
ridge_reg.fit(train_X_prepared,train_y)

y_val_pred = ridge_reg.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)  
print(np.sqrt(val_mse))

72195.3401723892


In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

ridge = Ridge()

params = {'alpha': np.arange(157.17,157.18,0.001)}

gridcv_ridge_model = GridSearchCV(ridge, params, cv=5)

gridcv_ridge_model.fit(train_X_prepared, train_y)


gridcv_ridge_model.best_params_
gridcv_ridge_model.best_estimator_




In [95]:
y_val_pred = gridcv_ridge_model.best_estimator_.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)  
print(np.sqrt(val_mse))

73037.08337454066


# LASSO

**Task 3:**
1. Fit a LASSO Regression model with tuning parameter $\alpha=1000$ using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best $\alpha$ value (You may need multiple grid searches). Use the LASSO model with the best $alpha$ value to calculate the testing RMSE.

In [62]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=1000)
lasso_reg.fit(train_X_prepared, train_y)


y_val_pred = lasso_reg.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)  
print(np.sqrt(val_mse))

73037.08337454066


In [88]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

lasso = Lasso()

params = {'alpha': np.arange(200,300, 1)}

gridcv_lasso_model = GridSearchCV(lasso, params, cv=5)

gridcv_lasso_model.fit(train_X_prepared, train_y)


gridcv_lasso_model.best_params_
gridcv_lasso_model.best_estimator_




In [94]:
y_val_pred = gridcv_lasso_model.best_estimator_.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)  
print(np.sqrt(val_mse))

72336.72147259413


# Elastic Net

**Task 4:**
1. Fit an Elastic Regression model with `alpha=10` and `l1_ratio=0.1` using `train_X_prepared` as predictor, and `train_y` as response. Then, calculate the testing RMSE. Can the LASSO model help select variables?
2. Use grid search to choose the best settings for $alpha$ and `l1_ratio` value (You may need multiple grid searches). Use the elastic net model with the best settings to calculate the testing RMSE.

In [100]:
params = {'alpha': [10],
          'l1_ratio': [ 0.1] }
        
gridcv_elastic_model = GridSearchCV(ElasticNet(), params, cv=5)

gridcv_elastic_model.fit(train_X_prepared, train_y)

y_val_pred = gridcv_elastic_model.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)  
print(np.sqrt(val_mse))

110947.06543161202


In [101]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
params = {'alpha': np.arange(200,300, 1),
          'l1_ratio': np.arange(0,1,0.1) }
        
gridcv_elastic_model = GridSearchCV(ElasticNet(), params, cv=5)

gridcv_elastic_model.fit(train_X_prepared, train_y)

gridcv_elastic_model.best_params_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'alpha': 200, 'l1_ratio': 0.9}

In [103]:
gridcv_elastic_model.best_estimator_.fit(train_X_prepared, train_y)

In [106]:
y_val_pred = gridcv_elastic_model.best_estimator_.predict(test_X_prepared)
val_mse = mean_squared_error(y_val_pred, test_y)  
print(np.sqrt(val_mse))

114620.86233501969
