<a href="https://colab.research.google.com/github/Bborub/baseball/blob/main/DSC502_050323_regression_housing_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Grid Search using Ridge Regression and Random Forest on the KC housing dataset

## Imports


In [None]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

## Load dataset and define features and dependent variable (price)

In [None]:
# Read the KC housing table from cloud storage and immediately discard every row
# containing a missing value, since Ridge regression cannot be fit on NaNs.
df = pd.read_csv('https://storage.googleapis.com/scsu-data-science/kc_house_data.csv').dropna()

# Predictor columns used by every model in this notebook; `price` is the target.
feature_columns = ['bedrooms', 'bathrooms', 'floors', 'sqft_above', 'sqft_basement', 'sqft_lot']
X = df[feature_columns]
y = df['price']

## Construct the ridge regression model and define search space

### Optimization score is $R^2$

In [None]:
# define model
# random_state seeds the data shuffling that scikit-learn performs for the
# stochastic 'sag' solver (one of the grid values below); without it the
# search scores for that solver change from run to run.
model = Ridge(random_state = 1)

# define evaluation: 5 folds repeated 3 times = 15 train/validation splits per
# candidate, seeded so the splits are identical on every run
cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state = 1)

# define search space: 4 solvers x 8 alphas x 2 intercept settings = 64 candidates
space = {
    'solver': ['svd', 'cholesky', 'lsqr', 'sag'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'fit_intercept': [True, False],
}

# define search: exhaustive grid over `space`, ranked by R^2, n_jobs = -1 to
# run the fits in parallel
search = GridSearchCV(model,
                      space,
                      scoring = 'r2',
                      n_jobs = -1,
                      cv = cv)

## Execute search (this takes about 10 minutes)

In [None]:
# Run the grid search defined above: 64 parameter combinations, each evaluated
# on the 15 repeated K-fold splits. The value returned by fit is kept as
# `result`, which the reporting cell reads best_score_ / best_params_ from.
result = search.fit(X, y)

### Best $R^2$ and parameter values

In [None]:
# Report the winning cross-validated score and the parameter combination
# that produced it, one item per line.
print(f'Best Score: {result.best_score_}',
      f'Best Hyperparameters: {result.best_params_}',
      sep = '\n')

Best Score: 0.50706480664131
Best Hyperparameters: {'alpha': 100, 'fit_intercept': True, 'solver': 'cholesky'}


## Rerun using negative mean absolute error as score

In [None]:
# Reuse the estimator, parameter grid and CV splitter defined earlier; only the
# metric used to rank candidates changes. Scores are negated MAE, so the best
# candidate is the one whose score is closest to zero.
search = GridSearchCV(
    estimator = model,
    param_grid = space,
    scoring = 'neg_mean_absolute_error',
    cv = cv,
    n_jobs = -1,
)

# Fit the whole grid again under the new metric.
result = search.fit(X, y)

# Show the best score and the hyperparameters that achieved it.
print(f'Best Score: {result.best_score_}',
      f'Best Hyperparameters: {result.best_params_}',
      sep = '\n')

Best Score: -169580.66115881328
Best Hyperparameters: {'alpha': 1e-05, 'fit_intercept': True, 'solver': 'cholesky'}


## Optimizing a Random Forest regressor model

In [None]:
# define model
# A random forest is built from randomised bootstrap samples and feature
# subsets; fixing random_state makes the cross-validated scores (and therefore
# the selected hyperparameters) repeatable across runs.
model = RandomForestRegressor(random_state = 1)

# define evaluation: 5 folds repeated 3 times = 15 splits per candidate, seeded
cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state = 1)

# define search space: 4 depths x 3 split thresholds = 12 candidates
param_grid = {
    'max_depth': [9, 11, 13, 15],
    'min_samples_split': [8, 10, 12],
}

# define search: exhaustive grid ranked by negated mean absolute error
# (closest to zero wins), n_jobs = -1 to run the fits in parallel
search = GridSearchCV(model,
                      param_grid = param_grid,
                      scoring = 'neg_mean_absolute_error',
                      n_jobs = -1,
                      cv = cv)

# Fit every candidate on every split.
result = search.fit(X, y)

# Report the best score and the hyperparameters that achieved it.
print(f'Best Score: {result.best_score_}')
print(f'Best Hyperparameters: {result.best_params_}')

Best Score: -150485.9697892562
Best Hyperparameters: {'max_depth': 11, 'min_samples_split': 12}
