# 3. Model Selection

## Summary of Commands

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from mymetrics import root_mean_squared_log_error

# Fixed seed so the shuffled CV folds -- and therefore the reported scores --
# are identical on every Restart & Run All.
SEED = 42

# Load the housing sample; this baseline uses a single feature column.
hs = pd.read_csv('data/housing_sample.csv')
X = hs[['GrLivArea']].values
# pop() also removes 'SalePrice' from `hs`; X was already extracted above.
y = hs.pop('SalePrice').values

# Fit on the full data set. cross_val_score below fits clones of `lr` on each
# training fold, so this fit does not influence the cross-validated scores.
lr = LinearRegression()
lr.fit(X, y)

# 5-fold cross-validation with seeded shuffling.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
# NOTE(review): root_mean_squared_log_error comes from the local `mymetrics`
# module; since it is passed as `scoring`, it presumably has the scorer
# signature (estimator, X, y) -- confirm against mymetrics.
cross_val_score(lr, X, y, cv=kf, scoring=root_mean_squared_log_error)

## Hyperparameter Tuning
* Use a decision tree (`DecisionTreeRegressor`)
* Default hyperparameter values
* More columns

### Select new hyperparameters

## Search many hyperparameter combinations with `GridSearchCV`
* parameter grid
* meta-estimator

### Three-step process for `GridSearchCV`

### Retrieving the results
* best parameters and best model (`best_params_`, `best_estimator_`)
* dataframe of results

## Summary

In [None]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, GridSearchCV
from mymetrics import root_mean_squared_log_error

# Fixed seed shared by the fold shuffling and the tree, so that the grid search
# selects the same hyperparameters on every Restart & Run All.
SEED = 42

# Load the housing sample and build the feature matrix / target vector.
hs = pd.read_csv('data/housing_sample.csv')
X = hs[['YearBuilt', 'GrLivArea', 'GarageArea']].values
# pop() also removes 'SalePrice' from `hs`; X was already extracted above.
y = hs.pop('SalePrice').values

# Seeded 5-fold splitter: every hyperparameter combination is scored on the
# same folds, and those folds are the same from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
# Seed the tree as well: scikit-learn permutes features at each split, so ties
# between equally good splits can otherwise be broken differently per run.
dtr = DecisionTreeRegressor(random_state=SEED)

# 9 depths x 5 split thresholds = 45 candidate combinations.
grid = {'max_depth': range(2, 11), 'min_samples_split': [5, 10, 20, 50, 100]}
# NOTE(review): GridSearchCV treats a higher score as better when choosing
# best_params_. root_mean_squared_log_error comes from the local `mymetrics`
# module -- confirm it returns a "greater is better" value (e.g. a negated
# error); otherwise the worst combination would be selected.
gs = GridSearchCV(estimator=dtr, param_grid=grid, cv=kf, scoring=root_mean_squared_log_error)
gs.fit(X, y)
gs.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results of the fitted search
# (`gs` is the GridSearchCV object fitted in the previous cell).
df_results = pd.DataFrame.from_dict(gs.cv_results_)

# Display the estimator refit with the best-scoring hyperparameters.
gs.best_estimator_

## Exercise
Practice using `GridSearchCV` to search different hyperparameter combinations for several different regressors.