# Cross Validation

In [None]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
# Load the used-car listings, then key the rows by the 'Id' column.
df = pd.read_csv('./used_cars.csv.gz')
df = df.set_index('Id')
# Preview the first few rows.
df.head()

In [None]:
# Hold out a test set (default: 25% of the rows). The seed is fixed so the
# split -- and every score computed from `train` -- is reproducible across
# notebook runs instead of changing each time this cell is executed.
train, test = train_test_split(df, random_state=42)

In [None]:
# Features: model year and mileage. Target: sale price.
X = train[['Year', 'Mileage']]
y = train['Price']

## Basic Cross-Validation

- `cross_val_score` fits the model once per split and gives us the score for every split

In [None]:
# 4-fold cross-validation of a plain linear regression; one score per fold.
lr = LinearRegression()
cross_val_score(estimator=lr, X=X, y=y, cv=4)  # r^2 by default

In [None]:
# Depth-4 regression tree: average the four per-fold scores into one number.
tree = DecisionTreeRegressor(max_depth=4)
cross_val_score(estimator=tree, X=X, y=y, cv=4).mean()

In [None]:
# Same comparison with a shallower (depth-2) tree.
tree = DecisionTreeRegressor(max_depth=2)
cross_val_score(estimator=tree, X=X, y=y, cv=4).mean()

In [None]:
# Score each fold with mean squared error instead of the default r^2.
from sklearn.metrics import mean_squared_error, make_scorer

# NOTE: make_scorer defaults to greater_is_better=True, so these are the raw
# (positive) MSE values -- fine for inspection, but remember lower is better.
cross_val_score(
    estimator=lr,
    X=X,
    y=y,
    cv=4,
    scoring=make_scorer(mean_squared_error),
)

## Grid Search

- an algorithm (the estimator to tune)
- a "grid" of params to search through (every combination is cross-validated)

In [None]:
# Unconfigured estimator; the grid search supplies the hyperparameters.
tree = DecisionTreeRegressor()

# Candidate values per hyperparameter: depths 1 through 5, both split strategies.
params = {}
params['max_depth'] = range(1, 6)
params['splitter'] = ['best', 'random']

In [None]:
# Cross-validate (4 folds) every combination in `params`; with no `scoring`
# argument the estimator's own score (r^2 for regressors) is used.
grid = GridSearchCV(estimator=tree, param_grid=params, cv=4)
# To select on mean squared error instead, use the negated metric: GridSearchCV
# always picks the HIGHEST score, so a plain make_scorer(mean_squared_error)
# would choose the worst model.
# grid = GridSearchCV(estimator=tree, param_grid=params, cv=4, scoring='neg_mean_squared_error')
grid.fit(X, y)

In [None]:
# Per-combination cross-validation results recorded by the fitted grid search.
results = grid.cv_results_

In [None]:
# One row per parameter combination, with its mean cross-validated test score.
# The table is built from a fresh DataFrame so that:
#   * the dicts inside grid.cv_results_['params'] are not mutated, and
#   * the global name `params` (the search grid) is not rebound by a loop
#     variable, which would break re-running the grid-search cell.
pd.DataFrame(results['params']).assign(score=results['mean_test_score'])