* Understand the importance of cross-validation
* Learn how to perform cross-validation using the sklearn library
* Understand the various parameters involved in cross-validation

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [2]:
# Load the insurance dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='insurance.csv')
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# One-hot encode the categorical columns (sex, smoker, region). drop_first=True drops
# one dummy level per category so the remaining dummies are not redundant.
ohe = pd.get_dummies(data, drop_first=True)

# Target column name.
y = 'charges'

# Feature columns: everything except the target and 'Unnamed: 0'. The latter is just
# the row index that was written into the CSV; feeding it to a model lets the tree
# split on row position, which carries no real signal. errors='ignore' keeps this
# cell working if the CSV is later loaded without that column.
x = ohe.columns.drop([y, 'Unnamed: 0'], errors='ignore')
ohe.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [4]:
# First hold-out split: 20% of the rows go to the test set; the seed fixes which rows.
xtrain, xtest, ytrain, ytest = train_test_split(
    ohe[x],
    ohe[y],
    test_size=0.2,
    random_state=1,
)

In [5]:
# Fit a depth-limited regression tree on the training split and score it on the
# held-out split.
# random_state is pinned because the tree randomly permutes features at each split;
# without a seed, ties between equally good splits can be broken differently on every
# run, so the metrics below would not be reproducible.
dr = DecisionTreeRegressor(max_depth=5, random_state=0).fit(xtrain, ytrain)
ypred_test = dr.predict(xtest)

# RMSE is in the units of the target; R2 is the fraction of variance explained.
print('RMSE: %.2f' % np.sqrt(mean_squared_error(ytest, ypred_test)))
print('R2 Score : %.2f' % r2_score(ytest, ypred_test))

RMSE: 4603.85
R2 Score : 0.86


In [6]:
# Second hold-out split: same 80/20 proportions, but a different seed selects different rows.
xtrain, xtest, ytrain, ytest = train_test_split(
    ohe[x],
    ohe[y],
    test_size=0.2,
    random_state=3,
)

In [7]:
# Refit the same depth-limited tree on the current train/test split and report the
# same metrics, to show how much a single hold-out estimate depends on the split.
# random_state is pinned so the tree itself is deterministic; any change in the
# metrics then comes from the data split rather than from random tie-breaking
# inside the tree.
dr = DecisionTreeRegressor(max_depth=5, random_state=0).fit(xtrain, ytrain)
ypred_test = dr.predict(xtest)

# RMSE is in the units of the target; R2 is the fraction of variance explained.
print('RMSE: %.2f' % np.sqrt(mean_squared_error(ytest, ypred_test)))
print('R2 Score : %.2f' % r2_score(ytest, ypred_test))

RMSE: 4435.08
R2 Score : 0.87


# Cross-validation - calculating the score

In [9]:
# Unfitted estimator to be cross-validated. random_state is pinned so that repeated
# cross-validation runs with this estimator give identical fold scores; an unseeded
# tree can break ties between equally good splits differently on each fit.
dr = DecisionTreeRegressor(max_depth=5, random_state=0)

# 5-fold cross-validation on the training data; one score per fold is returned.
# With no `scoring` argument the estimator's own score method is used (R2 for regressors).
scores = cross_val_score(dr, xtrain, ytrain, cv=5)
scores

array([0.88185716, 0.8400697 , 0.82149264, 0.84015368, 0.81233432])

* The scores above are R² values (the default score for a regressor), one per fold

In [10]:
# Summarise the per-fold scores: the mean is the expected R2, and two standard
# deviations on either side give a rough range for it.
avg_score = np.mean(scores)
sqr_std = 2 * np.std(scores)  # despite the name, this is twice the std, not a squared value

print('Expected R Square: %.2f (+/- %.2f)' % (avg_score, sqr_std))
print('Range for R Square: %.2f - %.2f' % (avg_score - sqr_std, avg_score + sqr_std))

Expected R Square: 0.84 (+/- 0.05)
Range for R Square: 0.79 - 0.89


In [11]:
# cross_validate returns a dict of per-fold arrays rather than only the scores:
#   fit_time   - seconds spent training on each fold
#   score_time - seconds spent evaluating on each fold
#   test_score - score on the held-out part of each fold
cv_results = cross_validate(estimator=dr, X=xtrain, y=ytrain, cv=5)
cv_results

{'fit_time': array([0.00396395, 0.00348687, 0.00323391, 0.0028739 , 0.00301003]),
 'score_time': array([0.00230908, 0.00180912, 0.00169897, 0.00169706, 0.00152683]),
 'test_score': array([0.88224587, 0.8400697 , 0.82149264, 0.84015368, 0.81233432])}

# Also display the train scores

In [12]:
# Same 5-fold run, but also request the score on the training part of each fold;
# the result gains a 'train_score' array alongside the timings and 'test_score'.
cv_results = cross_validate(
    estimator=dr,
    X=xtrain,
    y=ytrain,
    cv=5,
    return_train_score=True,
)
cv_results

{'fit_time': array([0.00419784, 0.00326014, 0.00300121, 0.00301504, 0.00285006]),
 'score_time': array([0.00248027, 0.00194502, 0.00180674, 0.00153208, 0.00155711]),
 'test_score': array([0.88224587, 0.8400697 , 0.82149264, 0.84015368, 0.81233432]),
 'train_score': array([0.87668359, 0.8861341 , 0.8897761 , 0.88386519, 0.887722  ])}