In [1]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize ,
poly)
from sklearn.model_selection import train_test_split

In [2]:
from functools import partial
from sklearn.model_selection import \
(cross_validate ,
KFold ,
ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

### Validation Set Approach

In [3]:
# Goal: estimate the test error rate that results from fitting various linear
# models on the Auto data set, using a single train/validation split.
Auto = load_data('Auto')
# Hold out 196 observations for validation; fixing random_state makes the
# split reproducible.
Auto_train, Auto_valid = train_test_split(
    Auto,
    test_size=196,
    random_state=0,
)

In [4]:
# Fit the linear regression of mpg on horsepower using only the training
# portion of the split.
y_train = Auto_train['mpg']
# hp_mm is fitted here on the training data and reused later to transform
# the validation data.
hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()

In [5]:
# Score the fitted model on the held-out observations. The validation design
# matrix is produced by the already-fitted hp_mm (transform, not fit_transform).
y_valid = Auto_valid['mpg']
X_valid = hp_mm.transform(Auto_valid)
valid_pred = results.predict(X_valid)
# Validation-set mean squared error of the model
np.mean(np.square(y_valid - valid_pred))

23.61661706966988

In [6]:
# The validation error of higher-degree polynomial regressions can be
# estimated with the same recipe, so it is wrapped in a function.
def evalMSE(terms, response, train, test):
    """Fit an OLS model on `train` and return its mean squared error on `test`.

    Parameters
    ----------
    terms : passed unchanged to `MS` to build the model specification.
    response : key used to select the response from both `train` and `test`.
    train : data used to fit the model specification and the OLS model
        (presumably a pandas DataFrame -- confirm against callers).
    test : data the fitted model is evaluated on; it is transformed with the
        specification fitted on `train`.

    Returns
    -------
    The mean of the squared differences between the observed response in
    `test` and the model's predictions for `test`.
    """
    design = MS(terms)
    # Fit the specification on the training data only, then reuse it for test.
    train_matrix = design.fit_transform(train)
    fitted = sm.OLS(train[response], train_matrix).fit()
    test_matrix = design.transform(test)
    errors = test[response] - fitted.predict(test_matrix)
    return np.mean(errors**2)

In [7]:
# Validation MSE for polynomial fits of degree 1, 2 and 3 in horsepower,
# all evaluated on the current train/validation split.
MSE = np.zeros(3)
for idx in range(MSE.size):
    degree = idx + 1  # slot idx holds the result for polynomial degree idx + 1
    MSE[idx] = evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_valid)
MSE

array([23.61661707, 18.76303135, 18.79694163])

In [8]:
# Repeat the comparison on a different random split (random_state=3) to see
# how much the validation estimates depend on the particular split.
# NOTE: this rebinds Auto_train / Auto_valid, replacing the earlier split.
Auto_train, Auto_valid = train_test_split(
    Auto,
    test_size=196,
    random_state=3,
)
MSE = np.zeros(3)
for idx in range(MSE.size):
    degree = idx + 1  # polynomial degrees 1, 2, 3
    model_terms = [poly('horsepower', degree)]
    MSE[idx] = evalMSE(model_terms, 'mpg', Auto_train, Auto_valid)
del model_terms  # keep the notebook namespace the same as before this cell
MSE

array([20.75540796, 16.94510676, 16.97437833])

In [10]:
# sklearn_sm is a wrapper that lets a statsmodels model be used with sklearn's
# cross-validation tools. Its first argument is the statsmodels model; here
# the second argument is the model specification built with MS.
# NOTE(review): it is described as accepting two further optional arguments,
# model_str (to specify a formula) and model_args (a dictionary of additional
# arguments, e.g. a family argument) -- confirm the names against the
# installed ISLP version.

hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X = Auto.drop(columns=['mpg'])
Y = Auto['mpg']

# cross_validate takes an object with suitable fit/predict/score methods, the
# features X and the response Y. An integer cv=K requests K-fold
# cross-validation; with K equal to the number of rows, every fold holds out
# a single observation (leave-one-out cross-validation).
cv_results = cross_validate(hp_model, X, Y, cv=len(Auto))

# Average the per-fold test scores (presumably MSE for this wrapper -- confirm).
cv_err = cv_results['test_score'].mean()
cv_err

24.23151351792923

In [13]:
# Automate the procedure for increasingly complex polynomial fits.
# The `outer` method of a ufunc (here np.power) takes two arrays and builds a
# larger array in which the operation is applied to every pair of elements,
# one from each array. Column j of X is therefore H**j for j = 0, ..., d; the
# power-0 column is all ones.
# NOTE: the loop rebinds X (previously the feature DataFrame) to a NumPy array.

H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)  # no model specification: the design matrix is built by hand
cv_error = np.zeros(5)
for i in range(cv_error.size):
    d = i + 1  # polynomial degree 1..5
    X = np.power.outer(H, np.arange(d + 1))
    M_CV = cross_validate(M, X, Y, cv=len(Auto))
    cv_error[i] = M_CV['test_score'].mean()

cv_error

array([24.23151352, 19.24821312, 19.33498406, 19.42443034, 19.0332226 ])

In [12]:
# Small illustration of ufunc.outer: entry [i, j] of the result is A[i] + B[j],
# so a length-3 and a length-2 array give a 3 x 2 array.
A, B = np.array([3, 5, 9]), np.array([2, 4])
np.add.outer(A, B)

array([[ 5,  7],
       [ 7,  9],
       [11, 13]])

In [None]:
# Now we use KFold to partition 