# Lab: Cross Validation and Bootstrap

In [1]:
import numpy as np 
import pandas as pd 

from matplotlib.pyplot import subplots 
from plotnine import *

import statsmodels.api as sm

from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)

from sklearn.model_selection import train_test_split

from functools import partial
from sklearn.model_selection import ( \
    cross_validate,
    KFold,
    ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

In [16]:
# Seed passed to train_test_split below so that the train/validation split is reproducible.
random_state=3

In [17]:
# Load the Auto data set.
Auto = load_data('Auto')
# Split the data into training and validation sets.
# There are 392 observations, so test_size=196 puts half of them in the validation set.
Auto_train, Auto_valid = train_test_split(Auto, test_size=196, random_state=random_state)

In [3]:
# Display the first rows of the data set.
Auto.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,70,1
buick skylark 320,15.0,8,350.0,165,3693,11.5,70,1
plymouth satellite,18.0,8,318.0,150,3436,11.0,70,1
amc rebel sst,16.0,8,304.0,150,3433,12.0,70,1
ford torino,17.0,8,302.0,140,3449,10.5,70,1


We fit a linear regression using only the observations in the training set `Auto_train`.

In [5]:
# Model specification with horsepower as the only predictor.
hp_mm = MS(['horsepower'])
# Fit the specification on the training data and build the training model matrix.
X_train = hp_mm.fit_transform(Auto_train)
# Response column for the training rows.
y_train = Auto_train['mpg']
# Ordinary least squares regression of mpg on the training model matrix.
model = sm.OLS(y_train, X_train)
results = model.fit()

In [7]:
# Show the regression summary tables for the fit on the training data.
results.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.608
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,300.4
Date:,"Tue, 22 Oct 2024",Prob (F-statistic):,2.83e-41
Time:,10:42:50,Log-Likelihood:,-590.83
No. Observations:,196,AIC:,1186.0
Df Residuals:,194,BIC:,1192.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,39.9055,1.009,39.537,0.000,37.915,41.896
horsepower,-0.1563,0.009,-17.333,0.000,-0.174,-0.139

0,1,2,3
Omnibus:,7.263,Durbin-Watson:,2.175
Prob(Omnibus):,0.026,Jarque-Bera (JB):,6.993
Skew:,0.44,Prob(JB):,0.0303
Kurtosis:,3.286,Cond. No.,319.0


Now we use the `predict()` method of `results`, evaluated on the model matrix built from the validation data set, to calculate the validation MSE of our model.

In [10]:
# Build the validation model matrix with the specification fitted on the training data.
X_valid = hp_mm.transform(Auto_valid)
y_valid = Auto_valid['mpg']
# Compute the predictions from the X_valid model matrix.
valid_pred = results.predict(X_valid)
# Validation MSE: mean of the squared prediction errors.
np.mean((y_valid - valid_pred)**2)

23.61661706966988

Let's estimate the validation MSE for higher-degree polynomial regressions. We write a function `evalMSE()` that takes the model terms, the name of the response, a training set and a test set, fits the model on the training set, and returns the MSE on the test set.

In [11]:
def evalMSE(terms, response, train, test):
    '''
    Fit an OLS regression on `train` and return its mean squared error on `test`.

    Parameters:
    ----------
    terms : list
        Model terms handed to `MS` to build the model matrix,
        e.g. ``[poly('horsepower', 2)]``.

    response : str
        Label of the response column; it is looked up in both
        `train` and `test`.

    train : dataFrame
        Data used to fit the model specification and the regression.

    test : dataFrame
        Data on which the fitted regression is evaluated.

    Returns:
    --------
    mse : float
        Mean of the squared differences between the response observed in
        `test` and the predictions of the regression fitted on `train`.
    '''

    # The specification is fitted on the training data only and then reused,
    # unchanged, to build the test model matrix.
    design = MS(terms)
    train_matrix = design.fit_transform(train)
    test_matrix = design.transform(test)

    fitted = sm.OLS(train[response], train_matrix).fit()
    errors = test[response] - fitted.predict(test_matrix)

    return np.mean(errors**2)

Let's apply this function to linear, quadratic and cubic fits.

In [20]:
# Validation MSE of polynomial fits in horsepower of degree 1, 2 and 3.
degrees = range(1, 4)
MSE = np.zeros(len(degrees))
for idx, degree in enumerate(degrees):
    MSE[idx] = evalMSE(
        [poly('horsepower', degree)],
        'mpg',
        Auto_train,
        Auto_valid,
    )
MSE

array([20.75540796, 16.94510676, 16.97437833])

#### Interpretation
After changing the random state, the results are consistent with our previous findings:
a model that predicts `mpg` using a quadratic function of `horsepower` performs better than a model that involves only a linear function of `horsepower`, and there is no evidence of an improvement from using a cubic function of `horsepower`.

## Cross Validation