# Validation set approach

Import all the packages and functions needed

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

Import the data from the file Auto.csv and remove all rows that contain missing values.

In [3]:
# Read the Auto data set, treating '?' entries as missing, and keep only
# the rows in which every field is present.
auto = pd.read_csv('./Data/Auto.csv', na_values='?').dropna()

We want to predict mpg from horsepower in the auto dataframe. In the simple linear regression task we already saw that a linear fit may not be the best choice.

a) Randomly split the data into a training and test set. 

In [12]:
# Fix the seed so the random split (and every MSE derived from it) is
# reproducible across kernel restarts.
np.random.seed(0)

# Boolean mask with one entry per row of `auto`: True -> training row,
# False -> validation row. The length is taken from the frame itself rather
# than hard-coded, so the mask always matches the data after dropna().
training = np.random.choice([False, True], size=len(auto))

# Response and design matrix (constant + horsepower) for the training rows.
y_train = auto.mpg[training]
X_train = sm.add_constant(auto.horsepower[training])

# Same for the held-out validation rows.
y_test = auto.mpg[~training]
X_test = sm.add_constant(auto.horsepower[~training])

# Sanity check: the two subsets partition the data.
assert len(y_train) + len(y_test) == len(auto)

b) Fit a linear regression model on the training set and compute the MSE with the predictions on the validation set (test set).

In [15]:
# Ordinary least squares of mpg on the training design matrix.
linear_model = sm.OLS(y_train, X_train)
linear_results = linear_model.fit()

# Validation-set MSE: mean of the squared differences between the
# predictions for the held-out rows and their observed mpg values.
y_predictions = linear_results.predict(X_test)
mse = np.mean(np.square(y_predictions - y_test))
print(f'Mean Squared Error: {mse:.4f}')

Mean Squared Error: 20.8574


c) Fit a quadratic regression model on the training set and compute the MSE with the predictions on the validation set (test set).

In [17]:
# Design matrices for the quadratic fit: [const, horsepower, horsepower^2].
# They get their own names so that the X_train / X_test read by the linear
# cell are not overwritten; otherwise re-running that cell after this one
# would silently fit the quadratic design under the "linear" names.
hp_train = auto.horsepower[training]
hp_test = auto.horsepower[~training]
X_train_quad = sm.add_constant(np.column_stack((hp_train, hp_train ** 2)))
X_test_quad = sm.add_constant(np.column_stack((hp_test, hp_test ** 2)))

# Fit on the training rows only.
quad_model = sm.OLS(y_train, X_train_quad)
quad_results = quad_model.fit()

# Validation-set MSE of the quadratic model.
y_predictions_quad = quad_results.predict(X_test_quad)
mse_quad = np.mean((y_predictions_quad - y_test) ** 2)
print(f'Mean Squared Error (Quadratic): {mse_quad:.4f}')

# Compare the two models on validation MSE (`mse` comes from the linear cell).
if mse_quad < mse:
    print("The quadratic model has a lower MSE than the linear model.")
else:
    print("The linear model has a lower MSE than the quadratic model.")

# Full regression summaries: quadratic first, then linear.
print(quad_results.summary())
print(linear_results.summary())

# Estimated coefficients.
print("Linear Model Coefficients:")
print(linear_results.params)
print("Quadratic Model Coefficients:")
print(quad_results.params)

# Training-set R-squared of both models.
print(f"Linear Model R-squared: {linear_results.rsquared:.4f}")
print(f"Quadratic Model R-squared: {quad_results.rsquared:.4f}")

# P-values of the coefficients.
print("Linear Model P-values:")
print(linear_results.pvalues)
print("Quadratic Model P-values:")
print(quad_results.pvalues)

# Confidence intervals of the coefficients.
print("Linear Model Confidence Intervals:")
print(linear_results.conf_int())
print("Quadratic Model Confidence Intervals:")
print(quad_results.conf_int())

# Training residuals of both models.
print("Linear Model Residuals:")
print(linear_results.resid)
print("Quadratic Model Residuals:")
print(quad_results.resid)

Mean Squared Error (Quadratic): 16.4569
The quadratic model has a lower MSE than the linear model.
                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.670
Method:                 Least Squares   F-statistic:                     209.4
Date:                Thu, 12 Jun 2025   Prob (F-statistic):           4.47e-50
Time:                        17:49:02   Log-Likelihood:                -608.65
No. Observations:                 206   AIC:                             1223.
Df Residuals:                     203   BIC:                             1233.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------

d) Fit a cubic regression model on the training set and compute the MSE with the predictions on the validation set (test set).

In [19]:
# Design matrices for the cubic fit:
# [const, horsepower, horsepower^2, horsepower^3].
# Dedicated names keep the X_train / X_test read by the linear cell intact,
# so re-running earlier cells cannot pick up the cubic design by accident.
hp_train = auto.horsepower[training]
hp_test = auto.horsepower[~training]
X_train_cubic = sm.add_constant(
    np.column_stack((hp_train, hp_train ** 2, hp_train ** 3))
)
X_test_cubic = sm.add_constant(
    np.column_stack((hp_test, hp_test ** 2, hp_test ** 3))
)

# Fit on the training rows only.
model = sm.OLS(y_train, X_train_cubic)
cubic_results = model.fit()

# Validation-set MSE of the cubic model.
y_predictions_cubic = cubic_results.predict(X_test_cubic)
mse_cubic = np.mean((y_predictions_cubic - y_test) ** 2)
print(f'Mean Squared Error (Cubic): {mse_cubic:.4f}\n')

# Compare against the linear (`mse`) and quadratic (`mse_quad`) validation
# errors computed in the earlier cells.
if mse_cubic < mse_quad and mse_cubic < mse:
    print("The cubic model has the lowest MSE among all models.")
else:
    print("The cubic model does not have the lowest MSE compared to the previous models.")

# Full regression summary of the cubic model.
print(cubic_results.summary())

Mean Squared Error (Cubic): 16.4427

The cubic model has the lowest MSE among all models.
                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.674
Model:                            OLS   Adj. R-squared:                  0.669
Method:                 Least Squares   F-statistic:                     139.4
Date:                Thu, 12 Jun 2025   Prob (F-statistic):           5.95e-49
Time:                        17:51:30   Log-Likelihood:                -608.44
No. Observations:                 206   AIC:                             1225.
Df Residuals:                     202   BIC:                             1238.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------