# Validation set approach

Import all the packages and functions needed

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

Import the data from the file Auto.csv and remove all rows that contain missing values.

In [None]:
# Load the Auto data, treating '?' as the missing-value marker, and drop
# every row that has at least one missing entry.
auto = pd.read_csv('./Data/Auto.csv', na_values='?').dropna()

We want to predict the mpg from the horsepower in the auto dataframe. In the simple linear regression task we already saw that a linear fit may not be the best choice.

a) Randomly split the data into a training and test set. 

In [None]:
# Seed NumPy's global RNG so the random split below is reproducible.
np.random.seed(0)

# Boolean mask with one entry per row of `auto`: True -> training set,
# False -> test set. Each row is drawn independently, so the two sets are
# only roughly equal in size. The length comes from the dataframe itself
# rather than a hard-coded row count, so the mask always matches the data
# even if the number of complete rows changes.
training = np.random.choice([False, True], size=len(auto))

# Response and design matrix (intercept + horsepower) for the training rows.
y_train = auto.mpg[training]
X_train = sm.add_constant(auto.horsepower[training])

# Same for the held-out rows (the complement of the mask).
y_test = auto.mpg[~training]
X_test = sm.add_constant(auto.horsepower[~training])

b) Fit a linear regression model on the training set and compute the MSE with the predictions on the validation set (test set).

In [None]:
# Ordinary least squares of mpg on the training design matrix.
linear_model = sm.OLS(y_train, X_train)
linear_results = linear_model.fit()

# Predict on the held-out rows and average the squared prediction errors
# to obtain the validation-set MSE.
y_predictions = linear_results.predict(X_test)
squared_errors = (y_predictions - y_test) ** 2
mse = np.mean(squared_errors)
print(f'Mean Squared Error: {mse:.4f}')

c) Fit a quadratic regression model on the training set and compute the MSE with the predictions on the validation set (test set).

In [None]:
# Horsepower values for the training rows and for the held-out rows.
hp_train = auto.horsepower[training]
hp_test = auto.horsepower[~training]

# Design matrices with columns [const, horsepower, horsepower^2].
# NOTE: this rebinds X_train / X_test, replacing the matrices built earlier.
X_train = sm.add_constant(np.column_stack((hp_train, hp_train ** 2)))
X_test = sm.add_constant(np.column_stack((hp_test, hp_test ** 2)))

quad_model = sm.OLS(y_train, X_train)
quad_results = quad_model.fit()

# Validation-set MSE of the quadratic fit.
y_predictions_quad = quad_results.predict(X_test)
squared_errors_quad = (y_predictions_quad - y_test) ** 2
mse_quad = np.mean(squared_errors_quad)
print(f'Mean Squared Error (Quadratic): {mse_quad:.4f}')

# Report which of the two models has the lower validation MSE
# (ties are reported in favour of the linear model).
print(
    "The quadratic model has a lower MSE than the linear model."
    if mse_quad < mse
    else "The linear model has a lower MSE than the quadratic model."
)

# Full regression tables: quadratic model first, then the linear model.
print(quad_results.summary())
print(linear_results.summary())

d) Fit a cubic regression model on the training set and compute the MSE with the predictions on the validation set (test set).

In [None]:
# Horsepower values for the training rows and for the held-out rows.
hp_train = auto.horsepower[training]
hp_test = auto.horsepower[~training]

# Design matrices with columns [const, horsepower, horsepower^2, horsepower^3].
# NOTE: this rebinds X_train / X_test, replacing the matrices built earlier.
X_train = sm.add_constant(
    np.column_stack((hp_train, hp_train ** 2, hp_train ** 3))
)
X_test = sm.add_constant(
    np.column_stack((hp_test, hp_test ** 2, hp_test ** 3))
)

model = sm.OLS(y_train, X_train)
cubic_results = model.fit()

# Validation-set MSE of the cubic fit.
y_predictions_cubic = cubic_results.predict(X_test)
squared_errors_cubic = (y_predictions_cubic - y_test) ** 2
mse_cubic = np.mean(squared_errors_cubic)
print(f'Mean Squared Error (Cubic): {mse_cubic:.4f}\n')

# The cubic model "wins" only if it is strictly better than both the
# quadratic and the linear model.
print(
    "The cubic model has the lowest MSE among all models."
    if mse_cubic < mse_quad and mse_cubic < mse
    else "The cubic model does not have the lowest MSE compared to the previous models."
)

# Full regression table of the cubic model.
print(cubic_results.summary())