### Preprocessing

In [0]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [0]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Load the Auto dataset through Spark (header row supplies the column names)
# and bring it into pandas for the rest of the analysis.
url = "abfss://training@sa8451learningdev.dfs.core.windows.net/interpretable_machine_learning/eml_data/Auto.csv"
df = (
    spark.read
    .option("header", "true")
    .csv(url)
    .toPandas()
)

# The car name is the only text column; every remaining column is cast to float.
str_cols = ["name"]
num_cols = list(set(df.columns).difference(str_cols))
df[str_cols] = df[str_cols].astype(str)
df[num_cols] = df[num_cols].astype(float)

In [0]:
# Preview the first rows of the converted frame
df.head()

### Using simple linear regression

In [0]:
# Predictor is kept as a one-column DataFrame (double brackets); response is a Series
X = df[['horsepower']]
y = df['mpg']

In [0]:
# Validation-set approach: a single random 50/50 split; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

In [0]:
from sklearn.linear_model import LinearRegression

In [0]:
# Fit mpg ~ horsepower on the training half only
lmfit = LinearRegression().fit(X_train, y_train)

In [0]:
# Predict mpg for the held-out half
lmpred = lmfit.predict(X_test)

In [0]:
from sklearn.metrics import mean_squared_error

In [0]:
# Mean squared error on the held-out half
MSE = mean_squared_error(y_test, lmpred)

In [0]:
# Show the held-out MSE rounded to two decimals
round(MSE, 2)

### Using Leave-One-Out-Cross-Validation (LOOCV)

In [0]:
from sklearn.model_selection import LeaveOneOut

In [0]:
# Re-declare predictor and response on the full dataset for the LOOCV section
X = df[['horsepower']]
y = df['mpg']

In [0]:
# Splitter that holds out one observation per fold
loo = LeaveOneOut()

In [0]:
# Number of train/test splits the splitter yields for X (used later to average the fold errors)
total_sets = loo.get_n_splits(X)

In [0]:
from sklearn.linear_model import LinearRegression

In [0]:
from sklearn.metrics import mean_squared_error

In [0]:
# Accumulator for the sum of the per-fold errors
MSE = 0

In [0]:
# Leave-one-out CV for mpg ~ horsepower: fit on every row but one, score on the
# held-out row, and sum the fold errors into MSE.
# The accumulator is (re)initialised in this cell so that re-running the cell on
# its own does not keep adding to the total left over from a previous run.
MSE = 0
for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lmfit = LinearRegression().fit(X_train, y_train)
    lmpred = lmfit.predict(X_test)
    # The test fold holds a single row, so this adds that row's squared error.
    MSE += mean_squared_error(y_test, lmpred)

In [0]:
# Summed fold errors (not yet averaged over the folds)
MSE

In [0]:
# Average the summed fold errors over the number of folds
MSE_mean = MSE/total_sets

In [0]:
# Show the LOOCV MSE rounded to two decimals
round(MSE_mean, 2)

**As we can see, LOOCV gives an MSE similar to the one obtained above from a single 50/50 train/test split (the validation-set approach) of the same simple linear regression. Therefore, in the absence of a separate test dataset, we can resample the existing dataset through LOOCV to obtain a comparable estimate of the model's test error.**

### Using Leave-One-Out-Cross-Validation (LOOCV) for polynomial regressions (order: 1-5)

In [0]:
from sklearn.preprocessing import PolynomialFeatures as PF

In [0]:
# Re-declare predictor and response for the polynomial section
X = df[['horsepower']]
y = df['mpg']

In [0]:
# Splitter that holds out one observation per fold
loo = LeaveOneOut()

In [0]:
# Number of train/test splits the splitter yields for X
total_sets = loo.get_n_splits(X)

In [0]:
# Start from an empty results table; it ends up holding one averaged LOOCV MSE per polynomial degree
MSE_all = pd.DataFrame()

In [0]:
# LOOCV error of polynomial fits (degree 1 through 5) of mpg on horsepower.
# Predictor and response do not change between degrees, so they are built once.
X = df[['horsepower']]
y = df[['mpg']]

# One averaged MSE per degree, in degree order. The values are gathered in a
# plain list and turned into a DataFrame once at the end: DataFrame.append was
# removed in pandas 2.0, and appending to a frame created in another cell also
# piled up duplicate rows whenever this cell was re-run.
mse_by_degree = []
for i in range(1, 6):
    # include_bias=False omits the constant column of ones; LinearRegression
    # fits its own intercept, so that column is not needed.
    X_ = pd.DataFrame(PF(i, include_bias=False).fit_transform(X))
    MSE = 0
    for train_index, test_index in loo.split(X_):
        X_train, X_test = X_.iloc[train_index], X_.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        lmfit = LinearRegression().fit(X_train, y_train)
        lmpred = lmfit.predict(X_test)
        # The test fold holds a single row, so this adds that row's squared error.
        MSE += mean_squared_error(y_test, lmpred)
    # Average over the folds, counted from the data actually split in this cell.
    MSE_mean = MSE / loo.get_n_splits(X_)
    mse_by_degree.append(MSE_mean)

# Single unnamed column (label 0), one row per degree.
MSE_all = pd.DataFrame(mse_by_degree)

In [0]:
# Name the single result column. A flat list is used on purpose: the nested
# list [['MSE']] makes pandas build a one-level MultiIndex instead of a plain
# Index of column names.
MSE_all.columns = ['MSE']
# Renumber the rows 0..n-1, in the order the degrees were evaluated.
MSE_all = MSE_all.reset_index(drop=True)
# Display the table rounded to two decimals.
MSE_all.round(2)

**As we can see, there is a sharp drop in MSE between the linear and quadratic regressions. The remaining higher-order regressions have similar MSEs.**