In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import linear_model
from tqdm import tqdm
from tabulate import tabulate
from sklearn.metrics import mean_squared_error


# Read the cleaned dataset from disk; keep the DataFrame and its raw array form.
filename = 'sd-data_cleanedv1.csv'
data = pd.read_csv(filename)
cl_data = data.values

# Separate the labels (column 21) from the data (every remaining column).
label_col = 21
y = cl_data[:, label_col]
X = np.delete(cl_data, label_col, axis=1)

In [28]:
# Nested (two-level) cross-validation for Ridge regression.
# The inner loop selects the regularization strength with the lowest mean
# validation MSE; the outer loop measures that selected model on held-out data.

# Candidate regularization strengths: 10 values log-spaced from 1e-5 to 1e5
regularization_parameters = np.logspace(-5, 5, 10)

K_outer = 10  # Number of outer folds
K_inner = 5  # Number of inner folds

# Fixed seed so the shuffled folds, and therefore the reported errors, are
# identical on every Restart & Run All.
RANDOM_SEED = 42

# Outer splitter: each of the K_outer folds serves once as the test set while
# the remaining folds form the training set.
outer_cv = KFold(n_splits=K_outer, shuffle=True, random_state=RANDOM_SEED)

# Row 0: test MSE of the selected model on each outer test fold.
# Row 1: regularization parameter selected for that outer fold.
final_preformance_for_k_outer = np.empty((2, K_outer))

# NOTE(review): Ridge penalizes all coefficients with the same alpha, so this
# assumes the columns of X are on comparable scales -- confirm the cleaned CSV
# is already standardized.
for i_outer, (train_outer_index, test_outer_index) in enumerate(outer_cv.split(X)):
    print(f'Outer Fold {i_outer + 1}/{K_outer}')

    # The splitter yields index arrays; use them to slice out this fold's data.
    X_train_outer, X_test_outer = X[train_outer_index, :], X[test_outer_index, :]
    y_train_outer, y_test_outer = y[train_outer_index], y[test_outer_index]

    # Inner splitter works on the outer training data only, so the outer test
    # fold never takes part in choosing the regularization parameter.
    inner_cv = KFold(n_splits=K_inner, shuffle=True, random_state=RANDOM_SEED)
    # Validation MSE per (inner fold, candidate): rows are inner folds,
    # columns follow the order of regularization_parameters.
    error_inner = np.empty((K_inner, len(regularization_parameters)))

    for i_inner, (train_inner_index, val_inner_index) in tqdm(enumerate(inner_cv.split(X_train_outer)), total=K_inner):
        X_train_inner, X_val_inner = X_train_outer[train_inner_index, :], X_train_outer[val_inner_index, :]
        y_train_inner, y_val_inner = y_train_outer[train_inner_index], y_train_outer[val_inner_index]

        for k, alpha in enumerate(regularization_parameters):
            ridge_inner = linear_model.Ridge(alpha=alpha).fit(X_train_inner, y_train_inner)

            y_pred_inner = ridge_inner.predict(X_val_inner)
            mse_inner = mean_squared_error(y_val_inner, y_pred_inner)
            error_inner[i_inner, k] = mse_inner

    # Mean validation error of each candidate across the inner folds
    avg_inner = np.mean(error_inner, axis=0)
    best_alpha = regularization_parameters[np.argmin(avg_inner)]

    # Refit the selected candidate on the full outer training set.
    # (The name `dtc` is kept so any later cell that reads it still works.)
    dtc = linear_model.Ridge(alpha=best_alpha)
    dtc = dtc.fit(X_train_outer, y_train_outer)
    # Test error of the selected model on the outer test fold
    y_pred_outer = dtc.predict(X_test_outer)

    mse_outer = mean_squared_error(y_test_outer, y_pred_outer)

    final_preformance_for_k_outer[0, i_outer] = mse_outer
    final_preformance_for_k_outer[1, i_outer] = best_alpha  # selected regularization parameter

################## output
# One header per outer fold, derived from K_outer so the table stays aligned
# when the number of folds changes. The row-label column has no header.
list1 = [f'Outer_{i + 1}' for i in range(K_outer)]

table_data = [
    ['E_i'] + list(final_preformance_for_k_outer[0, :]),
    ['Parameter'] + list(final_preformance_for_k_outer[1, :])
]

# Per-fold test errors together with the regularization parameter selected in each outer fold
print(tabulate(table_data, headers=list1, tablefmt='orgtbl'))
# Estimated generalization error: mean of the outer-fold test errors
print(f'E_gen = {np.mean(final_preformance_for_k_outer[0, :], axis=0)}')

Outer Fold 1/10


100%|██████████| 5/5 [00:06<00:00,  1.23s/it]


Outer Fold 2/10


100%|██████████| 5/5 [00:06<00:00,  1.23s/it]


Outer Fold 3/10


100%|██████████| 5/5 [00:06<00:00,  1.31s/it]


Outer Fold 4/10


100%|██████████| 5/5 [00:06<00:00,  1.27s/it]


Outer Fold 5/10


100%|██████████| 5/5 [00:06<00:00,  1.25s/it]


Outer Fold 6/10


100%|██████████| 5/5 [00:06<00:00,  1.24s/it]


Outer Fold 7/10


100%|██████████| 5/5 [00:06<00:00,  1.24s/it]


Outer Fold 8/10


100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


Outer Fold 9/10


100%|██████████| 5/5 [00:06<00:00,  1.28s/it]


Outer Fold 10/10


100%|██████████| 5/5 [00:06<00:00,  1.27s/it]

|           |   Outer_1 |   Outer_2 |   Outer_3 |   Outer_4 |   Outer_5 |   Outer_6 |   Outer_7 |   Outer_8 |   Outer_9 |   Outer_10 |
|-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------|
| E_i       |   0.48964 |  0.486367 |  0.487893 |  0.484257 |  0.484351 |  0.492499 |  0.494081 |  0.489033 |  0.496159 |   0.486735 |
| Parameter |  46.4159  | 46.4159   | 46.4159   | 46.4159   | 46.4159   | 46.4159   | 46.4159   | 46.4159   | 46.4159   | 599.484    |
E_gen = 0.4891014886052113



