In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from tabulate import tabulate

filename = 'sd-data_cleanedv1.csv'
data = pd.read_csv(filename)

cl_data = data.values

# Getting data and labels separated
X = cl_data[:, :-1]
y = cl_data[:, -1]

In [24]:
from sklearn.model_selection import train_test_split
# Getting only 20% off data for efficiency
X, X_throw_away, y, y_throw_away = train_test_split(X, y, test_size=0.80, random_state=42)


In [25]:
# Define parameters
#number_of_neighbors = range(55,58) # trail and error
number_of_neighbors = np.linspace(50, 80, 6)


K_outer = 10  # Number of outer folds
K_inner = 5  # Number of inner folds

# Getting random splits (train and test 90/10) - for outer cross-validation loop
# We will now be able to use the split() method to generate indices to split data into training and test set.
outer_cv = KFold(n_splits=K_outer, shuffle=True)

# Initialize variable - this will contain the test error of the best model when tested on D_test (outer)
final_preformance_for_k_outer = np.empty((2, K_outer))

# Looping over the outer splits
for i_outer, (train_outer_index, test_outer_index) in enumerate(outer_cv.split(X)):
    print(f'Outer Fold {i_outer + 1}/{K_outer}')

    # train_outer_index and test_outer_index is a list of indicies as the name suggests.
    # we now assign the data, in allignment with the splits, to varaibles
    X_train_outer, X_test_outer = X[train_outer_index, :], X[test_outer_index, :]
    y_train_outer, y_test_outer = y[train_outer_index], y[test_outer_index]

    # Inner splits
    inner_cv = KFold(n_splits=K_inner, shuffle=True)
    # to keep track of all validation errors
    # rows corresponds to the i'th inner split. Cols is the different models
    error_inner = np.empty((K_inner, len(number_of_neighbors)))

    for i_inner, (train_inner_index, val_inner_index) in tqdm(enumerate(inner_cv.split(X_train_outer)), total = K_inner):
        X_train_inner, X_val_inner = X_train_outer[train_inner_index, :], X_train_outer[val_inner_index, :]
        y_train_inner, y_val_inner = y_train_outer[train_inner_index], y_train_outer[val_inner_index]

        for k in range(len(number_of_neighbors)):
            dtc = KNeighborsClassifier(n_neighbors=int(number_of_neighbors[k]))
            dtc = dtc.fit(X_train_inner, y_train_inner)
            y_val_pred_inner = dtc.predict(X_val_inner)

            # Training model k on the i'th inner split here
            error_inner[i_inner, k] = np.sum(y_val_pred_inner != y_val_inner) / len(y_val_inner)
    
    # computing the avg error for all models on the i'th inner split
    avg_inner = np.mean(error_inner, axis=0)

    # Training best model on train_outer 
    dtc = KNeighborsClassifier(n_neighbors=int(number_of_neighbors[np.argmin(avg_inner)]))
    dtc = dtc.fit(X_train_outer, y_train_outer)
    y_test_pred_outer = dtc.predict(X_test_outer)
    # Getting the test error of the best model when tested on D_test (outer)
    final_preformance_for_k_outer[0, i_outer] = np.sum(y_test_pred_outer != y_test_outer) / len(y_test_outer)
    final_preformance_for_k_outer[1, i_outer] = number_of_neighbors[np.argmin(avg_inner)] # Also saving num of neighbors for the best model

################## output
list1 = ['///', 'Outer_1', 'Outer_2', 'Outer_3', 'Outer_4', 'Outer_5', 'Outer_6', 'Outer_7', 'Outer_8', 'Outer_9', 'Outer_10']

table_data = [
    ['E_i'] + [final_preformance_for_k_outer[0, i] for i in range(K_outer)], # +1
    ['Parameter'] + list(final_preformance_for_k_outer[1, :])
]

# Printing all test errors along with the num of neighbors - based on best model for each outer loop
print(tabulate(table_data, headers=list1, tablefmt='orgtbl'))
# printing generalization error
print(f'E_gen = {(1-float(np.mean(final_preformance_for_k_outer[0, :], axis=0)))*100}%')



Outer Fold 1/10


100%|██████████| 5/5 [01:19<00:00, 15.91s/it]


Outer Fold 2/10


100%|██████████| 5/5 [01:20<00:00, 16.02s/it]


Outer Fold 3/10


100%|██████████| 5/5 [01:18<00:00, 15.73s/it]


Outer Fold 4/10


100%|██████████| 5/5 [01:18<00:00, 15.70s/it]


Outer Fold 5/10


100%|██████████| 5/5 [01:17<00:00, 15.43s/it]


Outer Fold 6/10


100%|██████████| 5/5 [01:17<00:00, 15.55s/it]


Outer Fold 7/10


100%|██████████| 5/5 [01:17<00:00, 15.49s/it]


Outer Fold 8/10


100%|██████████| 5/5 [01:17<00:00, 15.56s/it]


Outer Fold 9/10


100%|██████████| 5/5 [01:16<00:00, 15.22s/it]


Outer Fold 10/10


100%|██████████| 5/5 [01:17<00:00, 15.46s/it]


| ///       |   Outer_1 |   Outer_2 |   Outer_3 |   Outer_4 |   Outer_5 |   Outer_6 |   Outer_7 |   Outer_8 |   Outer_9 |   Outer_10 |
|-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------|
| E_i       |  0.284122 |  0.277565 |  0.278422 |  0.275396 |  0.285649 |  0.284288 |  0.276974 |  0.286507 |  0.285195 |   0.285246 |
| Parameter | 68        | 80        | 74        | 80        | 74        | 80        | 80        | 74        | 62        |  80        |
E_gen = 71.80636343248048%
