In [6]:

import numpy as np
def train_models_and_calc_scores_for_n_fold_cv(
        estimator, x_NF, y_N, n_folds=3, random_state=0):
    ''' Perform n-fold cross validation for a specific sklearn estimator object

    For every fold, a fresh estimator with the same hyperparameters as the
    provided one is fit on that fold's train rows and scored on both the
    train rows and the held-out test rows. The provided estimator itself
    is never fit.

    Args
    ----
    estimator : any regressor object with sklearn-like API
        Supports 'fit', 'predict' and 'get_params' methods, and can be
        rebuilt by calling its class with the result of get_params().
    x_NF : 2D array-like, shape (n_examples, n_features) = (N, F)
        Input measurements ("features") for all examples of interest.
        Each row is a feature vector for one example.
    y_N : 1D array-like, shape (n_examples,)
        Output measurements ("responses") for all examples of interest.
        Each entry is a scalar response for one example.
    n_folds : int
        Number of folds to divide provided dataset into.
    random_state : int or numpy.RandomState instance
        Allows reproducible random splits.

    Returns
    -------
    train_error_per_fold : 1D numpy array, size n_folds, dtype float32
        One entry per fold
        Entry f gives the error computed for train set for fold f
    test_error_per_fold : 1D numpy array, size n_folds, dtype float32
        One entry per fold
        Entry f gives the error computed for test set for fold f

    Notes
    -----
    The error is whatever calc_root_mean_squared_error(y_true, y_pred)
    returns; that helper and make_train_and_test_row_ids_for_n_fold_cv
    must already be defined in the notebook.
    '''
    # Work on plain arrays so rows are always selected by *position*,
    # even if the caller hands in a pandas object with a custom index.
    x_NF = np.asarray(x_NF)
    y_N = np.asarray(y_N)
    N, _ = x_NF.shape  # unpacking also rejects inputs that are not 2D

    train_error_per_fold = np.zeros(n_folds, dtype=np.float32)
    test_error_per_fold = np.zeros(n_folds, dtype=np.float32)

    train_ids_per_fold, test_ids_per_fold = (
        make_train_and_test_row_ids_for_n_fold_cv(N, n_folds, random_state))

    for fold_id in range(n_folds):
        # Explicit integer dtype keeps indexing valid even for an empty fold
        train_ids = np.asarray(train_ids_per_fold[fold_id], dtype=np.intp)
        test_ids = np.asarray(test_ids_per_fold[fold_id], dtype=np.intp)

        # Select each fold's rows in one vectorized indexing step
        train_x, train_y = x_NF[train_ids], y_N[train_ids]
        test_x, test_y = x_NF[test_ids], y_N[test_ids]

        # Rebuild an unfitted copy so folds never share fitted state
        model = estimator.__class__(**estimator.get_params())
        model.fit(train_x, train_y)

        train_error_per_fold[fold_id] = calc_root_mean_squared_error(
            train_y, model.predict(train_x))
        test_error_per_fold[fold_id] = calc_root_mean_squared_error(
            test_y, model.predict(test_x))

    return train_error_per_fold, test_error_per_fold


def make_train_and_test_row_ids_for_n_fold_cv(
        n_examples=0, n_folds=3, random_state=0):
    ''' Divide row ids into train and test sets for n-fold cross validation.

    Will *shuffle* the row ids via a pseudorandom number generator before
    dividing into folds.

    Args
    ----
    n_examples : int
        Total number of examples to allocate into train/test sets
    n_folds : int
        Number of folds requested. Must be at least 1.
    random_state : int or numpy random generator object
        Pseudorandom number generator (or seed) for reproducibility.
        Any object with a 'permutation' method is used as-is (for example
        numpy.random.RandomState or numpy.random.Generator); anything
        else is converted to int and used to seed a new RandomState.

    Returns
    -------
    train_ids_per_fold : list of 1D np.arrays
        One entry per fold
        Each entry is a 1-dim numpy array of unique integers in 0 ... N-1
    test_ids_per_fold : list of 1D np.arrays
        One entry per fold
        Each entry is a 1-dim numpy array of unique integers in 0 ... N-1

    Raises
    ------
    ValueError
        If n_folds is less than 1.

    Guarantees for Return Values
    ----------------------------
    Across all folds, guarantee that no two folds put same object in test set.
    For each fold f, we need to guarantee:
    * The *union* of train_ids_per_fold[f] and test_ids_per_fold[f]
    is equal to [0, 1, ... N-1]
    * The *intersection* of the two is the empty set
    * The total size of train and test ids for any fold is equal to N

    Notes
    -----
    Fold sizes differ by at most one. If n_folds exceeds n_examples, some
    folds get an empty test set. With n_folds == 1 the single fold has an
    empty train set.
    '''
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1, got %r" % (n_folds,))

    if hasattr(random_state, 'permutation'):
        # Provided random_state is already a generator; use it directly.
        # Checking for 'permutation' (the only method needed here) accepts
        # both the legacy RandomState and the newer Generator API.
        prng = random_state
    else:
        # Handle case where we pass "seed" for a PRNG as an integer
        prng = np.random.RandomState(int(random_state))

    # Shuffle once, then cut the shuffled order into n_folds contiguous,
    # nearly equal-sized chunks; chunk f becomes the test set of fold f.
    row_ids = prng.permutation(n_examples)
    folds = np.array_split(row_ids, n_folds)

    train_ids_per_fold = list()
    test_ids_per_fold = list()
    for fold_id in range(n_folds):
        other_folds = folds[:fold_id] + folds[fold_id + 1:]
        if other_folds:
            train_ids = np.concatenate(other_folds)
        else:
            # Only one fold requested: nothing is left over for training.
            # (np.concatenate refuses an empty list, so build it directly.)
            train_ids = row_ids[:0].copy()
        # Copy so callers never receive views into one shared buffer
        test_ids_per_fold.append(folds[fold_id].copy())
        train_ids_per_fold.append(train_ids)

    return train_ids_per_fold, test_ids_per_fold

In [7]:
# Smoke check: split N=4 row ids into K=3 folds using the default seed
N, K = 4, 3
tr_ids_per_fold_K, te_ids_per_fold_K = make_train_and_test_row_ids_for_n_fold_cv(
    n_examples=N, n_folds=K)

# Each returned list should hold exactly K=3 entries (one per fold)
len(tr_ids_per_fold_K)

[]
[1]
[1]
[0]
[]
[2 3]
[2, 3]
[0]
[]
[2 3]
[2, 3]
[1]


3