In [1]:
import numpy as np
import pandas as pd

class TestSpec:
    '''Class that defines specifications for inter-model tests.

    The sorted ``start_dates`` split ``indices`` into consecutive test
    periods: test ``t`` covers ``[start_dates[t], start_dates[t+1])`` and the
    last one runs to the end of ``indices``.  One ``TestInstance`` is built
    per period and stored, in chronological order, in ``self.test``.

    Parameters
    ----------
    indices : pandas.DatetimeIndex
        All available time stamps (sorted internally).
    start_dates : list
        Start of each test period, as values comparable with ``indices``
        (e.g. date strings).  The caller's list is not modified.
    window_size : int
        Number of most recent samples kept in the sliding-window sets.

    Raises
    ------
    TypeError
        If an argument has the wrong type.
    ValueError
        If ``window_size`` is not strictly positive.
    '''

    def __init__(self, indices, start_dates, window_size):
        # Argument validation
        if not isinstance(indices, pd.DatetimeIndex):
            raise TypeError('indices param must be of type pandas.DatetimeIndex')

        if not isinstance(start_dates, list):
            raise TypeError('start_dates param must be a list of strings')

        if not isinstance(window_size, (int, np.integer)):
            raise TypeError('window_size param must be an integer')

        # A zero/negative size would turn the [-window_size:] slice into
        # "everything" or "drop the oldest samples", which is never intended.
        if window_size <= 0:
            raise ValueError('window_size param must be a positive integer')

        # Work on sorted copies so the caller's objects are left untouched
        start_dates = sorted(start_dates)
        indices = indices.sort_values()

        # List of test instances (test set + CV sets + FV sets)
        self.test = []
        for start, end in zip(start_dates[:-1], start_dates[1:]):
            self.test.append(self.TestInstance(indices, start, end, window_size))
        # The last test period is open-ended
        self.test.append(self.TestInstance(indices, start_dates[-1], None, window_size))

    class TestInstance:
        '''One test period together with its train/validation splits.

        Attributes
        ----------
        test_set : indices in ``[start_date, end_date)`` (open-ended when
            ``end_date`` is None).
        expanding_window_fv / expanding_window_cv : forward- and
            cross-validation splits built from every index before
            ``start_date``.
        sliding_window_fv / sliding_window_cv : the same splits built from
            only the last ``window_size`` indices before ``start_date``.
        '''

        def __init__(self, indices, start_date, end_date, window_size=None):
            # window_size is an explicit argument: it used to be read from a
            # global variable, which only worked when the notebook happened to
            # define one.  None means "no limit" (sliding == expanding).
            test_indices = indices[indices >= start_date]
            if end_date is not None:
                test_indices = test_indices[test_indices < end_date]

            train_val_indices_expanding = indices[indices < start_date]
            if window_size is None:
                train_val_indices_sliding = train_val_indices_expanding
            else:
                train_val_indices_sliding = train_val_indices_expanding[-window_size:]

            self.test_set = test_indices
            # The construction order is kept fixed because CrossValidation
            # draws from numpy's global random state.
            self.expanding_window_fv = self.ForwardValidation(train_val_indices_expanding)  # forward-validation
            self.expanding_window_cv = self.CrossValidation(train_val_indices_expanding)    # cross-validation
            self.sliding_window_fv = self.ForwardValidation(train_val_indices_sliding)      # forward-validation
            self.sliding_window_cv = self.CrossValidation(train_val_indices_sliding)        # cross-validation

        class CrossValidation:
            '''Class that defines the folds of Blocked Cross Validation for a test instance.

            The indices are cut into ``n_folds * n_blocks_per_fold``
            consecutive blocks, the blocks are randomly assigned to folds
            (numpy global random state), and each fold validates on its
            blocks.  ``purge_size`` samples around every validation block are
            excluded from the fold's training set.

            Results are stored in ``train_sets`` and ``val_sets`` (one entry
            per fold).
            '''
            purge_size = 7
            n_folds = 5
            # NOTE: the adjacency/purge logic below compares exactly two
            # blocks per fold, so this value is expected to stay at 2.
            n_blocks_per_fold = 2

            def __init__(self, train_val_inds):
                k = self.n_folds
                n_blocks_per_k = self.n_blocks_per_fold
                n_purge = self.purge_size

                n_blocks = k * n_blocks_per_k
                inds = train_val_inds
                n_inds = len(inds)

                # Split into consecutive n_blocks (positions, not dates);
                # earlier blocks absorb the remainder.
                p = 0
                block_inds = []
                for i in range(n_blocks):
                    nb = int(np.ceil((n_inds - p) / (n_blocks - i)))
                    block_inds.append(np.arange(p, min(p + nb, n_inds)))
                    p += nb

                # Random order: row i holds the block numbers of fold i.
                # Shape follows the class settings instead of a literal (5, 2).
                block_ord = np.random.permutation(n_blocks).reshape(k, n_blocks_per_k)

                # Select n_blocks_per_k
                self.train_sets = []
                self.val_sets = []
                proportion = float(1 / n_blocks)
                val_purge = int(np.ceil(proportion * n_purge))  # number of purge elements to be removed from val set
                for i in range(k):
                    select_array = np.full(n_inds, fill_value=1)  # 0: purge / 1: train / 2: val
                    first = None  # only meaningful when the two blocks are adjacent
                    if abs(block_ord[i, 0] - block_ord[i, 1]) == 1:  # no need to purge between the sets
                        purge = False
                        if block_ord[i, 0] - block_ord[i, 1] == -1:  # which block comes first
                            first = 0
                        else:
                            first = 1
                    else:
                        purge = True

                    for j in range(n_blocks_per_k):
                        # Select validation and purge indices
                        val_inds = block_inds[block_ord[i, j]]
                        purge_inds_before = np.arange(0)
                        purge_inds_after = np.arange(0)
                        if (purge or j == first) and val_inds[0] != 0:
                            val_inds = val_inds[val_purge:]  # purge before block
                            purge_inds_before = np.arange(max(0, val_inds[0] - n_purge), val_inds[0])
                        if (purge or j != first) and val_inds[-1] != (n_inds - 1):
                            val_inds = val_inds[:-val_purge]  # purge after block
                            purge_inds_after = np.arange(val_inds[-1] + 1, min(n_inds, val_inds[-1] + n_purge + 1))

                        select_array[val_inds] = 2
                        select_array[purge_inds_before] = 0
                        select_array[purge_inds_after] = 0

                    self.train_sets.append(inds[select_array == 1])
                    self.val_sets.append(inds[select_array == 2])

                # Sanity checks: no train/val overlap within a fold and no
                # validation overlap between folds.  Compare by value: "is 0"
                # is an identity test and a SyntaxWarning on Python >= 3.8.
                for i in range(k):
                    assert len(np.intersect1d(self.train_sets[i], self.val_sets[i])) == 0
                    for j in range(i + 1, k):
                        assert len(np.intersect1d(self.val_sets[i], self.val_sets[j])) == 0

        class ForwardValidation:
            '''Class that defines the Forward Validation sets for a test instance.

            Builds ``n_validation_sets`` (train, validation) pairs in which
            each training set is a prefix of the indices and the validation
            set is the stretch that follows it.  Prefix lengths grow by a
            factor of ``1 + val_test_ratio`` so every validation set is about
            ``val_test_ratio`` times its training span, and the last
            validation set ends at the last index.  ``purge_size`` samples
            are dropped at each train/validation boundary.

            Results are stored in ``train_sets`` and ``val_sets``.
            '''
            val_test_ratio = 0.2
            n_validation_sets = 5
            purge_size = 7

            def __init__(self, train_val_inds):
                n = self.n_validation_sets
                ratio = self.val_test_ratio
                n_purge = self.purge_size

                inds = train_val_inds
                n_inds = len(inds)

                # Vector with the ratios (train+val)/train for each validation (e.g., [1, 1.2, ..., 1.2])
                ratio_vec = np.r_[1, np.full(n - 1, fill_value=1 + ratio)]

                # Percentage of samples to be used in first train set so that the ratio can be kept
                perc_samples = 1 / (1 + ratio * np.sum(np.cumprod(ratio_vec)))

                # Final indices of each training set (plus the overall end)
                train_inds = np.round(n_inds * perc_samples * np.cumprod(ratio_vec)).astype(int)
                train_inds = np.r_[train_inds, n_inds]

                # Defines each training set
                self.train_sets = []
                self.val_sets = []
                val_purge = int(np.ceil(ratio * n_purge))  # number of purge elements to be removed from val set
                train_purge = n_purge - val_purge
                for i in range(n):
                    # Clamp at 0: with few samples the purge can exceed the
                    # training length, and a negative stop would wrap around
                    # and pull validation data into the training set.
                    train_end = max(0, train_inds[i] - train_purge)
                    self.train_sets.append(inds[:train_end])
                    self.val_sets.append(inds[train_inds[i] + val_purge:train_inds[i + 1]])
            
    

In [2]:
# Daily index covering the whole study period.
# pd.date_range replaces the DatetimeIndex(start=..., end=..., freq=...)
# constructor form, which pandas deprecated in 0.24 and removed in 1.0.
all_inds = pd.date_range(start='2015-01-01', end='2018-05-31', freq=pd.DateOffset(days=1))

# Start of each test period (order does not matter, TestSpec sorts them)
start_dates = ['2017-09', '2017-01', '2018-01', '2017-05']

# Number of most recent days kept in the sliding-window sets
window_size = 365

# Fix the seed: the cross-validation block order is drawn from numpy's global RNG
np.random.seed(12)
my_test = TestSpec(indices=all_inds, start_dates=start_dates, window_size=window_size)

[294 353 423 508 609 731]
[1.20068027 1.19830028 1.20094563 1.1988189  1.20032841]
[147 176 211 253 304 365]
[1.19727891 1.19886364 1.19905213 1.20158103 1.20065789]
[342 410 492 591 709 851]
[1.19883041 1.2        1.20121951 1.19966159 1.20028209]
[147 176 211 253 304 365]
[1.19727891 1.19886364 1.19905213 1.20158103 1.20065789]
[391 470 564 676 812 974]
[1.20204604 1.2        1.19858156 1.20118343 1.19950739]
[147 176 211 253 304 365]
[1.19727891 1.19886364 1.19905213 1.20158103 1.20065789]
[ 440  529  634  761  913 1096]
[1.20227273 1.19848771 1.20031546 1.19973719 1.20043812]
[147 176 211 253 304 365]
[1.19727891 1.19886364 1.19905213 1.20158103 1.20065789]


In [3]:
# i selects the test instance and j the split shown by the inspection cells below
i, j = 0, 0

In [7]:
# Show the test set next to each sliding-window forward-validation split
# (validation set first, then its training set) for test instance i.
for j in range(5):
    instance = my_test.test[i]
    fv_splits = instance.sliding_window_fv
    print(instance.test_set)
    print(fv_splits.val_sets[j])
    print(fv_splits.train_sets[j], '\n\n')

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-04-21', '2017-04-22', '2017-04-23', '2017-04-24',
               '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28',
               '2017-04-29', '2017-04-30'],
              dtype='datetime64[ns]', length=120, freq='<DateOffset: kwds={'days': 1}>')
DatetimeIndex(['2016-05-30', '2016-05-31', '2016-06-01', '2016-06-02',
               '2016-06-03', '2016-06-04', '2016-06-05', '2016-06-06',
               '2016-06-07', '2016-06-08', '2016-06-09', '2016-06-10',
               '2016-06-11', '2016-06-12', '2016-06-13', '2016-06-14',
               '2016-06-15', '2016-06-16', '2016-06-17', '2016-06-18',
               '2016-06-19', '2016-06-20', '2016-06-21', '2016-06-22',
               '2016-06-23', '2016-06-24', '2016-06-25'],
              dtype='datetime64[ns]',

In [4]:
# Show the test set next to each expanding-window forward-validation split
# (validation set first, then its training set) for test instance i.
for j in range(5):
    instance = my_test.test[i]
    fv_splits = instance.expanding_window_fv
    print(instance.test_set)
    print(fv_splits.val_sets[j])
    print(fv_splits.train_sets[j], '\n\n')

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10',
               ...
               '2017-04-21', '2017-04-22', '2017-04-23', '2017-04-24',
               '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28',
               '2017-04-29', '2017-04-30'],
              dtype='datetime64[ns]', length=120, freq='<DateOffset: kwds={'days': 1}>')
DatetimeIndex(['2015-10-24', '2015-10-25', '2015-10-26', '2015-10-27',
               '2015-10-28', '2015-10-29', '2015-10-30', '2015-10-31',
               '2015-11-01', '2015-11-02', '2015-11-03', '2015-11-04',
               '2015-11-05', '2015-11-06', '2015-11-07', '2015-11-08',
               '2015-11-09', '2015-11-10', '2015-11-11', '2015-11-12',
               '2015-11-13', '2015-11-14', '2015-11-15', '2015-11-16',
               '2015-11-17', '2015-11-18', '2015-11-19', '2015-11-20',
               '2015-11-

In [None]:
# Show the test set next to each expanding-window cross-validation fold
# (validation set first, then its training set) for test instance i.
for j in range(5):
    instance = my_test.test[i]
    cv_folds = instance.expanding_window_cv
    print(instance.test_set)
    print(cv_folds.val_sets[j])
    print(cv_folds.train_sets[j], '\n\n')

In [None]:
# `train_sets` was never defined in this notebook, so the cell only ran with
# leftover kernel state.  Bind the splits explicitly.
# NOTE(review): assumes the expanding-window cross-validation folds of test
# instance i were meant (the ones printed in the previous cell) - confirm.
cv_folds = my_test.test[i].expanding_window_cv

# Number of samples in each training fold; use cv_folds.val_sets the same way
# to inspect validation sizes or validation/train ratios.
[len(fold) for fold in cv_folds.train_sets]