RollingWindow cross-validation
A cross-validation strategy for timeseries, see http://robjhyndman.com/hyndsight/tscvexample

Initial commit, tests and unfinished docs
x0l committed Oct 6, 2014
1 parent da74962 commit c90a335
Showing 4 changed files with 157 additions and 12 deletions.
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -166,6 +166,7 @@ Classes
   cross_validation.LeaveOneOut
   cross_validation.LeavePLabelOut
   cross_validation.LeavePOut
   cross_validation.RollingWindow
   cross_validation.StratifiedKFold
   cross_validation.ShuffleSplit
   cross_validation.StratifiedShuffleSplit
37 changes: 26 additions & 11 deletions doc/modules/cross_validation.rst
@@ -261,7 +261,7 @@ training set::
[0 1 2] [3]




Potential users of LOO for model selection should weigh a few known caveats.
When compared with :math:`k`-fold cross validation, one builds :math:`n` models
from :math:`n` samples instead of :math:`k` models, where :math:`n > k`.
Moreover, each is trained on :math:`n - 1` samples rather than
@@ -275,10 +275,10 @@ the :math:`n` samples are used to build each model, models constructed from
folds are virtually identical to each other and to the model built from the
entire training set.

However, if the learning curve is steep for the training size in question,
then 5- or 10-fold cross validation can overestimate the generalization error.

As a general rule, most authors, and empirical evidence, suggest that 5- or
10-fold cross validation should be preferred to LOO.
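To make the cost comparison concrete, here is a back-of-the-envelope sketch in plain Python; the sample and fold counts are arbitrary illustrations, not values from the text:

```python
# How many models are fit, and how large each training set is,
# for LOO versus k-fold cross validation on n samples.
n = 100  # total number of samples (arbitrary)
k = 5    # number of folds for k-fold CV (arbitrary)

# Leave-One-Out builds n models, each trained on n - 1 samples.
loo_models = n
loo_train_size = n - 1

# k-fold builds only k models, each trained on n - n/k samples.
kfold_models = k
kfold_train_size = n - n // k

print(loo_models, loo_train_size)      # 100 99
print(kfold_models, kfold_train_size)  # 5 80
```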




@@ -290,7 +290,7 @@ fold cross validation should be preferred to LOO.
 * L. Breiman, P. Spector `Submodel selection and evaluation in regression: The X-random case
   <http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/197.pdf>`_, International Statistical Review 1992
 * R. Kohavi, `A Study of Cross-Validation and Bootstrap for Accuracy Estimation and Model Selection
   <http://www.cs.iastate.edu/~jtian/cs573/Papers/Kohavi-IJCAI-95.pdf>`_, Intl. Jnt. Conf. AI
 * R. Bharat Rao, G. Fung, R. Rosales, `On the Dangers of Cross-Validation. An Experimental Evaluation
   <http://www.siam.org/proceedings/datamining/2008/dm08_54_Rao.pdf>`_, SIAM 2008
 * G. James, D. Witten, T. Hastie, R Tibshirani, `An Introduction to
@@ -383,8 +381,6 @@ Example of Leave-2-Label Out::
Random permutations cross-validation a.k.a. Shuffle & Split
-----------------------------------------------------------


:class:`ShuffleSplit`

The :class:`ShuffleSplit` iterator will generate a user defined number of
independent train / test dataset splits. Samples are first shuffled and
then split into a pair of train and test sets.
@@ -408,11 +406,28 @@ Here is a usage example::
validation that allows a finer control on the number of iterations and
the proportion of samples on each side of the train / test split.
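The shuffle-then-split idea can be sketched in a few lines of NumPy. This is a simplified stand-in for the actual iterator, with hypothetical default values, not the class's real implementation:

```python
import numpy as np

def shuffle_split(n, n_iter=3, test_size=0.25, random_state=0):
    """Yield n_iter independent (train, test) index splits:
    shuffle all indices, then cut off a test block."""
    rng = np.random.RandomState(random_state)
    n_test = int(np.ceil(test_size * n))
    for _ in range(n_iter):
        permutation = rng.permutation(n)  # fresh shuffle each iteration
        yield permutation[n_test:], permutation[:n_test]

for train, test in shuffle_split(8):
    print(len(train), len(test))  # 6 2 on every iteration
```

Because each iteration reshuffles independently, the test sets of different iterations may overlap, unlike the disjoint folds of :math:`k`-fold.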


.. note::

    See also :class:`StratifiedShuffleSplit`: this is a variation of
    *ShuffleSplit*, which returns stratified splits, *i.e.* which creates
    splits by preserving the same percentage for each target class as in
    the complete set.

Rolling window
--------------

:class:`RollingWindow` is a cross-validation strategy suited for time series.

Here is a usage example::

    >>> rw = cross_validation.RollingWindow(5)
    >>> for train_index, test_index in rw:
    ...     print("%s %s" % (train_index, test_index))
    ...
    [0] [1]
    [0 1] [2]
    [0 1 2] [3]
    [0 1 2 3] [4]
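The other constructor parameters reshape the window: a fixed ``train_size`` slides a constant-width window along the series, and ``delay`` leaves a gap between train and test. As a sketch of these semantics, here is a simplified pure-Python mirror of the indices the iterator yields (the parameter values below are illustrative):

```python
def rolling_window(n, n_train=None, n_test=1, delay=0):
    """Yield (train, test) index lists for a rolling-window split.
    With n_train=None the training set grows over time; otherwise it
    is a fixed-width window ending `delay` samples before the test."""
    first = (1 if n_train is None else n_train) + delay
    for start_test in range(first, n, n_test):
        end_test = min(start_test + n_test, n)
        start_train = 0 if n_train is None else start_test - delay - n_train
        yield (list(range(start_train, start_test - delay)),
               list(range(start_test, end_test)))

for train, test in rolling_window(7, n_train=3):
    print(train, test)
# [0, 1, 2] [3]
# [1, 2, 3] [4]
# [2, 3, 4] [5]
# [3, 4, 5] [6]
```

Unlike shuffled strategies, every training index precedes every test index, which is what makes the scheme valid for time series forecasting.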


A note on shuffling
===================
104 changes: 103 additions & 1 deletion sklearn/cross_validation.py
@@ -27,7 +27,7 @@
from .utils.multiclass import type_of_target
from .externals.joblib import Parallel, delayed, logger
from .externals.six import with_metaclass
from .externals.six.moves import zip, xrange
from .metrics.scorer import check_scoring


__all__ = ['Bootstrap',
@@ -36,6 +36,7 @@
           'LeaveOneOut',
           'LeavePLabelOut',
           'LeavePOut',
           'RollingWindow',
           'ShuffleSplit',
           'StratifiedKFold',
           'StratifiedShuffleSplit',
@@ -1073,6 +1074,107 @@ def __len__(self):
        return self.n_iter




class RollingWindow(object):
    """Rolling window cross-validation strategy for timeseries

    Provides train/test indices increasing with time.

    Parameters
    ----------
    n : int
        Total number of elements in the dataset.
    test_size : float, int (default is 1)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split. If
        int, represents the absolute number of test samples.
    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the train size grows progressively.
    delay : int (default is 0)
        Number of samples skipped between the train and the test sets.
    step : float, int, or None (default is None)
        Increment between successive test windows. If None, step is set
        equal to test_size.

    Examples
    --------
    >>> from sklearn import cross_validation
    >>> rw = cross_validation.RollingWindow(5)
    >>> len(rw)
    4
    >>> print(rw)
    RollingWindow(5, test_size=1, train_size=None, delay=0, step=None)
    >>> for train_index, test_index in rw:
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...
    TRAIN: [0] TEST: [1]
    TRAIN: [0 1] TEST: [2]
    TRAIN: [0 1 2] TEST: [3]
    TRAIN: [0 1 2 3] TEST: [4]

    References
    ----------
    See http://robjhyndman.com/hyndsight/tscvexample
    """

    def __init__(self, n, test_size=1, train_size=None, delay=0, step=None):
        self.n = n
        self.test_size = test_size
        self.train_size = train_size
        self.delay = delay
        self.step = step

        self.n_train, self.n_test = _validate_shuffle_split(n,
                                                            test_size,
                                                            train_size)
        # By default, advance by one full test window at each iteration.
        self.step_ = self.n_test
        if np.asarray(step).dtype.kind == 'f':
            self.step_ = int(ceil(step * n))
        elif np.asarray(step).dtype.kind == 'i':
            self.step_ = step

    def __iter__(self):
        start_train = 0

        first_idx = 1 if self.train_size is None else self.n_train
        first_idx += self.delay

        for start_test in xrange(first_idx, self.n, self.step_):

            end_test = min(start_test + self.n_test, self.n)

            if self.train_size is not None:
                # Fixed-width training window sliding with the test set.
                start_train = start_test - self.delay - self.n_train

            train = np.arange(start_train, start_test - self.delay)
            test = np.arange(start_test, end_test)

            yield train, test

    def __len__(self):
        first_idx = self.n_train if self.train_size is not None else 1
        first_idx += self.delay

        n_splits = (self.n - first_idx) / self.step_
        return int(ceil(n_splits))

    def __repr__(self):
        return ('%s(%i, test_size=%i, train_size=%s, delay=%i, '
                'step=%s)' % (
                    self.__class__.__name__,
                    self.n,
                    self.test_size,
                    str(self.train_size),
                    self.delay,
                    str(self.step),
                ))


##############################################################################




27 changes: 27 additions & 0 deletions sklearn/tests/test_cross_validation.py
@@ -461,6 +461,33 @@ def test_leave_label_out_changing_labels():
        assert_array_equal(test, test_chan)




def test_rolling_window_split():
    # Float, int and numpy-int test_size should produce identical splits.
    rw1 = cval.RollingWindow(10, test_size=0.2)
    rw2 = cval.RollingWindow(10, test_size=2)
    rw3 = cval.RollingWindow(10, test_size=np.int32(2))
    for typ in six.integer_types:
        rw4 = cval.RollingWindow(10, test_size=typ(2))
        for t1, t2, t3, t4 in zip(rw1, rw2, rw3, rw4):
            assert_array_equal(t1[0], t2[0])
            assert_array_equal(t2[0], t3[0])
            assert_array_equal(t3[0], t4[0])
            assert_array_equal(t1[1], t2[1])
            assert_array_equal(t2[1], t3[1])
            assert_array_equal(t3[1], t4[1])

    # A fixed train_size yields constant-width training windows.
    rw5 = cval.RollingWindow(5, train_size=2)
    assert_equal(len(rw5), 3)
    for t in rw5:
        assert_equal(len(t[0]), 2)
        assert_equal(len(t[1]), 1)

    # Float and int step should be equivalent.
    rw6 = cval.RollingWindow(10, step=0.2)
    rw7 = cval.RollingWindow(10, step=2)
    for t1, t6, t7 in zip(rw1, rw6, rw7):
        assert_array_equal(t1[0], t6[0])
        assert_array_equal(t1[0], t7[0])
        assert_equal(t1[1][0], t6[1][0])
        assert_equal(t1[1][0], t7[1][0])


def test_cross_val_score():
    clf = MockClassifier()
    for a in range(-10, 10):
