In [1]:
from dataclasses import dataclass
from typing import Union, Optional
import numpy as np
import pandas as pd

In [2]:
@dataclass
class TimeConfig:
    """Container for storing and accessing time configuration.

    Attributes:
        start_year: Start of the period as a string, e.g. ``'1979'``.
        end_year: End of the period as a string, e.g. ``'2021'``.
        target_start: Start of the target period; the default looks like
            ``'MM-DD'`` -- TODO confirm the expected format.
        target_end: End of the target period, same format as ``target_start``.
        freq: Frequency alias forwarded to ``pd.date_range`` by ``datetimes``.
    """
    start_year: str
    end_year: str
    target_start: str = '11-01'
    target_end: str = '12-01'
    # NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
    # 'ME'; confirm the default against the pandas version in use.
    freq: Optional[str] = '2M' # daily/monthly
    # tfreq: Union[int, str] = 7

    @property
    def datetimes(self):
        """Return the time axis spanned by this configuration.

        ``LeaveOutN`` reads ``timeconfig.datetimes`` to build its index, so the
        attribute has to exist for the notebook to run on a fresh kernel.

        NOTE(review): the range is built with ``pd.date_range`` from
        ``start_year`` to ``end_year`` (both ends included) at ``freq``; confirm
        this matches the intended axis (e.g. whether leap days should be dropped).

        Returns:
            pd.DatetimeIndex
        """
        return pd.date_range(start=self.start_year, end=self.end_year, freq=self.freq)

    def get_dates_load(self):
        """Return index which lines up with the bins.

        Intended behaviour: leap days are removed.
        (see the function "timeseries_tofit_bins" in line 628 of functions_pp)

        Not implemented yet -- currently returns None.
        Intended return type: pd.DatetimeIndex
        """
        return None

    def get_resample_bins(self):
        """Return bins to aggregate to n day / n month means.

        This function could work with the precursor and target variables.

        Not implemented yet -- currently returns None.
        """
        return None

    def split_train_test_groups(self):
        """Placeholder for the train/test group splitting.

        Not implemented yet -- currently returns None.
        """
        return None

In [None]:
# Design sketch of the intended TimeConfig API -- not runnable as written:
# start_year, end_year, target_start, target_end and freq are not defined in the
# cells shown here, and the attributes accessed below are not defined on
# TimeConfig in the cells shown here.
tc = TimeConfig(start_year, end_year, target_start, target_end, freq)
tc.dates2bin # time passed to data loader
tc.labelbins # same axis as dates2bin, but has labels to group by n day/ n month means
tc.traintestgroups # smart splitting of train/test data and avoid overlapping
tc.df_split # aggregated time axis pandas dataframe with columns train=true and RV_mask

In [3]:
@dataclass
class TrainTestSet:
    """Common base for the train/test splitting strategies.

    Wraps a DataFrame whose ``traintest`` column holds an integer code per row.
    Rows coded 1 are returned by ``get_test_labels`` and rows coded 2 by
    ``get_training_labels``; rows with any other code are returned by neither.
    """
    df: pd.DataFrame

    def get_test_labels(self):
        """Return the rows of ``df`` whose ``traintest`` value equals 1."""
        test_rows = self.df.query('traintest==1')
        return test_rows

    def get_training_labels(self):
        """Return the rows of ``df`` whose ``traintest`` value equals 2."""
        training_rows = self.df.query('traintest==2')
        return training_rows

class LeaveOutN(TrainTestSet):
    """TrainTestSet based on a leave-n-out sampling method.

    NOTE: the split itself is still a placeholder. Every timestamp receives a
    uniformly random label from {0, 1, 2}; ``n`` and ``max_lag`` are stored on
    the instance but do not influence the labels yet.

    Args:
        timeconfig: Object exposing a ``datetimes`` attribute, used as the index
            of the resulting DataFrame.
        n: Stored as ``self.n``; not used for the labels yet.
        max_lag: Stored as ``self.max_lag``; not used for the labels yet.
        seed: Optional seed for reproducible labels. ``None`` (the default)
            draws from NumPy's global random state, as before.
    """
    def __init__(self, timeconfig, n, max_lag=None, seed=None):
        times = timeconfig.datetimes
        # A dedicated RandomState gives reproducible labels without reseeding
        # the global generator; with seed=None the global state is used.
        rng = np.random if seed is None else np.random.RandomState(seed)
        labels = rng.randint(0, 3, len(times))  # upper bound is exclusive
        self.n = n
        self.max_lag = max_lag
        self.df = pd.DataFrame({"traintest": labels}, index=times)

@dataclass
class Random(TrainTestSet):
    """TrainTestSet based on a random sampling method.

    Not implemented yet: only the configuration field is declared.
    """
    # Declared as a dataclass field so it can be passed to the constructor
    # (a bare annotation on an undecorated subclass is ignored). The None
    # default keeps ``Random(df)`` working.
    # NOTE(review): the meaning of ``n`` is not defined here -- confirm.
    n: Optional[int] = None

@dataclass
class Ranstrat(TrainTestSet):
    """TrainTestSet based on a random stratified sampling method.

    Not implemented yet: only the configuration field is declared.
    """
    # Declared as a dataclass field so it can be passed to the constructor
    # (a bare annotation on an undecorated subclass is ignored). The None
    # default keeps ``Ranstrat(df)`` working.
    # NOTE(review): the meaning of ``n`` is not defined here -- confirm.
    n: Optional[int] = None

@dataclass
class split(TrainTestSet):
    """TrainTestSet based on a simple split method.

    Not implemented yet: only the configuration field is declared.
    """
    # Declared as a dataclass field so it can be passed to the constructor
    # (a bare annotation on an undecorated subclass is ignored). The None
    # default keeps ``split(df)`` working.
    # NOTE(review): the meaning of ``n`` is not defined here -- confirm.
    n: Optional[int] = None

@dataclass
class TimeSeriesSplit(TrainTestSet):
    """TrainTestSet based on a "one-step-ahead" method.

    Not implemented yet: only the configuration field is declared.
    """
    # Declared as a dataclass field so it can be passed to the constructor
    # (a bare annotation on an undecorated subclass is ignored). The None
    # default keeps ``TimeSeriesSplit(df)`` working.
    # NOTE(review): the meaning of ``n`` is not defined here -- confirm.
    n: Optional[int] = None

@dataclass
class RepeatedKfold(TrainTestSet):
    """TrainTestSet based on a repeated k-fold with different randomizations.

    Not implemented yet: only the configuration fields are declared.
    """
    # Declared as dataclass fields so they can be passed to the constructor
    # (bare annotations on an undecorated subclass are ignored). The None
    # defaults keep ``RepeatedKfold(df)`` working.
    n_repeats: Optional[int] = None
    n_folds: Optional[int] = None

In [4]:
# Configure the period 1979-2021 with freq='D' (daily).
tc = TimeConfig(start_year = '1979', end_year = '2021', freq='D')
# LeaveOutN reads tc.datetimes to build its index, so that attribute must be
# available on TimeConfig for this cell to run on a fresh kernel.
ttset = LeaveOutN(tc, n=5)
# Last expression of the cell: displays the rows whose traintest value is 2.
ttset.get_training_labels()

Unnamed: 0,traintest
1979-01-03,2
1979-01-09,2
1979-01-16,2
1979-01-18,2
1979-01-19,2
...,...
2020-12-08,2
2020-12-14,2
2020-12-24,2
2020-12-29,2


In [None]:
class CrossValidator():
    """Perform cross-validation of training/testing data"""
    def __init__(self):