In [None]:
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm

# PanelSplit Demo

In this notebook I introduce PanelSplit, a cross validator for panel data.


### What is a cross validator?

- Check out the visualizations in the [0. CrossValidation Strategies](https://www.kaggle.com/code/tomwarrens/timeseriessplit-how-to-use-it/notebook#0.-CrossValidation-Strategies) section of this kaggle notebook to understand how a cross validator works. 

- The cross validator defines these folds (subsets of training data), and the train/test sets within each fold.

- They are often used with [hyper-parameter optimizers](https://scikit-learn.org/stable/modules/classes.html#hyper-parameter-optimizers). The hyper-parameter optimizer takes these folds and a set of hyper-parameters, and iterates through different hyperparameter combinations, evaluating each one's performance using the cross-validator's defined folds to determine the optimal set of hyperparameters for the model.


### How does time-series cross-validation work? Understanding [TimeSeriesSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)


From the sklearn documentation:

>TimeSeriesSplit is a time-series cross validator. It provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets.
>
>In each split, test indices must be higher than before, and thus shuffling in cross validator is inappropriate.

Upon initialization, it takes the following parameters:

- n_splits: Number of splits
- gap: Number of samples to exclude from the end of each train set before the test set
- test_size: Used to limit the size of the test set
- max_train_size: Maximum size for a single training set.

In [None]:
# Sample indices: six consecutive positions (0-5) standing in for the time steps of a single series.
x = list(range(6))
print(f'Sample indices:{x}')

In [None]:
# Create a TimeSeriesSplit (3 folds, 1-sample gap, 1-sample test sets) and call .split(x) on it.
# Note that `tss` holds the object returned by split(), not the TimeSeriesSplit instance itself.
tss = TimeSeriesSplit(n_splits=3, gap=1, test_size=1).split(x)
print(f'TimeSeriesSplit.split() returns type {type(tss)}')

Calling TimeSeriesSplit.split() returns a 'generator' object. Nothing is computed until you iterate over it; each iteration then yields one fold as a tuple of (train indices, test indices). You can read more about generators [here](https://realpython.com/introduction-to-python-generators/), but the main point is that looping over it gives you the folds one by one, and that a generator can only be consumed once (wrap it in `list()` if you need to reuse the folds).

In [None]:
# Loop over the folds produced by split(); each item unpacks into (train indices, test indices).
for i, (train_index, test_index) in enumerate(tss):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

Feel free to explore with different parameters (gap, n_splits, test_size, max_train_size) to understand how they work

In [None]:
# here is a function to visualize how time series split works.
def plot_time_series_splits(split_output):
    """
    Draw one row of dots per fold: train indices in blue, test indices in red.

    Parameters:
    - split_output: iterable of (train_index, test_index) pairs, e.g. the output of TimeSeriesSplit.split()
    """
    # Materialize first so that a generator can be both counted and iterated.
    fold_pairs = list(split_output)
    n_folds = len(fold_pairs)
    fig, ax = plt.subplots()

    for fold_no, (train_idx, test_idx) in enumerate(fold_pairs):
        # Same marker for both sets; only the colour distinguishes train from test.
        for positions, colour in ((train_idx, 'blue'), (test_idx, 'red')):
            ax.scatter(positions, [fold_no] * len(positions), color=colour, marker='.', s=50)

    ax.set_xlabel('Index')
    ax.set_ylabel('Fold')
    ax.set_title('TimeSeriesSplit Cross-Validation')

    # One y tick per fold, labelled with the fold number.
    tick_positions = range(n_folds)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels([f'{k}' for k in tick_positions])
    plt.show()

In [None]:
# Ten samples split into three folds: one test sample per fold, no gap,
# and at most three training samples per fold.
x = list(range(10))
# list() materializes the folds so they can be both plotted and printed.
tss = list(TimeSeriesSplit(n_splits=3, gap=0, test_size=1, max_train_size=3).split(x))
plot_time_series_splits(tss)
print(tss)

Unfortunately, while TimeSeriesSplit is useful for univariate time series, it cannot be applied directly to panel data, where every period is observed once per entity. 

It needs to be adapted for usage when there are multiple entities.

## PanelSplit

PanelSplit works as follows:

1. Create train and test indices for each fold by passing the period series to TimeSeriesSplit
2. For the train and test sets of each fold, substitute the indices with the corresponding period values
3. For the train and test periods of each fold, find the rows of the panel data whose period is among them and return the indices of those rows. 

Here is what the cross validator looks like. Although the X, y and groups arguments aren't used by split() and get_n_splits(), they must be accepted, because a hyper-parameter optimizer passes them when it calls these methods.

In [None]:
class PanelSplit:
    """
    Time-series cross validator for panel data.

    The unique periods are split with TimeSeriesSplit, and each fold's train/test periods are then
    mapped back to the rows of the panel that were observed in those periods.
    """
    def __init__(self, unique_periods, train_periods, n_splits = 5, gap = None, test_size = None, max_train_size=None):
        """
        A class for performing time series cross-validation with custom train/test splits based on unique periods.

        Parameters:
        - unique_periods: Pandas DataFrame or Series containing unique periods. They are split by
          position, so they should already be sorted in time order.
        - train_periods: Pandas Series holding the period of every row of the panel
        - n_splits: Number of splits for TimeSeriesSplit
        - gap: Gap between train and test sets in TimeSeriesSplit (None means no gap)
        - test_size: Size of the test set in TimeSeriesSplit
        - max_train_size: Maximum size for a single training set.
        """
        # TimeSeriesSplit does arithmetic on the gap, so the None default must become 0.
        gap = 0 if gap is None else gap
        self.tss = TimeSeriesSplit(n_splits=n_splits, gap=gap, test_size=test_size, max_train_size = max_train_size)
        indices = self.tss.split(unique_periods)
        self.u_periods_cv = self.split_unique_periods(indices, unique_periods)
        self.all_periods = train_periods
        self.n_splits = n_splits
        
    def split_unique_periods(self, indices, unique_periods):
        """
        Split unique periods into train/test sets based on TimeSeriesSplit indices.

        Parameters:
        - indices: TimeSeriesSplit indices (iterable of (train_index, test_index) pairs)
        - unique_periods: Pandas DataFrame or Series containing unique periods

        Returns: List of tuples containing train and test periods
        """
        return [
            (unique_periods.iloc[train_index].values, unique_periods.iloc[test_index].values)
            for train_index, test_index in indices
        ]

    def split(self, X = None, y = None, groups=None):
        """
        Generate train/test indices based on unique periods.

        X, y and groups are accepted for compatibility with hyper-parameter optimizers but are not used.

        Returns: List of (train_indices, test_indices) tuples. The indices are integer *positions*
        of the rows (numpy arrays), so they are valid for .iloc and for scikit-learn regardless of
        the index labels carried by the panel data.
        """
        self.all_indices = []
        
        for train_periods, test_periods in self.u_periods_cv:
            # Boolean mask over the panel rows -> positions of the rows that belong to the fold.
            train_indices = np.flatnonzero(np.asarray(self.all_periods.isin(train_periods)))
            test_indices = np.flatnonzero(np.asarray(self.all_periods.isin(test_periods)))
            self.all_indices.append((train_indices, test_indices))
        
        return self.all_indices
   
    def get_n_splits(self, X=None, y =None, groups=None):
        """
        Returns: Number of splits
        """
        return self.n_splits


### Load example data

In [None]:
# Read the data and keep only the periods strictly between 201412 and 201507,
# i.e. the first half of 2015 (the period column appears to hold YYYYMM integers).
# Keep two countries (Spain and France) and five columns: the two identifiers (isocode, period),
# the target variable (ons_armedconf_12) and 2 features.
# reset_index(drop=True) renumbers the remaining rows from 0.
df = pd.read_csv('df_merged_light.csv'). \
    query('period < 201507 & period > 201412'). \
    query("isocode.isin(['ESP','FRA'])") \
    [['isocode','period','ons_armedconf_12','pr_topic_0', 'past_bestpc_12']]. \
    reset_index(drop=True)

display(df)

### Initializing PanelSplit

In [None]:
# define the unique period series: one entry per distinct period, sorted ascending
# (the periods are split by position, so the order matters)
unique_sorted_periods = pd.Series(df.period.unique()).sort_values()

# initialize panel split: 3 folds, a one-period gap between train and test, one test period per fold
panel_split = PanelSplit(n_splits = 3, gap=1, test_size=1, 
                         unique_periods=unique_sorted_periods, train_periods=df.period)

# u_periods_cv is a list with one (train periods, test periods) tuple per fold
print('Initalizing PanelSplit creates folds of periods. These are within self.u_periods_cv.')
for i, (train_periods, test_periods) in enumerate(panel_split.u_periods_cv):
    print(f"Fold {i}:")
    print(f"  Train: periods={train_periods}")
    print(f"  Test:  periods={test_periods}")

### split()

After we have substituted the period values for the indices in the PanelSplit initialization, the split() function filters for observations in the panel data that 
match the folds of the u_periods_cv list.

In [None]:
# specify the features and target
feature_cols = ['pr_topic_0', 'past_bestpc_12']; target_col = 'ons_armedconf_12'
features = df[feature_cols]; target = df[target_col]

# split() returns a list with one (train indices, test indices) tuple per fold
panel_data_cv = PanelSplit(n_splits = 3, gap=1, test_size=1, unique_periods=unique_sorted_periods, train_periods=df.period). \
    split(features, target)

print('The split function returns panel data indices.')
print('First fold indices:')
print(panel_data_cv[0])

# panel_data_cv[fold][0] selects the train rows of a fold, panel_data_cv[fold][1] its test rows
print('First fold train set:')
display(df.loc[panel_data_cv[0][0]])

print('First fold test set:')
display(df.loc[panel_data_cv[0][1]])

print('Second fold indices:')
print(panel_data_cv[1])

print('Second fold train set:')
display(df.loc[panel_data_cv[1][0]])

print('Second fold test set:')
display(df.loc[panel_data_cv[1][1]])

### Hyperparameter tuning with PanelSplit

Before doing hyperparameter tuning, I reset indices and drop NaN values with respect to both feature variables and the target. This usually saves me from indexing errors when working with PanelSplit.

In [None]:
# Same selection as before: first half of 2015, Spain and France, identifiers + target + 2 features.
df = pd.read_csv('df_merged_light.csv', engine='pyarrow'). \
    query('period < 201507 & period > 201412'). \
    query("isocode.isin(['ESP','FRA'])") \
    [['isocode','period','ons_armedconf_12','pr_topic_0', 'past_bestpc_12']]. \
    reset_index(drop=True)

feature_cols = ['pr_topic_0', 'past_bestpc_12']; target_col = 'ons_armedconf_12'

# Drop rows with a NaN in any feature or in the target, then renumber the rows from 0
# so that index labels and row positions coincide.
df = df.dropna(subset=feature_cols + [target_col]).reset_index(drop=True)
features = df[feature_cols]; target = df[target_col]
# Recompute the sorted unique periods from the rows that survived the dropna.
unique_sorted_periods = pd.Series(df.period.unique()).sort_values()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Three folds, each testing on one period that is separated from its training periods by a one-period gap.
panel_split = PanelSplit(n_splits = 3, gap= 1, test_size= 1, unique_periods=unique_sorted_periods, train_periods=df.period)

param_grid = {'max_depth': [2, 3]}

# The forest is seeded so that the grid-search scores are reproducible from one run to the next.
param_search = GridSearchCV(estimator = RandomForestClassifier(random_state=0), 
                            param_grid = param_grid,
                            cv=panel_split,
                            verbose = 3)

param_search.fit(features, target)

print('GridSearch results:')
display(pd.DataFrame(param_search.cv_results_))

Let's try a more practical example, setting the gap to 12, working with all countries up to 2016, and including more feature columns.

In [None]:
# read in the data (all countries, periods before 201601):
df = pd.read_csv('df_merged_light.csv', engine='pyarrow').query('period < 201601')

feature_cols = ['pr_topic_0', 'pr_topic_1', 'pr_topic_2', 'population','since_armedconf', 'past_bestpc_6', 'past_bestpc_12']; target_col = 'ons_armedconf_12'
# dropping NaNs and resetting indices, so that index labels and row positions coincide
train = df.dropna(subset= feature_cols + [target_col]).reset_index(drop=True)

features, target, train_periods = train[feature_cols], train[target_col], train.period

# generate a unique sorted period series.
unique_sorted_periods = pd.Series(train_periods.unique()).sort_values()

# three folds of one test period each, with 12 periods left out between train and test
panel_split = PanelSplit(n_splits = 3, gap= 12, test_size= 1, unique_periods=unique_sorted_periods, train_periods=train_periods)

param_grid = {
    'max_depth': [3, 7],
    'min_samples_leaf':[10,100]
}

# The forest is seeded so that the grid-search ranking is reproducible from one run to the next.
param_search = GridSearchCV(RandomForestClassifier(random_state=0), 
                                param_grid,
                                scoring='roc_auc', 
                                cv=panel_split,
                                n_jobs=-1,
                                verbose = 1)
param_search.fit(features, target)

print('GridSearch results:')
display(pd.DataFrame(param_search.cv_results_).sort_values('rank_test_score'))

### Forecasting with PanelSplit
Once we have determined the best parameters using a hyper-parameter optimizer, PanelSplit can also be used for generating predictions.

In this case I create cross_val_predict to take a classifier and for each fold, train on the train indices and predict_proba on the test indices.

In [None]:
def cross_val_predict(estimator, X, y, indices, cv):
    """
    Perform cross-validated predictions using a given predictor model.

    For every fold produced by ``cv.split(X, y)`` the estimator is fitted on the training rows
    whose target is not NaN, and the probability of the positive class is predicted for the
    test rows. All rows are selected by position, so X, y and indices only need to have their
    rows in the same order; their index labels do not have to run from 0 to n-1.

    Parameters:
    -----------
    estimator : The machine learning model used for prediction.
        Must provide fit(X, y) (returning the fitted model) and predict_proba(X).
        The same object is refitted on every fold.

    X : pandas DataFrame
        The input features for the predictor.

    y : pandas Series
        The target variable to be predicted. Its name is used for the output columns.

    indices : pandas DataFrame
        Indices corresponding to the dataset (one row per row of X).

    cv : cross-validation generator
        A cross-validation splitting strategy; its split(X, y) must yield
        (train positions, test positions) pairs.

    Returns:
    --------
    pd.DataFrame
        Concatenated DataFrame containing predictions made by the model during cross-validation.
        It includes the original indices, the observed target and the predicted values
        (column ``f'{y.name}_pred'``).

    Raises:
    -------
    ValueError
        Raised by pd.concat when cv yields no folds.
    """
    predictions = []

    for train_index, test_index in tqdm(cv.split(X, y)):
        # Keep only the training positions whose target is observed. Working with positions
        # (rather than with the labels left over after a dropna) keeps .iloc valid for any index.
        train_index = np.asarray(train_index)
        has_target = y.iloc[train_index].notna().to_numpy()
        train_positions = train_index[has_target]

        X_train, y_train = X.iloc[train_positions], y.iloc[train_positions]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Identifier columns of the test rows, with the observed target attached by position.
        pred = indices.iloc[test_index].copy()
        pred[y.name] = y_test.to_numpy()

        model = estimator.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the second (positive) class.
        pred[f'{y.name}_pred'] = model.predict_proba(X_test)[:, 1]

        predictions.append(pred)

    return pd.concat(predictions, axis=0)

# Fit a forest with the best grid-search parameters on every fold and collect its test-period probabilities.
# The forest is seeded so that the predictions are reproducible from one run to the next.
cross_val_predict(estimator= RandomForestClassifier(random_state=0, **param_search.best_params_), X = features, y = target, indices = train[['isocode','period']], cv = panel_split)

Note that this cross_val_predict allows NaN values in the target: you can drop NaN observations from the dataframe with respect to the features only, and the function then drops the training observations where the target is NaN:

In [None]:
# read in the data, dropping NaNs only with respect to the feature columns.
# (rows with a NaN target are kept; reset_index makes index labels and row positions coincide)
train = pd.read_csv('df_merged_light.csv', engine='pyarrow'). \
    dropna(subset= feature_cols). \
        reset_index(drop=True)

features, target, train_periods = train[feature_cols], train[target_col], train.period

unique_sorted_periods = pd.Series(train_periods.unique()).sort_values()

panel_split = PanelSplit(n_splits = 3, gap= 12, test_size= 1, 
                         unique_periods=unique_sorted_periods, train_periods=train_periods)

# The forest is seeded so that the predictions are reproducible from one run to the next.
cross_val_predict(RandomForestClassifier(random_state=0, **param_search.best_params_), features, target, train[['isocode','period']], panel_split)

### Customizing PanelSplit
PanelSplit can be further customized. Here I add a function to plot and visualize the time-series splits

In [None]:
class PanelSplit:
    """
    Time-series cross validator for panel data, with an optional plot of the folds.

    The unique periods are split with TimeSeriesSplit, and each fold's train/test periods are then
    mapped back to the rows of the panel that were observed in those periods.
    """
    def __init__(self, unique_periods, train_periods, n_splits = 5, gap = None, test_size = None, max_train_size=None, plot=False):
        """
        A class for performing time series cross-validation with custom train/test splits based on unique periods.

        Parameters:
        - unique_periods: Pandas DataFrame or Series containing unique periods. They are split by
          position, so they should already be sorted in time order.
        - train_periods: Pandas Series holding the period of every row of the panel
        - n_splits: Number of splits for TimeSeriesSplit
        - gap: Gap between train and test sets in TimeSeriesSplit (None means no gap)
        - test_size: Size of the test set in TimeSeriesSplit
        - max_train_size: Maximum size for a single training set.
        - plot: Flag to visualize time series splits
        """
        # TimeSeriesSplit does arithmetic on the gap, so the None default must become 0.
        gap = 0 if gap is None else gap
        self.tss = TimeSeriesSplit(n_splits=n_splits, gap=gap, test_size=test_size, max_train_size = max_train_size)
        indices = self.tss.split(unique_periods)
        self.u_periods_cv = self.split_unique_periods(indices, unique_periods)
        self.all_periods = train_periods
        self.n_splits = n_splits
        if plot:
            self.plot_time_series_splits(self.u_periods_cv)
        
    def split_unique_periods(self, indices, unique_periods):
        """
        Split unique periods into train/test sets based on TimeSeriesSplit indices.

        Parameters:
        - indices: TimeSeriesSplit indices (iterable of (train_index, test_index) pairs)
        - unique_periods: Pandas DataFrame or Series containing unique periods

        Returns: List of tuples containing train and test periods
        """
        return [
            (unique_periods.iloc[train_index].values, unique_periods.iloc[test_index].values)
            for train_index, test_index in indices
        ]

    def split(self, X = None, y = None, groups=None):
        """
        Generate train/test indices based on unique periods.

        X, y and groups are accepted for compatibility with hyper-parameter optimizers but are not used.

        Returns: List of (train_indices, test_indices) tuples. The indices are integer *positions*
        of the rows (numpy arrays), so they are valid for .iloc and for scikit-learn regardless of
        the index labels carried by the panel data.
        """
        self.all_indices = []
        
        for train_periods, test_periods in self.u_periods_cv:
            # Boolean mask over the panel rows -> positions of the rows that belong to the fold.
            train_indices = np.flatnonzero(np.asarray(self.all_periods.isin(train_periods)))
            test_indices = np.flatnonzero(np.asarray(self.all_periods.isin(test_periods)))
            self.all_indices.append((train_indices, test_indices))
        
        return self.all_indices
   
    def get_n_splits(self, X=None, y =None, groups=None):
        """
        Returns: Number of splits
        """
        return self.n_splits
    
    def plot_time_series_splits(self, split_output):
        """
        Visualize time series splits using a scatter plot (train periods in blue, test periods in red).

        Parameters:
        - split_output: List of (train periods, test periods) tuples, one per fold. The periods
          are converted to dates with the '%Y%m' format, so they must look like 201501.
        """
        folds = len(split_output)
        fig, ax = plt.subplots()
        
        def int_to_dt(an_array):
            # e.g. 201501 -> '201501' -> 2015-01-01
            return pd.to_datetime(an_array.astype(str), format='%Y%m')

        for i, (train_index, test_index) in enumerate(split_output):
            ax.scatter(int_to_dt(train_index), [i] * len(train_index), color='blue', marker='.', s=50)
            ax.scatter(int_to_dt(test_index), [i] * len(test_index), color='red', marker='.', s=50)

        ax.set_xlabel('Periods')
        ax.set_ylabel('Folds')
        ax.set_title('Cross-Validation Splits')
        ax.grid(True)
        ax.set_yticks(range(folds))  # Set the number of ticks on y-axis
        ax.set_yticklabels([f'{i}' for i in range(folds)])  # Set custom labels for y-axis
        plt.show()


In [None]:
# Twelve folds of one test period each, a three-period gap, at most 48 training periods per fold;
# plot=True draws the folds as soon as the splitter is constructed.
panel_split = PanelSplit(unique_sorted_periods, train_periods, n_splits=12, gap=3, test_size=1, plot=True, max_train_size=48)

In this version, I add:
- Warnings about one-class folds, and the option to omit them
- Working with different 'updated' values


In [None]:
class PanelSplit_updated:
    """
    Panel-data cross validator that matches rows on an 'updated' value and can warn about,
    or omit, folds whose test set contains no positive target.
    """
    # Lower bound for the 'updated' value a fold is matched against.
    # NOTE(review): 201001 is presumably the earliest available 'updated' value (YYYYMM) - confirm.
    MIN_UPDATED_PERIOD = 201001

    def __init__(self, n_splits, gap, test_size, unique_periods, train_periods, train_updated, drop_one_class_folds=False, X=None, y=None, plot=False, return_warning=True):
        """
        A class for performing time series cross-validation with custom train/test splits based on unique periods.

        Parameters:
        - n_splits: Number of splits for TimeSeriesSplit
        - gap: Gap between train and test sets in TimeSeriesSplit (None means no gap)
        - test_size: Size of the test set in TimeSeriesSplit
        - unique_periods: Pandas DataFrame or Series containing unique periods
        - train_periods: All available training periods (one value per row of the panel)
        - train_updated: 'updated' value of every row, aligned with train_periods. A row is used in a
          fold only when this value equals the fold's last test period (or MIN_UPDATED_PERIOD if larger)
        - drop_one_class_folds: Flag to drop folds with only one class in the test set
        - X: Input features (not needed to build the folds)
        - y: Target variable. When it is None the one-class check is skipped and every fold is kept
        - plot: Flag to visualize time series splits
        - return_warning: Flag to return warning regarding one-class folds in the test set
        """
        # TimeSeriesSplit does arithmetic on the gap, so None must become 0.
        gap = 0 if gap is None else gap
        self.tss = TimeSeriesSplit(n_splits=n_splits, gap=gap, test_size=test_size)
        indices = self.tss.split(unique_periods)
        self.u_periods_cv = self.split_unique_periods(indices, unique_periods)
        self.all_periods = train_periods
        self.train_updated = train_updated
        self.drop_one_class_folds = drop_one_class_folds
        self.return_warning = return_warning
        # Number of folds that survive the one-class filter; this call also prints the fold messages.
        self.n_splits = self.split(X, y, return_n_splits=True)
        
        if plot:
            self.plot_time_series_splits(self.u_periods_cv)
    
    def split_unique_periods(self, indices, unique_periods):
        """
        Split unique periods into train/test sets based on TimeSeriesSplit indices.

        Parameters:
        - indices: TimeSeriesSplit indices (iterable of (train_index, test_index) pairs)
        - unique_periods: Pandas DataFrame or Series containing unique periods

        Returns:
        - List of tuples containing train and test periods
        """
        return [
            (unique_periods.iloc[train_index].values, unique_periods.iloc[test_index].values)
            for train_index, test_index in indices
        ]

    def _build_folds(self, y, verbose):
        """
        Build the (train positions, test positions) pairs of the folds that are kept.

        Parameters:
        - y: Target variable, or None to skip the one-class check
        - verbose: Whether check_classes_in_test may print its messages
        """
        folds = []
        for i, (train_periods, test_periods) in enumerate(self.u_periods_cv):
            max_period = max(test_periods.max(), self.MIN_UPDATED_PERIOD)
            updated_matches = (self.train_updated == max_period)
            train_selection = self.all_periods.isin(train_periods) & updated_matches
            test_selection = self.all_periods.isin(test_periods) & updated_matches

            # Positions (not index labels) of the selected rows, as expected by .iloc and scikit-learn.
            train_indices = np.flatnonzero(np.asarray(train_selection))
            test_indices = np.flatnonzero(np.asarray(test_selection))

            if self.check_classes_in_test(y, test_indices, test_periods, i, verbose):
                folds.append((train_indices, test_indices))
        return folds

    def split(self, X=None, y=None, groups=None, return_n_splits=False):
        """
        Generate train/test indices based on unique periods and drop folds if specified.

        Parameters:
        - X: Input features (unused; accepted for compatibility with hyper-parameter optimizers)
        - y: Target variable, used only for the one-class check
        - groups: Group labels for the samples (unused)
        - return_n_splits: Flag to return the number of splits; it also enables the fold messages

        Returns:
        - List of tuples containing train and test indices (integer row positions), or number of splits
        """
        self.all_indices = self._build_folds(y, verbose=return_n_splits)

        if return_n_splits:
            return len(self.all_indices)
        else:
            return self.all_indices

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Returns: Number of splits. When y is given the folds are recounted silently for that
        target, so the result always agrees with split(X, y); otherwise the count computed at
        initialization is returned.
        """
        if y is None:
            return self.n_splits
        return len(self._build_folds(y, verbose=False))

    def check_classes_in_test(self, y, test_indices, test_periods, i, return_n_splits):
        """
        Check for the existence of a single class in the test set and handle accordingly.

        A fold counts as one-class when its test targets sum to 0, i.e. contain no 1s.
        NOTE(review): a test set made up only of 1s is not detected by this check.

        Parameters:
        - y: Target variable (None skips the check)
        - test_indices: Integer row positions of the test set
        - test_periods: Periods in the test set
        - i: Fold index
        - return_n_splits: When False, nothing is printed

        Returns:
        - Boolean indicating whether to append indices or not
        """
        if y is None:
            # Without a target there is nothing to inspect, so the fold is kept.
            return True

        # Check for existence of 1 in the test set
        one_class = (pd.Series(np.asarray(y)[test_indices]).sum() == 0)
        if not one_class:
            return True

        if self.drop_one_class_folds:
            if return_n_splits:
                print(f'Fold {i} has only one class in the test set (period {test_periods}). Omitting fold {i}.')
            return False

        if return_n_splits and self.return_warning:
            print(f'Warning: Fold {i} has no 1s in the test set, so it cannot compute ROC AUC.\nThe period for this test set is {test_periods}')
        return True
            
    def plot_time_series_splits(self, split_output):
        """
        Visualize time series splits using a scatter plot (train periods in blue, test periods in red).

        Parameters:
        - split_output: List of (train periods, test periods) tuples, one per fold. The periods
          are converted to dates with the '%Y%m' format, so they must look like 201501.
        """
        folds = len(split_output)
        fig, ax = plt.subplots()
        
        def int_to_dt(an_array):
            # e.g. 201501 -> '201501' -> 2015-01-01
            return pd.to_datetime(an_array.astype(str), format='%Y%m')

        for i, (train_index, test_index) in enumerate(split_output):
            ax.scatter(int_to_dt(train_index), [i] * len(train_index), color='blue', marker='.', s=50)
            ax.scatter(int_to_dt(test_index), [i] * len(test_index), color='red', marker='.', s=50)

        ax.set_xlabel('Periods')
        ax.set_ylabel('Folds')
        ax.set_title('Cross-Validation Splits')
        ax.grid(True)
        ax.set_yticks(range(folds))  # Set the number of ticks on y-axis
        ax.set_yticklabels([f'{i}' for i in range(folds)])  # Set custom labels for y-axis
        plt.show()
