In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# NOTE: only gives realistic (and easy-to-visualize) close prices for a limited number of observations (the bound reads "obs <= 000" here; presumably 1000 -- TODO confirm)
def generate_test_data(obs, seed=1):
    """
    Build a synthetic daily close-price series from normally distributed returns.

    Reseeds NumPy's global random generator with `seed`, draws `obs` returns
    from N(0.002, 0.1) dated from 2015-01-02 onwards, compounds them into a
    price path and prepends a starting price of 1 on 2015-01-01.

    — obs: Number of random returns to draw (the result has obs + 1 rows).
    — seed: Seed passed to np.random.seed (this affects the global RNG state).

    Returns a DataFrame with a single 'Close' column, sorted by date.
    """
    np.random.seed(seed)
    dates = pd.date_range('2015-01-02', periods=obs)
    daily_returns = pd.DataFrame(np.random.normal(0.002, 0.1, obs), index=dates)
    prices = (daily_returns + 1).cumprod()
    # anchor the path: price 1 on the day before the first return
    prices.loc[pd.to_datetime('2015-01-01')] = 1
    prices = prices.sort_index().rename(columns={0: 'Close'})
    return prices

# Purging

One way to reduce leakage is to purge from the training set all observations whose labels overlap in time with the labels included in the testing set.

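A label $Y_i = f[[t_{i,0}, t_{i,1}]]$ (that is, a label determined by the observations in the closed interval $[t_{i,0}, t_{i,1}]$) overlaps with $Y_j$ if any of the following three sufficient conditions is met: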

1. $t_{j,0} \leq t_{i,0} \leq t_{j,1} $
2. $t_{j,0} \leq t_{i,1} \leq t_{j,1} $
3. $t_{i,0} \leq t_{j,0} \leq t_{j,1} \leq t_{i,1} $

In [None]:
def getTrainTimes(t1: pd.Series, testTimes):
    '''
    Purge from t1 every observation that overlaps any of the test intervals.
    — t1.index: Time when the observation started.
    — t1.value: Time when the observation ended.
    — testTimes: Series of test intervals (index: interval start, value: interval end).
    Returns a copy of t1 restricted to the non-overlapping (training) observations.
    '''
    remaining = t1.copy(deep=True)
    for test_start, test_end in testTimes.items():
        starts = remaining.index
        # observation starts inside the test interval
        starts_inside = starts[(starts >= test_start) & (starts <= test_end)]
        # observation ends inside the test interval
        ends_inside = remaining[(remaining >= test_start) & (remaining <= test_end)].index
        # observation spans the whole test interval
        envelops = remaining[(starts <= test_start) & (remaining >= test_end)].index
        overlapping = starts_inside.union(ends_inside).union(envelops)
        remaining = remaining.drop(overlapping)
    return remaining

Example Code

In [None]:
# Synthetic close-price frame built from 1000 random returns (default seed), used by the purging example below.
d = generate_test_data(1000)

In [None]:
days = 30  # label horizon: each label looks this many days ahead
obsStart = d.index[:-days]  # observations that still have a label `days` ahead
l = obsStart + pd.Timedelta(days=days)  # time at which each label is determined
t1 = pd.Series(l, index=obsStart)  # index: when observation started, value: when observation ended
testfrac = 0.2
testLen = int(d.shape[0]*testfrac)  # length of the test window, in days
# draw a random test start that leaves room for a full test window
randTestStart = np.random.choice(t1.index[:-testLen])
testPeriod = pd.Series({randTestStart: randTestStart + pd.Timedelta(days=testLen)})
trainTimes = getTrainTimes(t1, testPeriod)
# last surviving training observation that starts no later than the test start
trainEnd = max(trainTimes.index[trainTimes.index <= testPeriod.index[0]])

Plot

In [None]:
top = np.max(d.values)  # height of the vertical marker lines
fig = go.Figure()
fig.add_trace(go.Scatter(x=d.index, y=d.Close, name='All Samples'))

# dotted vertical markers with a caption on top: test start, test end, end of the left training block
for markerX, markerText in (
    (testPeriod.index[-1], 'Test Start'),
    (testPeriod.iloc[-1], 'Test End'),
    (trainEnd, 'Train End'),
):
    fig.add_shape(x0=markerX, x1=markerX, y0=0, y1=top, line=dict(color='black', width=1, dash='dot'))
    fig.add_annotation(x=markerX, y=top, text=markerText, showarrow=False, yshift=10)

# train set: observations that survived purging
fig.add_trace(
    go.Scatter(
        x=trainTimes.index,
        y=d.loc[trainTimes.index].Close,
        name='Train Samples',
        mode='markers',
        marker=dict(color='black', size=4),
    )
)

plotTitle = f'Train and Test Split with Purging (In this Example all labels are derived from future observations that lay {days} days ahead)'
fig.update_layout(
    title=plotTitle,
    xaxis_title='Date',
    yaxis_title='Close',
    width=1400,
    height=500,
    xaxis_rangeslider_visible=True,
    legend_title='Set',
    font=dict(size=12, color="Black"),  # font family left at the plotly default
)

fig.show()


# Purged K Fold Class

In [None]:
from sklearn.model_selection._split import KFold

In [None]:
class PurgedKFold(KFold):
    '''
    Extend KFold to work with labels that span intervals
    The train is purged of observations overlapping test-label intervals
    Test set is assumed contiguous (shuffle=False), w/o training examples in between

    — n_splits: Number of folds.
    — t1: pandas Series; index: time when the observation started,
          value: time when the observation (its label) ended.
    — pctEmbargo: Fraction of the number of observations that is skipped
          after each test set before training resumes. None is treated as 0.
    '''
    def __init__(self,n_splits=3,t1=None,pctEmbargo=0.):
        if not isinstance(t1,pd.Series):
            raise ValueError('Label Through Dates must be a pandas series')
        super(PurgedKFold,self).__init__(n_splits,shuffle=False,random_state=None)
        self.t1=t1
        self.pctEmbargo=pctEmbargo

    def split(self,X,y=None,groups=None):
        '''
        Yield (train_indices, test_indices) positional index arrays per fold.

        X must expose the same index as t1; y and groups are accepted for
        API compatibility and ignored.
        Raises ValueError when X and t1 do not share the same index.
        '''
        # check lengths first: comparing indexes of different lengths raises
        # a pandas error instead of the message below
        if X.shape[0]!=len(self.t1) or not (X.index==self.t1.index).all():
            raise ValueError('X and ThruDateValues must have the same index')
        nObs=X.shape[0]
        indices=np.arange(nObs)
        # an unset (None) embargo means "no embargo" rather than a TypeError
        pctEmbargo=0. if self.pctEmbargo is None else self.pctEmbargo
        mbrg=int(nObs*pctEmbargo)
        # [start, stop) positional bounds of each contiguous test fold
        test_ranges=[(fold[0],fold[-1]+1) for fold in np.array_split(indices,self.n_splits)]
        for i,j in test_ranges:
            t0=self.t1.index[i] # start of test set
            test_indices=indices[i:j]
            # test_indices are positions, so select with .iloc (plain [] on a
            # datetime-indexed Series would be interpreted as labels)
            maxT1=self.t1.iloc[test_indices].max()
            # first position whose start time is >= the latest test label end
            maxT1Idx=self.t1.index.searchsorted(maxT1)
            # left train: observations whose label ended no later than the test start
            train_indices=self.t1.index.searchsorted(self.t1[self.t1<=t0].index)
            # right train: everything from maxT1Idx on, after skipping the embargo
            # (the slice is simply empty when it starts past the end)
            train_indices=np.concatenate((train_indices,indices[maxT1Idx+mbrg:]))
            yield train_indices,test_indices

In [None]:
# Synthetic close-price frame (1000 random returns, default seed) used as X in the K-fold demonstration below.
X = generate_test_data(1000)

In [None]:
# Plain (unpurged) K-fold split of X with scikit-learn's default settings.
# KFold is imported from the public package path rather than the private `_split` module.
from sklearn.model_selection import KFold
cvGen = KFold()

# print the positional train/test indices of every fold
for i,(train,test) in enumerate(cvGen.split(X=X)):
    print(f'Fold {i} | Train: {train} | Test: {test}')

# Adjusted CV Score

In [None]:
def cvScore(clf,X,y,sample_weight,scoring='neg_log_loss',t1=None,cv=None,cvGen=None, pctEmbargo=None):
    '''
    Cross-validate clf fold by fold, using sample weights for both fitting and scoring.

    — clf: Classifier with fit / predict / predict_proba and a classes_ attribute.
    — X, y: Features and labels; indexed positionally with .iloc.
    — sample_weight: Weights aligned with X; indexed positionally with .iloc.
    — scoring: 'neg_log_loss' or 'accuracy'.
    — t1, cv, pctEmbargo: Used to build a PurgedKFold when cvGen is None
          (pctEmbargo=None means no embargo).
    — cvGen: Optional ready-made splitter exposing split(X=...).

    Returns a numpy array with one score per fold.
    Raises ValueError for an unsupported scoring name.
    '''
    if scoring not in ('neg_log_loss','accuracy'):
        raise ValueError('wrong scoring method.')

    from sklearn.metrics import log_loss,accuracy_score

    if cvGen is None:
        # PurgedKFold multiplies by the embargo fraction, so map the None default to 0
        embargo=0. if pctEmbargo is None else pctEmbargo
        cvGen=PurgedKFold(n_splits=cv,t1=t1,pctEmbargo=embargo) # purged
    score=[]
    for train,test in cvGen.split(X=X):
        fit=clf.fit(X=X.iloc[train,:],y=y.iloc[train],
                    sample_weight=sample_weight.iloc[train].values)
        test_weight=sample_weight.iloc[test].values
        if scoring=='neg_log_loss':
            prob=fit.predict_proba(X.iloc[test,:])
            # pass the fitted classes as labels so the probability columns are
            # matched against clf.classes_ rather than the classes seen in this fold
            score_=-log_loss(y.iloc[test],prob,sample_weight=test_weight,labels=clf.classes_)
        else:
            pred=fit.predict(X.iloc[test,:])
            score_=accuracy_score(y.iloc[test],pred,sample_weight=test_weight)
        score.append(score_)
    return np.array(score)