In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
def generate_test_data(obs, seed=1):
    """Simulate a synthetic close-price path for the examples below.

    Draws `obs` normally distributed returns (mean 0.002, std 0.1) after
    seeding NumPy's global RNG with `seed`, compounds them into a price
    path on a daily index starting 2015-01-02, and prepends a starting
    price of 1 dated 2015-01-01.

    Returns a single-column DataFrame named 'Close' with `obs + 1` rows.
    """
    np.random.seed(seed)
    dates = pd.date_range('2015-01-02', periods=obs)
    shocks = np.random.normal(0.002, 0.1, obs)
    close = pd.DataFrame(shocks, index=dates).add(1).cumprod()
    # Anchor the path at 1 on the day before the first simulated return.
    close.loc[pd.to_datetime('2015-01-01')] = 1
    return close.sort_index().rename(columns={0: 'Close'})

# Structural Break Tests

## Supremum Augmented Dickey-Fuller (SADF) (Explosiveness Test)

The test fits the regression specification $\Delta y_t = \alpha + \beta y_{t-1} + \sum^L_{l=1} \gamma_l \Delta y_{t-l} + \epsilon_t$

where we test for

$H_0: \beta \leq 0 $

$H_1: \beta > 0$

The Supremum Augmented Dickey-Fuller (SADF) test fits this regression at each end point $t$ with backward-expanding start points $t_0$, then computes

$$\text{SADF}_t = \sup_{t_0 \in [1, t-\tau]} \{ADF_{t_0,t}\} = \sup_{t_0 \in [1, t-\tau]} \left\{ \frac{\hat{\beta}_{t_0,t}}{\hat{\sigma}_{\beta_{t_0,t}}} \right\}$$

In [None]:
def getBetas(y,x):
    """Ordinary least squares fit of y on x.

    Returns a tuple (bMean, bVar): the coefficient estimates
    (x'x)^-1 x'y and their covariance estimate, i.e. the residual sum of
    squares divided by (rows of x - columns of x), times (x'x)^-1.
    np.linalg.inv raises LinAlgError when x'x is singular.
    """
    target = np.asarray(y)
    design = np.asarray(x)
    nObs, nCoef = design.shape

    gramInv = np.linalg.inv(design.T @ design)
    bMean = gramInv @ (design.T @ target)

    resid = target - design @ bMean
    # Residual variance with a degrees-of-freedom correction.
    sigma2 = (resid.T @ resid) / (nObs - nCoef)
    bVar = sigma2 * gramInv
    return bMean, bVar

In [None]:
def lagDF(df0,lags):
    """Build a frame of lagged copies of df0, joined column-wise.

    Parameters
    ----------
    df0 : DataFrame to lag.
    lags : an integer L (Python int or NumPy integer scalar), meaning lags
        0..L inclusive, or an iterable of individual lags (each coerced
        with int()).

    Returns
    -------
    DataFrame whose columns are named '<original column>_<lag>', one group
    per lag in the order given. Rows introduced by shifting contain NaN.
    """
    # NumPy integer scalars (e.g. values taken from an array or a frame) are
    # not instances of int; without this they would fall through to the
    # iterable branch and raise TypeError.
    if isinstance(lags, (int, np.integer)):
        lags = range(int(lags) + 1)
    else:
        lags = [int(lag) for lag in lags]

    df1 = pd.DataFrame()
    for lag in lags:
        df_ = df0.shift(lag).copy(deep=True)
        df_.columns = [f'{col}_{lag}' for col in df_.columns]
        df1 = df1.join(df_, how='outer')
    return df1

In [None]:
def getYX(series,constant,lags):
    """Assemble the target and regressors for an ADF-style regression.

    Parameters
    ----------
    series : 2-D pandas object (its first column is read via
        series.values[..., 0]).
    constant : 'nc' for no deterministic terms; any other value appends a
        column of ones; values starting with 'ct' also append a linear
        trend; 'ctt' additionally appends the squared trend.
    lags : passed through to lagDF.

    Returns
    -------
    (y, x) where y holds the first differences aligned with the rows of x,
    and x holds the lagged differences from lagDF with its first column
    overwritten by the lagged level. x is a DataFrame when constant is
    'nc' and a NumPy array otherwise (np.append converts it).
    """
    diffs = series.diff().dropna()
    x = lagDF(diffs, lags).dropna()
    nRows = x.shape[0]

    # Replace the first lagDF column with the level one step before each row.
    x.iloc[:, 0] = series.values[-nRows - 1:-1, 0]
    y = diffs.iloc[-nRows:].values

    if constant == 'nc':
        return y, x

    x = np.append(x, np.ones((nRows, 1)), axis=1)
    if constant[:2] == 'ct':
        trend = np.arange(nRows).reshape(-1, 1)
        x = np.append(x, trend, axis=1)
    if constant == 'ctt':
        x = np.append(x, trend**2, axis=1)
    return y, x

In [None]:
# inner loop

def get_bsadf(logP, minSL, constant, lags):
    """Backward-expanding supremum ADF statistic at the last row of logP.

    Builds the regression data once with getYX, then refits it from every
    start offset in range(len(y) + lags - minSL + 1) through the final
    observation and keeps the largest t-statistic of the first regressor.
    `lags` must be an integer here because it enters that arithmetic.

    Returns a dict with 'Time' (the last index label of logP) and 'gsadf'
    (the maximum statistic, or -inf when there is no start offset to try).
    A NaN statistic never replaces the running maximum.
    """
    y, x = getYX(logP, constant=constant, lags=lags)
    lastStart = y.shape[0] + lags - minSL

    bsadf = -np.inf
    for start in range(lastStart + 1):
        coefs, cov = getBetas(y[start:], x[start:])
        # t-statistic of the first regressor: estimate over its std. error.
        adf = coefs[0, 0] / cov[0, 0]**.5
        if adf > bsadf:
            bsadf = adf

    return {'Time': logP.index[-1], 'gsadf': bsadf}

In [None]:
# Simulate prices and compute the SADF statistic at every end point.
data = generate_test_data(1000)
logP = np.log(data)

# Minimum sample length, deterministic-term specification ('nc' = none),
# and number of lagged differences used in each ADF regression.
minSL,constant,lags = 10, 'nc', 3

# Collect one record per end point and build the frame once at the end:
# growing a DataFrame row by row re-allocates on every insert and leaves
# the columns with object dtype.
records = []
for t in logP.index:
    out = get_bsadf(logP.loc[:t],minSL,constant,lags)
    records.append(out)

# End points with too few observations have no regression window, so
# get_bsadf reports -inf for them.
sadf = pd.DataFrame(records, columns=['Time','gsadf'], index=logP.index)

In [None]:
# Show the last rows of the SADF results table.
sadf.tail()

In [None]:
# Overlay the SADF statistic (right axis) on the simulated close price (left axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=sadf['Time'], y=sadf['gsadf'], mode='lines', name='sadf'), secondary_y=True)
fig.add_trace(go.Scatter(x=data.index, y=data.values.flatten(), mode='lines', name='close'), secondary_y=False)

fig.update_layout(title='SADF and Close Price over Time', xaxis_title='Time')
fig.update_yaxes(title_text='Close', secondary_y=False)
fig.update_yaxes(title_text='SADF', secondary_y=True)

fig.show()

## The symmetric CUSUM Filter (Event-Based Sampling)

CUSUM tests check whether the cumulative forecasting errors deviate significantly from white noise.

The CUSUM Filter can be applied in the context of event-based sampling of bars.

IID observations $\{y_t\}_{t=1,...,T}$

Define the cumulative sums $S_t = \max\{0, S_{t-1} + y_t - E_{t-1}[y_t]\}$, with boundary condition $S_0 = 0$.

This zero floor means that we skip some downward deviations that would otherwise make $S_t$ negative. The reason is that the filter is set up to identify a sequence of upside divergences from a reset level of zero. In particular, the threshold is activated when:

$S_t \geq h \Leftrightarrow \exists \tau \in [1, t] \text{ such that } \sum^t_{i=\tau}(y_i - E_{i-1}[y_i]) \geq h$

This concept of run-ups can be extended to include run-downs, giving us a symmetric CUSUM filter:

$S^+_t = \max\{0, S^+_{t-1} + y_t - E_{t-1}[y_t]\}, \quad S^+_0 = 0$

$S^-_t = \min\{0, S^-_{t-1} + y_t - E_{t-1}[y_t]\}, \quad S^-_0 = 0$


Variable $S_t$ could be based on any of the features, such as structural break statistics, entropy, or market microstructure measurements. For example, we could declare an event whenever SADF (Supremum Augmented Dickey-Fuller) departs sufficiently from a previous reset level. Once we have obtained this subset of event-driven bars, we let the ML algorithm determine whether the occurrence of such events constitutes actionable intelligence.

In [None]:
def getTEvents(gRaw,h):
    tEvents, sPos, sNeg = [], 0, 0
    diff = gRaw.diff()
    for i in diff.index[1:]:
        sPos, sNeg = max(0, sPos+diff.loc[i]), min(0, sNeg+diff.loc[i])
        if sNeg < -h:
            sNeg = 0
            tEvents.append(i)
        elif sPos > h:
            sPos=0
            tEvents.append(i)
    return pd.DatetimeIndex(tEvents)

In [None]:

# Run the CUSUM filter on the SADF series with threshold h=1.
tEvents = getTEvents(sadf['gsadf'], 1)

In [None]:
# Plot the SADF series and mark the dates flagged by the CUSUM filter.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Select rows and column in a single .loc call instead of chained indexing.
event_sadf = sadf.loc[tEvents, 'gsadf']

fig.add_trace(go.Scatter(x=sadf['Time'], y=sadf['gsadf'], mode='lines', name='sadf'))

fig.add_trace(go.Scatter(x=tEvents, y=event_sadf, mode='markers', name='tEvents'))
fig.add_trace(go.Scatter(x=tEvents, y=event_sadf, name='tEvents Line'))

fig.update_layout(title='CUSUM Filter on SADF', xaxis_title='Time', yaxis_title='SADF')
fig.show()