In [None]:
import pandas as pd
import numpy as np

# Sample Weights

In [None]:
def mpNumCoEvents(closeIdx,t1,molecule):
    '''
    Count, for every bar, how many events are open at the same time.
    +closeIdx is the index of the price bars (assumed sorted, since it is searched with searchsorted)
    +t1 maps each event start (index) to its end (value); missing ends are treated as the last bar
    +molecule holds the event start dates handled by this call (molecule[0] first, molecule[-1] last)
    Returns a float series of counts for the bars from molecule[0] through t1[molecule].max().
    '''
    # Open-ended events are extended to the final bar so they still count against overlapping events.
    ends = t1.fillna(closeIdx[-1])
    # Keep only events that finish at or after the first event of this molecule...
    ends = ends[ends >= molecule[0]]
    # ...and that start no later than the latest end among the molecule's own events.
    ends = ends.loc[:ends[molecule].max()]

    # One zeroed counter per bar between the earliest retained start and the latest retained end.
    bounds = np.array([ends.index[0], ends.max()])
    first, last = closeIdx.searchsorted(bounds)
    count = pd.Series(0, index=closeIdx[first:last+1], dtype='float64')

    # Every event adds one to each bar of its [start, end] lifespan (label slicing is inclusive).
    for start, stop in ends.items():
        count.loc[start:stop] += 1.

    return count.loc[molecule[0]:ends[molecule].max()]

In [None]:
def mpSampleTW(t1,numCoEvents,molecule):
    '''
    Average uniqueness of each event in molecule over the event's lifespan.
    +t1 maps each event start (index) to its end (value)
    +numCoEvents gives the number of concurrent events per bar
    +molecule holds the event start dates to compute weights for
    Returns a float series indexed by molecule; each value is the mean of
    1/numCoEvents over the bars from the event's start to its end (inclusive).
    '''
    wght = pd.Series(index=molecule, dtype='float64')
    lifespans = t1.loc[wght.index]
    for start, end in lifespans.items():
        concurrency = numCoEvents.loc[start:end]
        wght.loc[start] = (1./concurrency).mean()
    return wght


## Example Code for Average Uniqueness

In [None]:
import plotly.graph_objects as go
import plotly.express as px

In [None]:
def generate_test_data(obs, seed=1):
    '''
    Build a synthetic close-price series for the examples.
    +obs is the number of random daily returns to draw
    +seed is passed to np.random.seed, so NumPy's global random state is reset
    Returns a series of obs+1 prices: the value 1 on 2015-01-01 followed by the
    cumulative product of (1 + return) on consecutive days from 2015-01-02.
    '''
    np.random.seed(seed)
    shocks = np.random.normal(0.002, 0.1, obs)  # returns drawn with mean 0.002 and std 0.1
    dates = pd.date_range('2015-01-02', periods=obs)
    close = (pd.Series(shocks, index=dates) + 1).cumprod()
    # Add the starting price one day before the first return, then restore chronological order.
    close.loc[pd.to_datetime('2015-01-01')] = 1
    return close.sort_index()

In [None]:
def getDailyVol(close,span0=100):
    '''
    Exponentially weighted volatility of look-back returns.
    +close is a price series with a datetime index
    +span0 is the span of the exponentially weighted standard deviation
    Returns the ewm std of the returns, indexed by the bars that have a reference bar
    (the earliest bars are dropped; the first remaining value is NaN).
    '''
    # For each bar, position of the first bar at or after (bar time - 1 day).
    pos = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    pos = pos[pos > 0]  # bars whose look-back position is 0 have no earlier bar to compare against

    # Pair the trailing bars with the bar located just before their look-back position.
    # NOTE(review): with evenly spaced daily bars this reference is two bars back - confirm this is intended.
    laterBars = close.index[close.shape[0] - pos.shape[0]:]
    prevBars = pd.Series(close.index[pos - 1], index=laterBars)

    rets = close.loc[prevBars.index] / close.loc[prevBars.values].values - 1
    return rets.ewm(span=span0).std()

In [None]:
def applyPtSlOnT1(close, events, ptSl, molecule):
    '''
    Find, for each event in molecule, the first time the price path touches the
    stop-loss and profit-taking levels before the event's end.
    +close is the price series
    +events is a frame with columns 't1' (event end), 'trgt' (barrier unit width) and 'side'
    +ptSl is [profit-taking multiple, stop-loss multiple]; a non-positive multiple disables that barrier
    +molecule holds the event start dates handled by this call
    Returns a frame indexed by molecule with 't1', 'sl' and 'pt'; 'sl'/'pt' hold the
    earliest touch time, or NaT when the level is never crossed.
    '''
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)

    # Upper barrier: multiple of the target, or all-NaN so the comparison never fires.
    if ptSl[0] > 0:
        pt = ptSl[0] * events_['trgt']
    else:
        pt = pd.Series(index=events.index, dtype=float) # NaNs

    # Lower barrier: negative multiple of the target, or all-NaN when disabled.
    if ptSl[1] > 0:
        sl = -ptSl[1] * events_['trgt']
    else:
        sl = pd.Series(index=events.index, dtype=float) # NaNs

    # Events without an end date are evaluated up to the last available price.
    horizons = events_['t1'].fillna(close.index[-1])
    for loc, t1 in horizons.items():
        path = close[loc:t1] # path prices
        # Path returns relative to the entry price, signed by the side of the bet
        # (the original author notes this scaling needs to be omitted for interest rates).
        pathRets = (path / close[loc] - 1) * events_.at[loc, 'side']

        out.loc[loc, 'sl'] = pathRets[pathRets < sl[loc]].index.min() # earliest stop loss
        out.loc[loc, 'pt'] = pathRets[pathRets > pt[loc]].index.min() # earliest profit taking

    return out

In [None]:
from mpcode import mpPandasObj

def getEvents(close: pd.Series, tEvents: pd.Series, ptSl, trgt: pd.Series, minRet: float, numThreads: int, t1=False):
    '''
    Build the events table and set each event's end to its first barrier touch.
    +close is the price series
    +tEvents holds the event start timestamps (used to select from trgt and to index t1)
    +ptSl is forwarded unchanged to applyPtSlOnT1 as the [profit-taking, stop-loss] multiples
    +trgt is the target series; only values above minRet are kept
    +numThreads is forwarded to mpPandasObj
    +t1 holds optional event end dates; False means no end date (NaT for every event)
    Returns a frame indexed by event start with columns 't1' (earliest of the end date,
    stop loss and profit taking) and 'trgt'.
    '''
    # 1) targets at the event times, keeping only those above the minimum return
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]

    # 2) event end dates (max holding period); default is no end date at all
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) events object; side is arbitrarily set to 1 since it is not needed to learn the side
    side_ = pd.Series(1., index=trgt.index)
    columns = {'t1': t1, 'trgt': trgt, 'side': side_}
    events = pd.concat(columns, axis=1).dropna(subset=['trgt'])

    # ptSl is passed as given; the original author noted that passing [ptSl, ptSl] does not work.
    touches = mpPandasObj(
        func=applyPtSlOnT1,
        pdObj=('molecule', events.index),
        numThreads=numThreads,
        close=close,
        events=events,
        ptSl=ptSl,
    )

    # Earliest timestamp across the returned columns for each event (min skips missing values).
    events['t1'] = touches.dropna(how='all').min(axis=1)
    return events.drop('side', axis=1)

In [None]:
# Synthetic prices -> volatility targets -> events with symmetric barriers (one target wide each side).
close = generate_test_data(obs=1000)
trgt = getDailyVol(close=close)
events = getEvents(
    close=close,
    tEvents=trgt.index,
    ptSl=[1,1],
    trgt=trgt,
    minRet=0.00,
    numThreads=16,
    t1=False,
)

In [None]:
from mpcode import mpPandasObj

# Number of concurrent events per bar, computed over the event start dates.
numCoEvents = mpPandasObj(
    mpNumCoEvents,
    ('molecule', events.index),
    numThreads=12,
    closeIdx=close.index,
    t1=events['t1'],
)
# Keep the last entry for any repeated bar, then align to every price bar (bars with no events get 0).
numCoEvents = (
    numCoEvents
    .loc[~numCoEvents.index.duplicated(keep='last')]
    .reindex(close.index)
    .fillna(0)
)

# Average uniqueness per event, stored in column 'tW'.
out = pd.DataFrame()
out['tW'] = mpPandasObj(
    mpSampleTW,
    ('molecule', events.index),
    numThreads=12,
    t1=events['t1'],
    numCoEvents=numCoEvents,
)

In [None]:
# Preview the first rows of the per-event average uniqueness ('tW') table.
out.head()

In [None]:
# Mean of the per-event average uniqueness across all events.
# The author intends this value as the maxSamples setting for classifiers (that usage is not shown here).
avgU = out['tW'].mean()
avgU

In [None]:
from plotly.subplots import make_subplots

# Dual-axis chart: uniqueness bars on the primary (left) y-axis, close price on the secondary (right) y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=close.index, y=close, mode='lines', name='close'), secondary_y=True)
fig.add_trace(go.Bar(x=out.index, y=out['tW'], name='avg label uniqueness'), secondary_y=False)
fig.update_layout(title='Average Label Uniqueness', xaxis_title='Date')
# Title each y-axis after the trace it carries; the price line lives on the secondary axis.
fig.update_yaxes(title_text='Avg Label Uniqueness', secondary_y=False)
fig.update_yaxes(title_text='Price', secondary_y=True)
fig.show()


In [None]:
# Distribution of the per-event average uniqueness values.
fig = px.histogram(
    out,
    x="tW",
    nbins=100,
    title='Histogram of Uniqueness Values',
)
fig.update_layout(
    xaxis_title='Avg Label Uniqueness',
    yaxis_title='Count',
    width=800,
    height=500,
)
fig.show()


## Remarks on the method

Computing the average uniqueness associated with label $i$, $\overline{u}_i$, requires information that is not available until a future time, `events['t1']`. This is not a problem, because $\{\overline{u}_i\}_{i=1,\ldots,I}$ are used on the training set in combination with label information, and not on the testing set. These $\{\overline{u}_i\}_{i=1,\ldots,I}$ are not used for forecasting the label, hence there is no information leakage. This procedure allows us to assign a uniqueness score between 0 and 1 to each observed feature, in terms of non-overlapping outcomes.