<a href="https://colab.research.google.com/github/2020147544/Advances_in_Financial_Engineering/blob/main/chapter3%2C4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def getDailyVol(close,span0=100):
    '''
    Daily volatility estimate, reindexed to close
    - used to set default profit taking and stop-loss limits

    Input:
    ` close: a pd Series of prices with a sorted datetime index
    ` span0: the span of the exponentially weighted window

    Output: a pd Series with the exponentially weighted standard deviation of returns
    ` each return compares a bar's price with the price of the last bar
      strictly earlier than (bar time - 1 day)
    ` bars without such an earlier bar are absent from the output
    '''
    # position of the first bar at or after (bar time - 1 day), for every bar
    df0=close.index.searchsorted(close.index-pd.Timedelta(days=1))
    # position 0 means there is no earlier bar to compare against
    df0=df0[df0>0]
    # map each remaining bar (the trailing ones) to the timestamp of its reference bar
    df0=pd.Series(close.index[df0-1], index=close.index[close.shape[0]-df0.shape[0]:])
    df0=close.loc[df0.index]/close.loc[df0.values].values-1 # daily returns
    df0=df0.ewm(span=span0).std()
    return df0


def applyPtSlOnT1(close,events,ptSl,molecule):
    '''
    Triple-barrier labeling method
    ` Apply stop loss/profit taking, if it takes place before t1 (end of event)

    Input:
    ` close: a pd Series of prices
    ` events: a Dataframe indexed by event start time, with columns
      - t1: the timestamp of the vertical barrier (NaN/NaT means "until the last bar of close")
      - trgt: the unit width of the horizontal barriers, expressed in terms of absolute returns
      - side: the position side that multiplies the path returns
    ` ptSl: a list of two non-negative float values
      - ptSl[0]: the factor that multiplies trgt to set the width of the upper barrier
      - ptSl[1]: the factor that multiplies trgt to set the width of the lower barrier
      - a factor that is not > 0 disables the corresponding barrier
    ` molecule: a list with the subset of event indices to process

    Output: a Dataframe indexed like events.loc[molecule] with columns [t1, sl, pt]
    ` t1: a copy of the input vertical barrier
    ` sl / pt: the earliest timestamp at which the stop-loss / profit-taking
      barrier was crossed, or NaT when it was never crossed
    '''
    events_=events.loc[molecule]
    out=events_[['t1']].copy(deep=True)
    if ptSl[0]>0:pt=ptSl[0]*events_['trgt']
    else:pt=pd.Series(index=events.index,dtype=float) # NaNs -> barrier never hit
    if ptSl[1]>0:sl=-ptSl[1]*events_['trgt']
    else:sl=pd.Series(index=events.index,dtype=float) # NaNs -> barrier never hit
    slHits,ptHits=[],[]
    # events without a vertical barrier run until the last available bar
    for loc,t1 in events_['t1'].fillna(close.index[-1]).items():
        df0=close.loc[loc:t1] # path prices
        df0=(df0/close.loc[loc]-1)*events_.at[loc,'side'] # path returns
        slHits.append(df0[df0<sl[loc]].index.min()) # earliest stop loss.
        ptHits.append(df0[df0>pt[loc]].index.min()) # earliest profit taking.
    # assign whole columns at once instead of enlarging the frame cell by cell
    out['sl']=pd.Series(slHits,index=out.index)
    out['pt']=pd.Series(ptHits,index=out.index)
    return out



def getEvents(close,tEvents,ptSl,trgt,minRet,numThreads,t1=False,side=None):
    '''
    Getting the time of first touch

    Input:
    ` close: a pd Series of prices, forwarded to applyPtSlOnT1
    ` tEvents: the pandas timeindex containing the timestamps that will seed every triple barrier
      - the timestamps selected by the sampling procedures
    ` ptSl: the barrier width factors
      - side is None: ptSl[0] is used for both horizontal barriers (symmetric)
      - side is given: ptSl[:2] is used as [profit taking, stop loss]
    ` trgt: a pd Series of targets, expressed in terms of absolute returns
    ` minRet: the minimum target return required for running a triple barrier search
    ` numThreads: the number of threads concurrently used by the function
    ` t1: a pd Series with the timestamps of the vertical barriers, or False to disable them
    ` side: an optional pd Series with the side of each bet

    Output:
    ` events: a Dataframe
      - events.index: event's starttime
      - events['t1']: event's endtime (earliest barrier touched)
      - events['trgt']: event's target
      - events['side'] (optional): the algo's position side
    '''
    #1) get target
    trgt=trgt.loc[tEvents]
    trgt=trgt[trgt>minRet] # minRet
    #2) get t1 (max holding period)
    if t1 is False:t1=pd.Series(pd.NaT,index=tEvents)
    #3) form events object, apply stop loss on t1
    if side is None:side_,ptSl_=pd.Series(1.,index=trgt.index),[ptSl[0],ptSl[0]]
    else:side_,ptSl_=side.loc[trgt.index],ptSl[:2]
    events=pd.concat({'t1':t1,'trgt':trgt,'side':side_}, axis=1).dropna(subset=['trgt'])
    # mpPandasObj is defined outside this section; presumably it applies func
    # over chunks of events.index and concatenates the results -- verify.
    # The prices come from the close argument rather than from a notebook global.
    df0=mpPandasObj(func=applyPtSlOnT1,pdObj=('molecule',events.index), numThreads=numThreads,close=close,events=events,ptSl=ptSl_)
    events['t1']=df0.dropna(how='all').min(axis=1) # pd.min ignores nan
    if side is None:events=events.drop('side',axis=1)
    return events



def getBins(events,close):
    '''
    Labeling for side and size

    Input:
    ` events: a Dataframe
      - events.index: event's starttime
      - events['t1']: event's endtime (rows with a missing t1 are dropped)
      - events['side'] (optional): the algo's position side
    ` close: a pd Series of prices

    Output: a Dataframe indexed by event starttime
    ` ret: the return realized at the time of the first touched barrier
      (multiplied by side when 'side' is in events)
    ` bin: the label as a function of the sign of the outcome
      - Case 1: ('side' not in events): bin in (-1,1) <- label by price action (standard labeling)
      - Case 2: ('side' in events): bin in (0,1), i.e. pass or bet <- label by pnl (meta-labeling)
    '''
    #1) prices aligned with events
    events_=events.dropna(subset=['t1'])
    px=events_.index.union(events_['t1'].values).drop_duplicates()
    px=close.reindex(px,method='bfill') # missing timestamps take the next available price
    #2) create out object
    out=pd.DataFrame(index=events_.index)
    out['ret']=px.loc[events_['t1'].values].values/px.loc[events_.index]-1
    if 'side' in events_:out['ret']*=events_['side'] # meta-labeling: return of the bet
    out['bin']=np.sign(out['ret'])
    if 'side' in events_:out.loc[out['ret']<=0,'bin']=0 # meta-labeling: pass on losing bets
    return out

  
def dropLabels(events,minPtc=.05):
    '''
    Drop labels with insufficient examples

    Input:
    ` events: a Dataframe with a 'bin' column holding the labels
    ` minPtc: the minimum fraction of observations a label must have to be kept

    Output: events without the rows of the dropped labels
    ` the rarest label is removed repeatedly until every remaining label
      exceeds minPtc or fewer than 3 distinct labels are left
    '''
    while True:
        df0=events['bin'].value_counts(normalize=True)
        if df0.min()>minPtc or df0.shape[0]<3:break
        rarest=df0.idxmin() # the label itself, not its position in df0
        print('dropped label',rarest,df0.min())
        events=events[events['bin']!=rarest]
    return events





def mpNumCoEvents(closeIdx,t1,molecule):
    '''
    Compute the number of concurrent events per bar.

    Input:
    ` closeIdx: the index of the price bars
    ` t1: a pd Series of event end times, indexed by event start time
      - missing end times are replaced by the last bar of closeIdx
    ` molecule[0]: the date of the first event on which the weight will be computed
    ` molecule[-1]: the date of the last event on which the weight will be computed
    Any event that starts before t1[molecule].max() impacts the count.

    Output: a pd Series with the number of events spanning each bar,
    restricted to the bars in [molecule[0], t1[molecule].max()]
    '''
    #1) find events that span the period [molecule[0],molecule[-1]]
    t1=t1.fillna(closeIdx[-1]) # unclosed events still must impact other weights
    t1=t1[t1>=molecule[0]] # events that end at or after molecule[0]
    t1=t1.loc[:t1[molecule].max()] # events that start at or before t1[molecule].max()
    #2) count events spanning a bar
    iloc=closeIdx.searchsorted(np.array([t1.index[0],t1.max()]))
    count=pd.Series(0,index=closeIdx[iloc[0]:iloc[1]+1])
    for tIn,tOut in t1.items():count.loc[tIn:tOut]+=1.
    return count.loc[molecule[0]:t1[molecule].max()]


def mpSampleTW(t1,numCoEvents,molecule):
    '''
    Derive average uniqueness over the event's lifespan

    Input:
    ` t1: a pd Series of event end times, indexed by event start time
    ` numCoEvents: a pd Series with the number of concurrent events per bar
    ` molecule: the event start times to process

    Output: a pd Series indexed by molecule with the mean of 1/numCoEvents
    over the bars between each event's start and end (both inclusive)
    '''
    wght=pd.Series(index=molecule,dtype=float)
    for tIn,tOut in t1.loc[wght.index].items():
        wght.loc[tIn]=(1./numCoEvents.loc[tIn:tOut]).mean()
    return wght


def getIndMatrix(barIx,t1):
    '''
    Get indicator matrix

    Input:
    ` barIx: the index of bars
    ` t1: a pd Series defined by
      - an index containing the time at which the feature are observed
      - a values array containing the time at which the label is determined

    Output: a binary matrix indicating what bars influence the label for each observation
    ` rows are the bars in barIx, columns are numbered 0..len(t1)-1 in the order of t1
    '''
    indM=pd.DataFrame(0,index=barIx,columns=range(t1.shape[0]))
    # distinct loop names so the t1 argument is not shadowed while iterating
    for i,(tIn,tOut) in enumerate(t1.items()):indM.loc[tIn:tOut,i]=1.
    return indM



def getAvgUniqueness(indM):
    '''
    Average uniqueness from indicator matrix

    Input:
    ` indM: the indicator matrix (rows: bars, columns: observations)

    Output: a pd Series with, for every column, the mean of its strictly
    positive uniqueness values
    '''
    c=indM.sum(axis=1) # concurrency
    u=indM.div(c,axis=0) # uniqueness
    avgU=u[u>0].mean() # average uniqueness (zero and undefined entries are masked out)
    return avgU


def seqBootstrap(indM,sLength=None):
    '''
    Generate a sample via sequential bootstrap

    Input:
    ` indM: the indicator matrix (rows: bars, columns: observations)
    ` sLength: an optional sample length with a default value of as many draws as columns in indM

    Output: the list of indM columns sampled by sequential bootstrap (with replacement)
    '''
    if sLength is None:sLength=indM.shape[1]
    phi=[]
    while len(phi)<sLength:
        # average uniqueness each candidate would have if appended to the current sample
        avgU=[]
        for i in indM:
            indM_=indM[phi+[i]] # reduce indM
            avgU.append(getAvgUniqueness(indM_).iloc[-1])
        avgU=pd.Series(avgU,index=indM.columns,dtype=float)
        prob=avgU/avgU.sum() # draw prob
        phi+=[np.random.choice(indM.columns,p=prob.values)]
    return phi



def mpSampleW(t1,numCoEvents,close,molecule):
    '''
    Derive sample weight by return attribution

    Input:
    ` t1: a pd Series of event end times, indexed by event start time
    ` numCoEvents: a pd Series with the number of concurrent events per bar
    ` close: a pd Series of prices
    ` molecule: the event start times to process

    Output: a pd Series indexed by molecule with the absolute value of the sum,
    over each event's lifespan, of the log-returns divided by the concurrency
    '''
    ret=np.log(close).diff() # log-returns, so that they are additive
    wght=pd.Series(index=molecule,dtype=float)
    for tIn,tOut in t1.loc[wght.index].items():
        wght.loc[tIn]=(ret.loc[tIn:tOut]/numCoEvents.loc[tIn:tOut]).sum()
    return wght.abs()


def getTimeDecay(tW,clfLastW=1.):
    '''
    Apply piecewise-linear decay to observed uniqueness (tW)

    Input:
    ` tW: a pd Series of observed uniqueness values
    ` clfLastW: controls the weight of the oldest observation
      - clfLastW>=0: weights decay linearly down to a floor near clfLastW
      - clfLastW<0: the oldest observations receive zero weight

    Output: a pd Series of decay factors, sorted by index
    ` newest observation gets weight=1
    ` the fitted constant and slope are printed as a side effect
    '''
    clfW=tW.sort_index().cumsum()
    if clfLastW>=0:slope=(1.-clfLastW)/clfW.iloc[-1]
    else:slope=1./((clfLastW+1)*clfW.iloc[-1])
    const=1.-slope*clfW.iloc[-1]
    clfW=const+slope*clfW
    clfW[clfW<0]=0 # negative weights are floored at zero
    print(const,slope)
    return clfW