## Ipython Notebooks For Triple-Bar

Add the package path for testing

In [1]:
import sys
sys.path.append('E:\\git_folder\\quant_models')

In [2]:
import pandas as pd
from util.utils import get_daily_vol, sample_df
data_fn = "././data/xu1_1min.csv"
df_price = pd.read_csv(data_fn)
test_df = df_price.copy()
test_df.loc[:,'datetime'] = pd.to_datetime(test_df.datetime)
test_df.set_index('datetime', inplace=True)
df_testprice = test_df.loc[test_df.index > pd.to_datetime('2019-10-01')]
df_sampled = sample_df(df_testprice, '10T')

## Start Testing

In [3]:
df_sampled.tail(2)

Unnamed: 0_level_0,open,close,low,high,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-12-18 13:50:00,14295.0,14265.0,14265.0,14302.5,3888.0
2019-12-18 14:00:00,14262.5,14247.5,14245.0,14262.5,3194.0


In [4]:
timedelta = pd.Timedelta('2 hours')
nearest_index = df_sampled.index.searchsorted(df_sampled.index + timedelta)

In [5]:
nearest_index

array([  12,   13,   14, ..., 6736, 6736, 6736], dtype=int64)

In [6]:
def add_vertical_barrier(t_events, close, num_days=0, num_hours=0, num_minutes=0, num_seconds=0):
    """
    From AFML, Try to add a Vertical Barrier

    For each index in t_events, it finds the timestamp of the next price bar at or immediately after
    a number of days num_days. This vertical barrier can be passed as an optional argument t1 in get_events.

    This function creates a series that has all the timestamps of when the vertical barrier would be reached.

    :args
    1. t_events: DatetimeIndex
    2. df_price: the price dataframe which has the columns
        datetime as the index
        another column is can be price
    3. Time diff parameters:
        num_days: (int) number of days to add for vertical barrier
        num_hours: (int) number of hours to add for vertical barrier
        num_minutes: (int) number of minutes to add for vertical barrier
        num_seconds: (int) number of seconds to add for vertical barrier'

    :return:
    (series) timestamps of vertical barriers
    """
    timedelta = pd.Timedelta(
        '{} days, {} hours, {} minutes, {} seconds'.format(num_days, num_hours, num_minutes, num_seconds))
    # Find index to closest to vertical barrier
    nearest_index = close.index.searchsorted(t_events + timedelta)

    # Exclude indexes which are outside the range of close price index
    nearest_index = nearest_index[nearest_index < close.shape[0]]

    # Find price index closest to vertical barrier time stamp
    nearest_timestamp = close.index[nearest_index]
    filtered_events = t_events[:nearest_index.shape[0]]

    vertical_barriers = pd.Series(data=nearest_timestamp, index=filtered_events)
    return vertical_barriers

In [7]:
vertical_barrier_times = add_vertical_barrier(df_sampled.index, df_sampled, num_hours=1)

In [8]:
vertical_barrier_times.tail()

datetime
2019-12-18 12:20:00   2019-12-18 13:20:00
2019-12-18 12:30:00   2019-12-18 13:30:00
2019-12-18 12:40:00   2019-12-18 13:40:00
2019-12-18 12:50:00   2019-12-18 13:50:00
2019-12-18 13:00:00   2019-12-18 14:00:00
Name: datetime, dtype: datetime64[ns]

In [9]:
target_vert = get_daily_vol(df_sampled)

In [10]:
target = target_vert['vol']
t_events = df_sampled.index

In [11]:
target = target.loc[t_events]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


In [12]:
target_vert.head()

Unnamed: 0_level_0,vol
datetime,Unnamed: 1_level_1
2019-10-02 00:20:00,0.00013
2019-10-02 00:30:00,0.000184
2019-10-02 00:40:00,0.000489
2019-10-02 00:50:00,0.000426
2019-10-02 01:00:00,0.000394


In [13]:
targetv1 = target.reindex(t_events)

In [14]:
targetv1.dropna(inplace=True)

In [15]:
target = targetv1.copy()

In [16]:
target.head()

datetime
2019-10-02 00:20:00    0.000130
2019-10-02 00:30:00    0.000184
2019-10-02 00:40:00    0.000489
2019-10-02 00:50:00    0.000426
2019-10-02 01:00:00    0.000394
Name: vol, dtype: float64

In [17]:
side_ = pd.Series(1.0, index=target.index)

In [18]:
pt_sl_ = [2, 2]

In [87]:
events = pd.concat({'ent': vertical_barrier_times, 'trgt': target, 'side': side_}, axis=1)

In [88]:
events = events.dropna(subset=['side'])

In [89]:
events = events.fillna(value={'trgt': 0.001})

In [90]:
events = events.dropna(subset=['trgt'])

In [91]:
events.head()


Unnamed: 0_level_0,ent,side,trgt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-02 00:20:00,2019-10-02 01:20:00,1.0,0.00013
2019-10-02 00:30:00,2019-10-02 01:30:00,1.0,0.000184
2019-10-02 00:40:00,2019-10-02 01:40:00,1.0,0.000489
2019-10-02 00:50:00,2019-10-02 01:50:00,1.0,0.000426
2019-10-02 01:00:00,2019-10-02 02:00:00,1.0,0.000394


In [80]:
# Snippet 3.1, page 44, Daily Volatility Estimates
from util.multiprocess import mp_pandas_obj, drop_labels

ImportError: cannot import name 'drop_labels' from 'util.multiprocess' (E:\git_folder\quant_models\util\multiprocess.py)

In [25]:
# Snippet 3.2, page 45, Triple Barrier Labeling Method
def apply_pt_sl_on_ent(close, events, pt_sl, molecule):
    """
    Snippet 3.2, page 45, Triple Barrier Labeling Method

    This function applies the triple-barrier labeling method. It works on a set of
    datetime index values (molecule). This allows the program to parallelize the processing.

    Mainly it returns a DataFrame of timestamps regarding the time when the first barriers were reached.

    :param
    1. close: A pandas series of prices
    2. events: dataframe with two columns: ent: the timestamp of vertical barrier, when the value is np.nan, then no vertical bar
                                           trgt: the unit width of the horizontal barriers.
    3. pts1: pts1[0]*trgt is the
    4. molecule: a list with the subset of event indcies that will be processed by a single thread.
    :param pt_sl: (array) element 0, indicates the profit taking level; element 1 is stop loss level
    :param molecule: (an array) a set of datetime index values for processing
    :return: DataFrame of timestamps of when first barrier was touched
    """
    # Apply stop loss/profit taking, if it takes place before ent (end of event)
    events_ = events.loc[molecule]
    out     = events_[['ent']].copy(deep=True)

    profit_taking_multiple = pt_sl[0]
    stop_loss_multiple     = pt_sl[1]

    # Profit taking active
    if profit_taking_multiple > 0:
        profit_taking = profit_taking_multiple * events_['trgt']
    else:
        profit_taking = pd.Series(index=events.index)  # NaNs

    # Stop loss active
    if stop_loss_multiple > 0:
        stop_loss = -stop_loss_multiple * events_['trgt']
    else:
        stop_loss = pd.Series(index=events.index)  # NaNs

    # Get events
    for loc, vertical_barrier in events_['ent'].fillna(close.index[-1]).iteritems():
        closing_prices = close[loc: vertical_barrier]  # Path prices for a given trade
        cum_returns = (closing_prices / close[loc] - 1) * events_.at[loc, 'side']  # Path returns
        out.loc[loc, 'sl'] = cum_returns[cum_returns < stop_loss[loc]].index.min()  # Earliest stop loss date
        out.loc[loc, 'pt'] = cum_returns[cum_returns > profit_taking[loc]].index.min()  # Earliest profit taking date
    return out

In [26]:
num_threads = 2

In [27]:
events.head()

Unnamed: 0_level_0,ent,side,trgt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-02 00:20:00,2019-10-02 01:20:00,1.0,0.00013
2019-10-02 00:30:00,2019-10-02 01:30:00,1.0,0.000184
2019-10-02 00:40:00,2019-10-02 01:40:00,1.0,0.000489
2019-10-02 00:50:00,2019-10-02 01:50:00,1.0,0.000426
2019-10-02 01:00:00,2019-10-02 02:00:00,1.0,0.000394


In [92]:
first_touch_dates = apply_pt_sl_on_ent(df_sampled.open, events, pt_sl_, events.index)

In [43]:
first_touch_dates.head(20)

Unnamed: 0_level_0,ent,sl,pt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-02 00:20:00,2019-10-02 01:20:00,2019-10-02 00:40:00,2019-10-02 00:50:00
2019-10-02 00:30:00,2019-10-02 01:30:00,2019-10-02 00:40:00,2019-10-02 00:50:00
2019-10-02 00:40:00,2019-10-02 01:40:00,NaT,2019-10-02 00:50:00
2019-10-02 00:50:00,2019-10-02 01:50:00,2019-10-02 01:10:00,NaT
2019-10-02 01:00:00,2019-10-02 02:00:00,2019-10-02 01:10:00,NaT
2019-10-02 01:10:00,2019-10-02 02:10:00,NaT,NaT
2019-10-02 01:20:00,2019-10-02 02:20:00,NaT,NaT
2019-10-02 01:30:00,2019-10-02 02:30:00,2019-10-02 02:10:00,NaT
2019-10-02 01:40:00,2019-10-02 02:40:00,2019-10-02 02:10:00,NaT
2019-10-02 01:50:00,2019-10-02 02:50:00,2019-10-02 02:10:00,NaT


In [44]:
events.head(20)

Unnamed: 0_level_0,ent,trgt,endt,pt,sl
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-02 00:20:00,2019-10-02 01:20:00,0.00013,2019-10-02 00:40:00,2,2
2019-10-02 00:30:00,2019-10-02 01:30:00,0.000184,2019-10-02 00:40:00,2,2
2019-10-02 00:40:00,2019-10-02 01:40:00,0.000489,2019-10-02 00:50:00,2,2
2019-10-02 00:50:00,2019-10-02 01:50:00,0.000426,2019-10-02 01:10:00,2,2
2019-10-02 01:00:00,2019-10-02 02:00:00,0.000394,2019-10-02 01:10:00,2,2
2019-10-02 01:10:00,2019-10-02 02:10:00,0.000466,2019-10-02 02:10:00,2,2
2019-10-02 01:20:00,2019-10-02 02:20:00,0.000431,2019-10-02 02:20:00,2,2
2019-10-02 01:30:00,2019-10-02 02:30:00,0.000412,2019-10-02 02:10:00,2,2
2019-10-02 01:40:00,2019-10-02 02:40:00,0.0004,2019-10-02 02:10:00,2,2
2019-10-02 01:50:00,2019-10-02 02:50:00,0.000428,2019-10-02 02:10:00,2,2


In [54]:
gg = first_touch_dates.dropna(how='all').min(axis=1)

In [55]:
gg.head()

datetime
2019-10-02 00:20:00   2019-10-02 00:40:00
2019-10-02 00:30:00   2019-10-02 00:40:00
2019-10-02 00:40:00   2019-10-02 00:50:00
2019-10-02 00:50:00   2019-10-02 01:10:00
2019-10-02 01:00:00   2019-10-02 01:10:00
dtype: datetime64[ns]

In [65]:
events['ent'] = first_touch_dates.dropna(how='all').min(axis=1)

In [66]:
events.head()

Unnamed: 0_level_0,ent,side,trgt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-02 00:20:00,2019-10-02 00:40:00,1.0,0.00013
2019-10-02 00:30:00,2019-10-02 00:40:00,1.0,0.000184
2019-10-02 00:40:00,2019-10-02 00:50:00,1.0,0.000489
2019-10-02 00:50:00,2019-10-02 01:10:00,1.0,0.000426
2019-10-02 01:00:00,2019-10-02 01:10:00,1.0,0.000394


In [37]:
#events = events.drop('side', axis=1)
events['pt'] = 2
events['sl'] = 2

In [52]:
events.head(2)

Unnamed: 0_level_0,ent,side,trgt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-02 00:20:00,2019-10-02 01:20:00,1.0,0.00013
2019-10-02 00:30:00,2019-10-02 01:30:00,1.0,0.000184


In [93]:

first_touch_dates = mp_pandas_obj(func=apply_pt_sl_on_ent,
                                  pd_obj=('molecule', events.index),
                                  num_threads=3,
                                  close=df_sampled.open,
                                  events=events,
                                  pt_sl=pt_sl_)


In [0]:
#hh = apply_pt_sl_on_ent(df_testprice, events, pt_sl_, events.index)

In [67]:
events = events.drop('side', axis=1)

In [70]:
events['pt'] = 2
events['sl'] = 2

In [81]:
from labeling import get_bins, drop_labels

In [73]:
hh = get_bins(events, df_sampled.open)

In [78]:
hh.head(30)

Unnamed: 0_level_0,ret,trgt,bin
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-02 00:20:00,-0.00037,0.00013,-1
2019-10-02 00:30:00,-0.00037,0.000184,-1
2019-10-02 00:40:00,0.001297,0.000489,1
2019-10-02 00:50:00,-0.001295,0.000426,-1
2019-10-02 01:00:00,-0.000926,0.000394,-1
2019-10-02 01:10:00,-0.000926,0.000466,0
2019-10-02 01:20:00,-0.000185,0.000431,0
2019-10-02 01:30:00,-0.001112,0.000412,-1
2019-10-02 01:40:00,-0.000926,0.0004,-1
2019-10-02 01:50:00,-0.001297,0.000428,-1


In [83]:
new_events = drop_labels(hh)

dropped label:  -1 0.007261724659606656
The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.
  print('dropped label: ', df0.argmin(), df0.min())
The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.
  events = events[events['bin'] != df0.argmin()]


In [84]:
new_events.count()

ret     6562
trgt    6562
bin     6562
dtype: int64

In [86]:
hh.count()

ret     6610
trgt    6610
bin     6610
dtype: int64