In [51]:
import pandas as pd
import numpy as np
import numba
from numba import jit
from tqdm import tqdm
import datetime

In [28]:
df = pd.read_pickle('intra2021.pkl')

In [43]:
daily = pd.read_pickle('daily2021.pkl')

In [29]:
df

Unnamed: 0,c,h,l,o,s,t,v,tt,symbol
0,2.310,2.310,2.310,2.310,ok,1609754520,200,2021-01-04 05:02:00-05:00,OVID
1,2.270,2.270,2.270,2.270,ok,1609755780,300,2021-01-04 05:23:00-05:00,OVID
2,2.250,2.250,2.250,2.250,ok,1609755840,415,2021-01-04 05:24:00-05:00,OVID
3,2.200,2.200,2.200,2.200,ok,1609755900,335,2021-01-04 05:25:00-05:00,OVID
4,2.230,2.230,2.230,2.230,ok,1609757160,554,2021-01-04 05:46:00-05:00,OVID
5,2.230,2.230,2.230,2.230,ok,1609757640,690,2021-01-04 05:54:00-05:00,OVID
6,2.230,2.230,2.230,2.230,ok,1609757880,6600,2021-01-04 05:58:00-05:00,OVID
7,2.270,2.270,2.270,2.270,ok,1609759980,148,2021-01-04 06:33:00-05:00,OVID
8,2.310,2.310,2.310,2.310,ok,1609761600,131,2021-01-04 07:00:00-05:00,OVID
9,2.310,2.310,2.310,2.310,ok,1609761660,884,2021-01-04 07:01:00-05:00,OVID


In [52]:
stock_list = daily.symbol.unique().tolist()

In [None]:
for stock in stock_list:
    stock_index = daily[daily['symbol']==stock].index
    log_return = np.log(daily[daily['symbol']==stock]['c']).diff()
    
    
    

In [68]:
def update_vol(stock, lookback=30):
    """
    Calculating daily volatility for dynamic thresholds
    :param stock: (data frame) stock dataframe
    :param lookback: (int) lookback period to compute volatility (number of bars)
    :return: (series) of daily volatility value
    """
    stock_list = stock.symbol.unique().tolist()
    for s in tqdm(stock_list):
        stock_index = stock[stock['symbol']==s].index
#         log_return = np.log(stock[stock['symbol']==s]['c']).diff()
#         daily_vol = log_return.ewm(span=lookback, adjust=False).std()
        stock.loc[stock_index, f"{lookback}vol"] = stock.c.ewm(span=lookback, min_periods=1, adjust=False).std()
        
    return stock

def get_vol(date, symbol, stock, lookback):
    value = stock[(stock['tt'].dt.date == date) & (stock['symbol'] == symbol)][f"{lookback}vol"]
    return value


def update_tbm_barrier(stock, ratio=[1,1], period=30, lookback=30):
    '''
    ratio: the profit : loss ratio. example: [3,1]
    period: the minutes of holding period
    '''
    stock['lower_barrier'] = stock['c']*(1- ratio[1]*stock[f"{lookback}vol"])
    stock['upper_barrier'] = stock['c']*(1+ ratio[0]*stock[f"{lookback}vol"])
    stock['vertical_barrier'] = stock['tt'] + pd.Timedelta(period, unit='m')
    return stock
    

@jit(nopython=True)
def tbm_label(cluarray, period=30):
    close = cluarray[:,0]
    
    length = len(close)
    label_array = np.empty(length)
    for i in range(length):
        low_barrier = cluarray[i,1]
        up_barrier = cluarray[i,2]
        for c in close[i:i+period]:
            if c>up_barrier:
                label = 1
                break
            if c<low_barrier:
                label = -1
                break
            else:
                label =0
        label_array[i] = label
    return label_array
    
    
    
    
def update_tbm_label(stock, period=30, lookback=30):
    stock = update_vol(stock, lookback=lookback)
    stock = update_tbm_barrier(stock, lookback=lookback)
    stock = stock.dropna()
    stock_list = stock.symbol.unique().tolist()
    for symbol_name in tqdm(stock_list):
        df0 = stock[stock.symbol == symbol_name].copy()
        index0 = df0.index.tolist()
        arr = df0[['c', 'lower_barrier', 'upper_barrier']].dropna().to_numpy()
        labels = tbm_label(arr, period=period)
        stock.loc[index0, 'label'] = labels
        
    stock= stock.drop(['30vol', 'lower_barrier', 'upper_barrier', 'vertical_barrier'], axis=1)
    
    return stock


def ewm_label(stock, threshold=30, period=30):
    stock_list = stock.symbol.unique().tolist()
    for symbol_name in tqdm(stock_list):
        df0 = stock[stock.symbol == symbol_name].copy()
        index0 = df0.index.tolist()
        df0.loc[index0, f"ema_{period}"]= df0.c.ewm(span=period, adjust=False).mean()
        df0[f"ema_{period}"] = df0[f"ema_{period}"].shift(-30)
        df0[f"ema_{period}"] = df0[f"ema_{period}"].fillna(0)
        df0.loc[index0, 'label'] = (df0[f"ema_{period}"]>df0.c)*1
        df1 = pd.DataFrame()
        df1 = df0[(df0.tt.dt.time>=datetime.time(9, 30)) & (df0.tt.dt.time<=datetime.time(15, 30))]
        index1 = df1.index.tolist()
        stock.loc[index1, 'label'] = df1.loc[index1, 'label']
        
    return stock
        
        
        
    
        
        
        
        


        
    

In [122]:
@jit(nopython=True)
def func2(cluarray, period=30):
    close = cluarray[:,0]
    
    length = len(close)
    label_array = np.empty(length)
    for i in range(length):
        low_barrier = cluarray[i,1]
        up_barrier = cluarray[i,2]
        for c in close[i:i+period]:
            if c>up_barrier:
                label = 1
                break
            if c<low_barrier:
                label = -1
                break
            else:
                label =0
        label_array[i] = label
    return label_array
        
    
    
    
    

In [123]:
@jit(nopython=True)
def go_fast(np_array):
    input_length = np_array.shape[0]
    label_array = np.empty(input_length)

    for i in range(input_length):
        low, high = np_array[i,1:3]
        window_close = np_array[i+1:i+31,0]

        for close in window_close:
            if close > high:
                label = 1
                break
            elif close < low:
                label = -1
                break
            else:
                label = 0

        label_array[i] = label

    return label_array

In [30]:
OSTK = df[df.symbol == 'OSTK'].copy()

16834459    1
16834460    0
16834461    0
16834462    0
16834463    0
16834464    0
16834465    0
16834466    0
16834467    0
16834468    0
16834469    0
16834470    0
16834471    0
16834472    0
16834473    0
16834474    0
16834475    0
16834476    0
16834477    1
16834478    1
16834479    0
16834480    1
16834481    0
16834482    0
16834483    0
16834484    0
16834485    0
16834486    0
16834487    1
16834488    0
16834489    0
16834490    0
16834491    0
16834492    0
16834493    0
16834494    0
16834495    0
16834496    0
16834497    1
16834498    1
16834499    0
16834500    1
16834501    0
16834502    0
16834503    0
16834504    0
16834505    0
16834506    0
16834507    0
16834508    0
16834509    1
16834510    1
16834511    0
16834512    0
16834513    1
16834514    0
16834515    1
16834516    0
16834517    1
16834518    0
16834519    0
16834520    1
16834521    1
16834522    0
16834523    0
16834524    1
16834525    0
16834526    0
16834527    0
16834528    1
16834529    0
168345

In [4]:
df1 = update_vol(df)

100%|██████████| 188/188 [05:02<00:00,  1.61s/it]


In [7]:
(df1['30vol']/df1.c).mean()

0.005290338623467812

In [52]:
update_tbm_label(OSTK)

100%|██████████| 1/1 [00:00<00:00, 23.83it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = empty_value
100%|██████████| 1/1 [00:00<00:00,  3.44it/s]


Unnamed: 0,c,h,l,o,s,t,v,tt,symbol,30vol,lower_barrier,upper_barrier,vertical_barrier,label
16834460,48.50,48.50,48.50,48.50,ok,1609750860,100,2021-01-04 09:01:00+00:00,OSTK,0.212132,38.211596,58.788404,2021-01-04 09:31:00+00:00,0.0
16834461,48.50,48.50,48.50,48.50,ok,1609751460,400,2021-01-04 09:11:00+00:00,OSTK,0.208451,38.390138,58.609862,2021-01-04 09:41:00+00:00,0.0
16834462,48.50,48.50,48.50,48.50,ok,1609751880,430,2021-01-04 09:18:00+00:00,OSTK,0.204720,38.571078,58.428922,2021-01-04 09:48:00+00:00,0.0
16834463,48.80,48.80,48.80,48.80,ok,1609752000,770,2021-01-04 09:20:00+00:00,OSTK,0.178094,40.109010,57.490990,2021-01-04 09:50:00+00:00,0.0
16834464,48.80,48.80,48.80,48.80,ok,1609752060,218,2021-01-04 09:21:00+00:00,OSTK,0.159808,41.001385,56.598615,2021-01-04 09:51:00+00:00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16928032,62.00,62.01,61.99,61.99,ok,1640898480,1138,2021-12-30 21:08:00+00:00,OSTK,0.478366,32.341295,91.658705,2021-12-30 21:38:00+00:00,0.0
16928033,61.71,61.71,61.71,61.71,ok,1640899620,282,2021-12-30 21:27:00+00:00,OSTK,0.497059,31.036496,92.383504,2021-12-30 21:57:00+00:00,0.0
16928034,60.75,60.80,60.75,60.80,ok,1640904540,900,2021-12-30 22:49:00+00:00,OSTK,0.485819,31.236509,90.263491,2021-12-30 23:19:00+00:00,0.0
16928035,60.76,60.76,60.76,60.76,ok,1640910000,499,2021-12-31 00:20:00+00:00,OSTK,0.474082,31.954802,89.565198,2021-12-31 00:50:00+00:00,0.0


In [67]:
%time df1 = update_tbm_label(df)

100%|██████████| 188/188 [05:00<00:00,  1.60s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = empty_value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
100%|██████████| 188/188 [03:34<00:00,  1.14s/it]


Wall time: 8min 44s


In [58]:
df2 = ewm_label(df)

In [69]:
df2.label = df1.label.astype('int32')

In [70]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16928036 entries, 1 to 16928036
Data columns (total 10 columns):
 #   Column  Dtype              
---  ------  -----              
 0   c       float64            
 1   h       float64            
 2   l       float64            
 3   o       float64            
 4   s       object             
 5   t       object             
 6   v       object             
 7   tt      datetime64[ns, UTC]
 8   symbol  object             
 9   label   int32              
dtypes: datetime64[ns, UTC](1), float64(4), int32(1), object(4)
memory usage: 1.8+ GB


In [72]:
import pickle
import os

In [73]:
file = 'intra2021_label.pkl'

In [74]:
with open(file, 'wb') as f:
    pickle.dump(df2, f)