In [20]:
import pandas as pd
import numpy as np
import seaborn as sns

# CSV file loaded below. PATH_WRITE is not used in the visible cells —
# presumably the output path for the labelled data; confirm against later cells.
PATH_READ = 'train_with_features_v2.csv'
PATH_WRITE = 'train_with_features_TB.csv'
# Load the table; the cells below read its 'Timestamp' and 'Close' columns.
df = pd.read_csv(PATH_READ)

In [21]:
def getDailyVol(data, span=100):
    """Exponentially weighted volatility of row-over-row simple returns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Close' column. It is not modified.
    span : int, default 100
        Span handed to ``Series.ewm``.

    Returns
    -------
    pandas.Series
        Series named 'Return', on ``data``'s index, holding the EWM standard
        deviation of ``Close[t] / Close[t-1] - 1``.
    """
    close = data['Close']
    # Simple return versus the previous row; the first row has no predecessor
    # and is therefore NaN.
    simple_ret = (close / close.shift(1) - 1).rename('Return')
    return simple_ret.ewm(span=span).std()

# EWM volatility of Close returns for every row of df (default span=100).
vol = getDailyVol(data=df)
print(vol)

0            NaN
1            NaN
2       0.000609
3       0.002759
4       0.002312
          ...   
2890    0.002385
2891    0.002369
2892    0.002346
2893    0.002398
2894    0.002376
Name: Return, Length: 2895, dtype: float64


In [22]:
# Parse the timestamp column so Timedelta arithmetic and time comparisons work.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# One event per row of df: its start time, a vertical barrier ('VB') 15 minutes
# later, and the volatility estimate ('Vol') computed above.
events = df[['Timestamp']].copy(deep=True)
events['VB'] = df['Timestamp'] + pd.Timedelta(minutes=15)
events['Vol'] = vol

print(events)

               Timestamp                  VB       Vol
0    2023-06-01 09:45:00 2023-06-01 10:00:00       NaN
1    2023-06-01 09:46:00 2023-06-01 10:01:00       NaN
2    2023-06-01 09:47:00 2023-06-01 10:02:00  0.000609
3    2023-06-01 09:48:00 2023-06-01 10:03:00  0.002759
4    2023-06-01 09:49:00 2023-06-01 10:04:00  0.002312
...                  ...                 ...       ...
2890 2023-06-20 14:42:00 2023-06-20 14:57:00  0.002385
2891 2023-06-20 14:43:00 2023-06-20 14:58:00  0.002369
2892 2023-06-20 14:44:00 2023-06-20 14:59:00  0.002346
2893 2023-06-20 14:46:00 2023-06-20 15:01:00  0.002398
2894 2023-06-20 14:47:00 2023-06-20 15:02:00  0.002376

[2895 rows x 3 columns]


In [23]:
def TBL(df, events, width):
    """Find when each event first crosses its upper and lower barrier.

    For every event the price path strictly between the event's 'Timestamp'
    and its vertical barrier 'VB' is converted into returns relative to the
    close at the event's timestamp. The earliest time the return exceeds the
    upper barrier is stored in 'ut', and the earliest time it falls below the
    lower barrier is stored in 'dt'.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with 'Timestamp' and 'Close' columns.
    events : pandas.DataFrame
        Frame with 'Timestamp', 'VB' and 'Vol' columns, one row per event.
        NOTE: as before, barrier levels are written onto this frame in place
        as columns 'UB' (``width[0] * Vol``) and 'DB' (``-width[1] * Vol``).
    width : sequence of two numbers
        Multipliers for the upper and lower barrier. A non-positive value
        disables that barrier (its level becomes NaN, so it is never touched).

    Returns
    -------
    pandas.DataFrame
        Copy of ``events[['Timestamp', 'VB']]`` with 'ut' and 'dt' columns
        added. Both columns are always present, even when ``events`` is empty;
        an untouched barrier is NaT.

    Raises
    ------
    IndexError
        If an event's timestamp has no matching row in ``df``.
    """
    res = events[['Timestamp', 'VB']].copy(deep=True)

    # Barrier levels in return space. Kept as a side effect on `events` for
    # backward compatibility with callers that read them afterwards.
    events['UB'] = width[0] * events['Vol'] if width[0] > 0 else np.nan
    events['DB'] = -width[1] * events['Vol'] if width[1] > 0 else np.nan

    times = df['Timestamp']
    close = df['Close']

    upper_touch = []
    lower_touch = []
    # `res` and `events` share index and row order, so a positional zip pairs
    # each event with its own barriers without rescanning the frame per row.
    for date, vb, ub, db in zip(res['Timestamp'], res['VB'],
                                events['UB'], events['DB']):
        entry = close[times == date]
        if entry.empty:
            raise IndexError(f'event timestamp {date} not found in df')

        # Path strictly inside (date, vb); both end points are excluded.
        in_window = (times > date) & (times < vb)
        path_time = times[in_window]
        path_ret = close[in_window] / entry.iloc[0] - 1

        # Comparisons against a NaN barrier are all False, and .min() of an
        # empty selection is NaT, i.e. "never touched".
        upper_touch.append(path_time[path_ret > ub].min())
        lower_touch.append(path_time[path_ret < db].min())

    # Build the columns in one go so they exist (with a datetime dtype) even
    # when there are no events at all.
    res['ut'] = pd.Series(upper_touch, index=res.index, dtype=times.dtype)
    res['dt'] = pd.Series(lower_touch, index=res.index, dtype=times.dtype)

    return res

In [25]:
def get_first_touch(df, events, width):
    """Add the earliest barrier time of each event as column 'First'.

    Runs ``TBL(df, events, width)`` and, per event, takes the minimum of the
    vertical barrier 'VB', the upper-barrier touch 'ut' and the lower-barrier
    touch 'dt' (missing touches are ignored by the row-wise minimum).

    Returns the frame produced by ``TBL`` with the extra 'First' column.
    """
    touches = TBL(df, events, width)
    # Rows where all three candidate times are missing are left out here and
    # therefore end up with a missing 'First' after index alignment.
    barrier_times = touches[['VB', 'ut', 'dt']].dropna(how='all')
    touches['First'] = barrier_times.min(axis=1)
    return touches

# Barriers at +/- 1 x volatility around each event's entry price.
# Note: this call also adds 'UB' and 'DB' columns to `events` (done inside TBL).
result = get_first_touch(df,events,width = [1,1])
print(result)

               Timestamp                  VB                  ut  \
0    2023-06-01 09:45:00 2023-06-01 10:00:00                 NaT   
1    2023-06-01 09:46:00 2023-06-01 10:01:00                 NaT   
2    2023-06-01 09:47:00 2023-06-01 10:02:00 2023-06-01 09:48:00   
3    2023-06-01 09:48:00 2023-06-01 10:03:00 2023-06-01 09:52:00   
4    2023-06-01 09:49:00 2023-06-01 10:04:00 2023-06-01 09:52:00   
...                  ...                 ...                 ...   
2890 2023-06-20 14:42:00 2023-06-20 14:57:00                 NaT   
2891 2023-06-20 14:43:00 2023-06-20 14:58:00                 NaT   
2892 2023-06-20 14:44:00 2023-06-20 14:59:00                 NaT   
2893 2023-06-20 14:46:00 2023-06-20 15:01:00                 NaT   
2894 2023-06-20 14:47:00 2023-06-20 15:02:00                 NaT   

                      dt               First  
0                    NaT 2023-06-01 10:00:00  
1                    NaT 2023-06-01 10:01:00  
2                    NaT 2023-06-01 09:48:

In [27]:
def get_label(df, result):
    """Label each event with the sign of its return at the first barrier touch.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with 'Timestamp' and 'Close' columns.
    result : pandas.DataFrame
        Events with 'Timestamp' (event start) and 'First' (time of the first
        barrier touch) columns. Rows with a missing 'First' are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'Timestamp', 'Return' (close at 'First' divided by close at
        'Timestamp', minus 1) and 'Label' (``np.sign`` of 'Return'), on the
        same index as the surviving rows of ``result``. Events whose start or
        first-touch time has no exact match in ``df['Timestamp']`` have no
        price and are dropped.
    """
    result = result.dropna(subset=['First'])
    outcome = result[['Timestamp']].copy(deep=True)

    # Look prices up by timestamp while keeping `result`'s own index.
    # A merge would hand back a fresh 0..n-1 index, so its prices would be
    # misaligned with `outcome` whenever `result` is not on a default
    # RangeIndex (e.g. after filtering or dropping rows).
    # If df repeats a timestamp, the first row wins.
    close_by_time = (
        df.drop_duplicates(subset='Timestamp')
          .set_index('Timestamp')['Close']
    )
    price_t0 = result['Timestamp'].map(close_by_time)
    price_t1 = result['First'].map(close_by_time)

    outcome['Return'] = price_t1 / price_t0 - 1
    # NaN returns stay NaN here and are removed by the final dropna.
    outcome['Label'] = np.sign(outcome['Return'])

    return outcome.dropna()

# Return and sign label for every event, measured at its first barrier time.
outcome = get_label(df,result)
print(outcome)

               Timestamp    Return  Label
0    2023-06-01 09:45:00  0.001161    1.0
1    2023-06-01 09:46:00 -0.002130   -1.0
2    2023-06-01 09:47:00  0.002333    1.0
3    2023-06-01 09:48:00  0.005335    1.0
4    2023-06-01 09:49:00  0.004978    1.0
...                  ...       ...    ...
2888 2023-06-20 14:40:00 -0.005273   -1.0
2889 2023-06-20 14:41:00 -0.002551   -1.0
2890 2023-06-20 14:42:00 -0.005873   -1.0
2891 2023-06-20 14:43:00 -0.004606   -1.0
2892 2023-06-20 14:44:00 -0.004189   -1.0

[2786 rows x 3 columns]
