# (4) Labelling 

The labelling process is run for the SMA 21,9 entry points. It uses the limit barriers that were not used in labelling V1; the vertical barrier is determined from a simple Bayesian waiting-time model for trade durations, using an exponential distribution.

## (4.1) Triple Barrier Method

Uses the SMA, EMA and RSI market signals for entering and exiting the market. This returns all the data recorded during the labelling process.

In [224]:
class triple_barrier_method():

    """
    Record the ticks of one open position and summarise it when it is closed.

    The three barrier values are stored exactly as supplied and handed back
    through the getters; this class never compares them with prices itself.

    ptSl[0]: upper barrier value.
    ptSl[1]: lower barrier value.
    ptSl[2]: time barrier value.

    NOTE(review): labelling_process_function in this file passes absolute
    price levels for ptSl[0] / ptSl[1] and an epoch-seconds deadline for
    ptSl[2], not multipliers of a target -- confirm no other caller relies
    on the multiplier reading.
    """

    def __init__(self,trade,qty,ptSl,signal):
        # the entry tick is always the first recorded trade
        self.trades = [trade]
        self.qty = qty

        self.entry_signal = signal.to_dict()
        # defined up front so the attribute exists before end_barrier() runs
        self.exit_signal = None
        self.upper_barrier = ptSl[0]
        self.lower_barrier = ptSl[1]
        self.time_barrier = ptSl[2]

    def add_trade(self,tickdata):
        """Append one more tick to the open position."""
        self.trades.append(tickdata)

    def get_trade_list(self):
        """Return the list of ticks recorded so far (the live list, not a copy)."""
        return self.trades

    def get_upper_barrier(self):
        """Return the stored upper barrier (None after remove_all_data)."""
        return self.upper_barrier

    def get_lower_barrier(self):
        """Return the stored lower barrier (None after remove_all_data)."""
        return self.lower_barrier

    def get_time_barrier(self):
        """Return the stored time barrier (None after remove_all_data)."""
        return self.time_barrier

    def end_barrier(self,signal):
        """
        Close the position and return a one-row DataFrame describing it.

        signal: object with .to_dict() holding the signal/stat values at exit.

        Returns 0.0 (not a DataFrame) when fewer than two ticks were recorded;
        in that case the recorded ticks are kept. Otherwise the row holds the
        keys "entry", "exit", "qty", "change", "upper barrier",
        "lower barrier" and "trade duration", and the tick list is cleared.
        """
        self.exit_signal = signal.to_dict()

        if len(self.trades) < 2:
            return 0.0

        # Only the first and last tick are read, so frame just those two
        # instead of every tick recorded while the position was open.
        tickdata = pd.DataFrame([self.trades[0], self.trades[-1]])
        entry_tick = tickdata.iloc[0]
        exit_tick = tickdata.iloc[-1]

        ## relative change between first and last price
        first = entry_tick["price"]
        last = exit_tick["price"]
        change = (last - first) / first

        entry_dict = {"timestamp":entry_tick["timestamp"],"price":first}
        out_entry = entry_dict | self.entry_signal

        exit_dict = {"timestamp":exit_tick["timestamp"],"price":last}
        out_exit = exit_dict | self.exit_signal

        # difference of the two .timestamp() values, i.e. seconds
        waiting_time = exit_tick["timestamp"].timestamp() - entry_tick["timestamp"].timestamp()

        out = {
            "entry": out_entry,
            "exit": out_exit,
            "qty": self.qty,
            "change": change,
            "upper barrier": self.upper_barrier,
            "lower barrier": self.lower_barrier,
            "trade duration": waiting_time,
        }

        self.trades = []
        return(pd.DataFrame([out]))

    def remove_all_data(self):
        """Reset the tick list to empty and every barrier/signal field to None."""
        self.trades = []
        self.entry_signal = None
        self.upper_barrier = None
        self.lower_barrier = None
        self.time_barrier = None
        self.exit_signal = None
        

## (4.2) Bayesian Modeling of Trade Waiting Times

In [225]:
from scipy.stats import gamma

class calculate_trade_wait_time():

    """
    Running (alpha, beta) parameters of a gamma distribution.

    Per the original notes, the model is a gamma prior with an exponential
    likelihood, giving a gamma posterior; each call to update_parameters
    folds one observed trade time into the parameters.

    NOTE(review): sample_time draws from Gamma(shape=alpha, scale=1/beta)
    directly. In a gamma-exponential model that distribution describes the
    rate of the exponential, not a duration -- confirm that using the draw
    as a waiting time is intended.
    """

    def __init__(self,priors):
        # priors is indexed as [alpha, beta]; the object is kept as given
        self.priors = priors
        # working copies that update_parameters moves away from the priors
        self.alpha, self.beta = priors[0], priors[1]

    def update_parameters(self,trade_time):
        """Fold one observation in: alpha grows by 1, beta by trade_time."""
        self.beta += trade_time
        self.alpha += 1

    def get_posteriors(self):
        """Return the current parameters as the list [alpha, beta]."""
        current = [self.alpha, self.beta]
        return current

    def sample_time(self):
        """Return a single draw from Gamma(shape=alpha, scale=1/beta)."""
        draws = gamma.rvs(a=self.alpha, scale=1/self.beta, size=1)
        return draws[0]
        

# (5) Parallel optimization (Labelling process)

In [None]:
from concurrent.futures import ProcessPoolExecutor
from itertools import combinations
from itertools import repeat
import os

def labelling_process_function(bartype,signal,data,MatchTrades_tickdata):
    """
    Replay the tick stream for one signal variable and record each closed position.

    bartype: used only to build the output path Positions/{bartype}/{signal}.json.
    signal: name of the variable being simulated; used in the output path and
        echoed back in the result.
    data: mapping read positionally -- values 0..3 are the signals dataframe,
        the stats dataframe, the positions accumulated so far and the
        waiting-time model (sample_time / update_parameters / get_posteriors).
    MatchTrades_tickdata: dataframe of ticks; each row needs `timestamp` and `price`.

    A position is opened on a tick whose timestamp equals the next pending
    signal timestamp when every signal column equals 1 and no position is open.
    It is closed when the price reaches the upper barrier (entry price +5%),
    the lower barrier (entry price -2%), the time barrier, or when every
    column of the latest consumed signal row equals -1.

    Side effects: prints the accumulated positions, writes them as JSON to
    Positions/{bartype}/{signal}.json and updates the waiting-time model in
    place for closed positions whose change exceeds 0.002.

    Returns {"variable", "total_positions", "time_barrier_post"}.

    Raises ValueError when the signals and stats dataframes differ in length
    (they are consumed row by row in lockstep).
    """

    ## Important variables to run the labelling process ##
    values = list(data.values())
    signals_df = values[0]
    stats_df = values[1]
    total_positions = values[2]
    time_barrier = values[3]
    positions = pd.DataFrame()
    open_position = False
    ## ------------------------------------------------ ##

    # Signals are consumed strictly in order. A cursor over the rows replaces
    # re-slicing the dataframes after every signal and probing iloc[0] per tick.
    signal_times = signals_df["timestamp"].tolist()
    n_signals = len(signal_times)
    if len(stats_df) != n_signals:
        raise ValueError(
            f"signals ({n_signals} rows) and stats ({len(stats_df)} rows) must be aligned row by row"
        )
    next_signal = 0

    # Explicit state instead of relying on NameError/TypeError being swallowed:
    # tbm is None while no position tracker is active, signal_x / stats_x are
    # None until the first signal row has been consumed.
    tbm = None
    signal_x = None
    stats_x = None

    for Matchtrade in MatchTrades_tickdata.itertuples(index=True, name="Trade"):

        timestamp = Matchtrade.timestamp

        # ----------------------------- #
        # keep recording ticks while a tracker holds at least one tick
        if tbm is not None and tbm.get_trade_list():
            tbm.add_trade(Matchtrade)

        # ----------------------------- #
        # consume the next signal row when this tick carries its timestamp
        if next_signal < n_signals and timestamp == signal_times[next_signal]:
            signal_x = signals_df.iloc[next_signal]
            stats_x = stats_df.iloc[next_signal]
            next_signal += 1

            # ----------------------------- #
            #       Buying strategy         #
            if (signal_x[1:] == 1).all() and not open_position:

                open_position = True
                qty = 1
                price = Matchtrade.price

                # profit-taking, stop-loss and time limits
                # NOTE(review): the sampled value is multiplied by 60 before being
                # added to epoch seconds, i.e. presumably treated as minutes -- confirm.
                barriers = [
                    round(price + 0.05*price,2),
                    round(price - 0.02*price,2),
                    round(timestamp.timestamp(),3) + round(time_barrier.sample_time(),3)*60,
                ]
                signal_trade = pd.concat([signal_x[1:], stats_x[1:]],axis=0)
                tbm = triple_barrier_method(Matchtrade,qty=qty,ptSl = barriers,signal=signal_trade)

        # ----------------------------------------------------- #
        #       Exit check (only while a tracker is active)     #
        if tbm is None:
            continue

        barrier_hit = (
            (tbm.get_upper_barrier() <= Matchtrade.price)
            or (tbm.get_lower_barrier() >= Matchtrade.price)
            or (round(tbm.get_time_barrier(),3) <= timestamp.timestamp())
            or ((signal_x[1:] == -1).all())
        )
        if not barrier_hit:
            continue

        open_position = False

        signal_trade = pd.concat([signal_x[1:], stats_x[1:]],axis=0)
        result = tbm.end_barrier(signal=signal_trade)

        if not isinstance(result, pd.DataFrame):
            # end_barrier returns 0.0 when fewer than two ticks were recorded
            # (exit fired on the entry tick). As before, nothing is recorded
            # and the tracker stays alive so it can close on a later tick.
            continue

        # only trades above the 0.002 change threshold feed the waiting-time model
        if result.iloc[0]["change"] > 0.002:
            time_barrier.update_parameters(result.iloc[0]["trade duration"])

        positions = pd.concat([positions, result], ignore_index=True)
        tbm.remove_all_data()
        tbm = None

    # save the positions from all the different files
    total_positions =  pd.concat([total_positions, positions], ignore_index=True)
    print(total_positions)

    ##########################################
    ##          Save signals data           ##

    raw_data = total_positions.to_dict(orient="records")
    json_raw_data = json_util.dumps(raw_data)

    with open(f"Positions/{bartype}/{signal}.json", "w") as f:
        f.write(json_raw_data)

    ##########################################
    ##########################################

    return({"variable":signal,"total_positions":total_positions,"time_barrier_post":time_barrier.get_posteriors()})
    
def wrapper(args):
    """Unpack one argument tuple and forward it to labelling_process_function.

    Exists so a single-argument callable can be handed to executor.map.
    """
    outcome = labelling_process_function(*args)
    return outcome

class Labelling_Process():
    """
    Label every combination of signal columns against the tick stream.

    The constructor receives two dataframes:
    - stats_data: the timestamp column followed by the statistics used to
      calculate the signals.
    - signals: the timestamp column followed by int16 signal columns holding
      1, -1 and 0 for entering and exiting positions; the signals are
      created from the statistics.

    It is used to test multiple strategies at the same time: every
    combination of the signal columns becomes one "variable" that is run
    through labelling_process_function, and the outcome is stored in
    self.variables_simulation[variable].
    """

    def update_variables(self,variable,total_positions,time_barrier_params):
        """Store a finished run: its positions and a model rebuilt from the returned [alpha, beta]."""
        self.variables_simulation[variable]["total_positions"] = total_positions
        self.variables_simulation[variable]["time_barrier"] = calculate_trade_wait_time(priors=time_barrier_params)

    def return_results(self):
        """Return the dict of per-variable inputs and results."""
        return (self.variables_simulation)

    def __init__(self,bartype,stats_data,signals,complementary_stats,MatchTrades_tickdata,parallel_processing = False,max_workers = 3):
        """
        Validate the inputs, build one variable per signal combination and run them all.

        bartype: forwarded to labelling_process_function.
        stats_data: dataframe; first column must be datetime64. It is also
            indexed by the literal column name "timestamp".
        signals: dataframe; first column must be datetime64 and every other
            column int16. It is also indexed by the literal name "timestamp".
        complementary_stats: list of stats column names added to every
            variable regardless of which signals it combines.
        MatchTrades_tickdata: tick dataframe forwarded to every run.
        parallel_processing: run the variables in a process pool when true,
            otherwise one after another in this process.
        max_workers: size of the process pool (parallel mode only).

        Raises TypeError when a column has the wrong dtype.
        """

        # ----------------------------------------------------- #
        #    (1)       Test for variable types                  #

        # 1. signals: first column must be a datetime
        first_col = signals.columns[0]
        if not np.issubdtype(signals[first_col].dtype, np.datetime64):
            raise TypeError(f"First column '{first_col}' must be datetime64[ns], got {signals[first_col].dtype}")

        # 2. signals: every other column must be int16
        for col in signals.columns[1:]:
            if signals[col].dtype != "int16":
                raise TypeError(f"Column '{col}' must be int16, got {signals[col].dtype}")

        # 3. stats: only the timestamp column is checked here
        first_col = stats_data.columns[0]
        if not np.issubdtype(stats_data[first_col].dtype, np.datetime64):
            raise TypeError(f"First column '{first_col}' must be datetime64[ns], got {stats_data[first_col].dtype}")

        self.stats_data = stats_data
        self.signals = signals
        self.MatchTrades_tickdata = MatchTrades_tickdata

        #   (1) End                                             #
        # ----------------------------------------------------- #

        # --------------------------------------------------------------------- #
        # (2) Set up variables for the individual and combination of variables  #

        self.variables_simulation = {}
        self.df_variables = []

        signal_columns = signals.columns[1:]
        for n in range(1,len(signal_columns)+1):
            for combo in combinations(signal_columns, r=n):
                self.df_variables.append(combo)

                if len(combo) == 1:
                    # single element -> replace spaces with underscore
                    variable = combo[0].replace(" ", "_")
                else:
                    # multiple elements -> keep the first word of the first
                    # column once, then the last word of every column
                    prefix = combo[0].split()[0]  # e.g., "Signal"
                    suffixes = [s.split()[-1] for s in combo]
                    variable = prefix + "_" + "_".join(suffixes)

                # individual signal names encoded in the variable name
                if variable.startswith("Signal_"):
                    parts = variable.replace("Signal_", "").split("_")
                else:
                    parts = variable.split("_")

                # stats columns whose name starts with one of those signal names
                matched_cols = [
                    col
                    for part in parts
                    for col in stats_data.columns
                    if col.startswith(part)
                ]

                self.variables_simulation[variable] = {
                    "signals_data": signals[["timestamp"]+list(combo)],
                    "stats_data": stats_data[["timestamp"]+matched_cols+complementary_stats],
                    "total_positions": pd.DataFrame(),
                    "time_barrier": calculate_trade_wait_time(priors=[60,1]),
                }

        # (2) End                                                               #
        # --------------------------------------------------------------------- #

        # --------------------------------------------------------------------- #
        # (3) Run every variable                                                #

        # one (bartype, variable name, variable data, ticks) tuple per variable
        input_parallel_data = list(zip(repeat(bartype),self.variables_simulation.keys(), self.variables_simulation.values(),repeat(self.MatchTrades_tickdata)))

        if parallel_processing:
            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(wrapper,input_parallel_data))
        else:
            # sequential fallback; without it `results` was never bound and
            # the loop below raised NameError
            results = [wrapper(job) for job in input_parallel_data]

        ## update the class variables ##
        for value in results:
            self.update_variables(value["variable"],value["total_positions"],value["time_barrier_post"])

        # (3) End                                                               #
        # --------------------------------------------------------------------- #
        

# (6) Main code (TickBars, VolumeBars, VolumeImbalanceBars)

In [287]:
from pathlib import Path

folders = ["TickBars","VolumeBars","VolumeImbalanceBars"]

# tick-data files ordered by modification time, oldest first;
# the notebook checkpoint folder is not a data file
path = Path("Data")
files_sorted = sorted(path.iterdir(), key=lambda entry: entry.stat().st_mtime)
data_files = [entry.name for entry in files_sorted if entry.name != '.ipynb_checkpoints']

### results from all the files, one bar type at a time ###

for folder in folders:

    # one {"signal", "total_positions"} dict per signal variable
    results = []

    print(folder)

    # signal files of this bar type, same ordering rule as the data files
    path = Path(f"Signals/{folder} Signals")
    signals_files_sorted = sorted(path.iterdir(), key=lambda entry: entry.stat().st_mtime)
    signals_files = [entry.name for entry in signals_files_sorted if entry.name != '.ipynb_checkpoints']

    # data and signal files are paired purely by position in the two sorted lists
    for data_file,signal_file in zip(data_files,signals_files):

        # Load the match trades tick data
        with open("Data/"+data_file) as f:
            MatchTrades_tickdata = pd.read_json(f)[["timestamp","qty","price"]]
        MatchTrades_tickdata = MatchTrades_tickdata.astype({"qty":"float32","price":"float64"})
        MatchTrades_tickdata["timestamp"] = pd.to_datetime(MatchTrades_tickdata["timestamp"], unit='ms')

        # load the signal data
        with open("Signals/"+f"{folder} Signals/"+signal_file) as f:
            signal_df = pd.read_json(f).astype({"Signal SMA":"int16","Signal EMA":"int16","Signal RSI":"int16"})
        signal_df["timestamp"] = pd.to_datetime(signal_df['timestamp'], unit='ms')

        # column 0 plus columns 10-12 go in as signals, the first ten columns as stats
        bartype = folder
        signals = signal_df.iloc[:,[0,10,11,12]]
        stats_data = signal_df.iloc[:,:10]
        complementary_stats = ["ATR"]

        LabellingProcess = Labelling_Process(bartype,stats_data,signals,complementary_stats,MatchTrades_tickdata,parallel_processing=True)
        run_output = LabellingProcess.return_results()

        if not results:
            # first file: create one entry per signal variable
            for signal in run_output:
                signal_results = run_output[signal]["total_positions"]
                results.append({"signal":signal,"total_positions":signal_results})
        else:
            # later files: append to the entry at the same position
            for index,signal in enumerate(run_output):
                signal_results = run_output[signal]["total_positions"]
                results[index]["total_positions"] = pd.concat([results[index]["total_positions"],signal_results])

    # write the positions accumulated over all files, one JSON per signal variable
    for positions in results:
        signal = positions["signal"]
        raw_data = positions["total_positions"].to_dict(orient="records")
        json_raw_data = json_util.dumps(raw_data)

        with open(f"Positions/{folder}/{signal}.json", "w") as f:
            f.write(json_raw_data)

TickBars
VolumeBars
VolumeImbalanceBars
