# (1) Important Functions

## (1.1) Tick Bars

In [188]:
def generate_tick_bars(df):
    """
    Generate tick bars from tick-level data.
    
    Parameters:
        df (pd.DataFrame): Tick data with 'timestamp', 'price', and 'amount' columns
    
    Returns:
        pd.DataFrame: Tick bars with OHLCV data
    """
    
    assert all(col in df.columns for col in ['timestamp', 'price', 'amount']), "Missing required columns"
    
    # Aggregate all ticks into a single OHLCV bar
    bar = {
        'timestamp': df['timestamp'].iloc[0],        # Start time
        'open': round(df['price'].iloc[0],2),
        'high': round(df['price'].max(),2),
        'low': round(df['price'].min(),2),
        'close': round(df['price'].iloc[-1],2),
        'volume': round(df['amount'].sum(),5)
        #"vwap": round((df['price'] * df['amount']).sum() / df['amount'].sum(),2)
    }
    
    # Convert to DataFrame (optional)
    bar_df = pd.DataFrame([bar])
    
    # Output
    return bar_df

## (1.2) Calculate the properties

In [354]:
class predictors():

    def Relative_strength_index(self):
        n = 14
        
        average_gain= np.full((1,n-1),np.nan)
        average_loss= np.full((1,n-1),np.nan)
    
        for t in range(n,len(self.changes)+1):
            interval = np.array(self.changes[t-n:t])
    
            gain = np.array([])
            loss = np.array([])
    
            for i in interval:
                if (i> 0):
                    gain = np.append(gain,i)
                elif(i< 0): 
                    loss = np.append(loss,i)
            
            # average gain
            try:
                average_gain = np.append(average_gain,mean(gain))
            except:
                average_gain = np.append(average_gain,1)
    
            # average loss
            try:
                average_loss = np.append(average_loss,mean(abs(loss)))
            except:
                average_loss = np.append(average_loss,1) 
    
        RS = np.array(average_gain/average_loss)
        RSI = np.array(100 - (100/(1+RS)))

        res = np.append(np.full((1,1),np.nan),RSI)
        return(res)

    def Average_true_range(self):

        # True Range components
        high_low = self.data['high'] - self.data['low']
        high_close_prev = (self.data['high'] - self.data['close'].shift(1)).abs()
        low_close_prev = (self.data['low'] - self.data['close'].shift(1)).abs()
        
        # True Range
        tr = pd.concat([high_low, high_close_prev, low_close_prev], axis=1).max(axis=1)
        
        # ATR (simple moving average of TR)
        atr = tr.rolling(window=14, min_periods=1).mean()
        
        return atr
    
    def Average_directional_index(self):
        
         # 1. Calculate True Range (TR)
        high_low = self.data['high'] - self.data['low']
        high_close_prev = (self.data['high'] - self.data['close'].shift(1)).abs()
        low_close_prev = (self.data['low'] - self.data['close'].shift(1)).abs()
        tr = pd.concat([high_low, high_close_prev, low_close_prev], axis=1).max(axis=1)
    
        # 2. Calculate directional movements
        plus_dm = self.data['high'] - self.data['high'].shift(1)
        minus_dm = self.data['low'].shift(1) - self.data['low']
    
        plus_dm = np.where((plus_dm > minus_dm) & (plus_dm > 0), plus_dm, 0.0)
        minus_dm = np.where((minus_dm > plus_dm) & (minus_dm > 0), minus_dm, 0.0)
    
        # 3. Smooth TR, +DM, -DM (Wilder's smoothing)
        tr_smooth = pd.Series(tr).rolling(window=14).sum()
        plus_dm_smooth = pd.Series(plus_dm).rolling(window=14).sum()
        minus_dm_smooth = pd.Series(minus_dm).rolling(window=14).sum()
    
        # 4. Calculate +DI and -DI
        plus_di = 100 * (plus_dm_smooth / tr_smooth)
        minus_di = 100 * (minus_dm_smooth / tr_smooth)
    
        # 5. Calculate DX
        dx = (abs(plus_di - minus_di) / (plus_di + minus_di)) * 100
    
        # 6. ADX = smoothed DX
        adx = dx.rolling(window=14).mean()
    
        # Return as DataFrame
        df_out = pd.DataFrame()
        df_out['+DI'] = plus_di
        df_out['-DI'] = minus_di
        df_out['ADX'] = adx
    
        return df_out
    
    def __init__(self,data):

        self.data = data
        
        #### changes from price #####
        self.changes = np.array([])
        close = np.array(data["close"])
        for t in range(1,len(close)):
            self.changes = np.append(self.changes,(close[t]-close[t-1]))

        ### moving averages ###
        SMA_9 = np.array(data["close"].rolling(window=9).mean())
        SMA_21 = np.array(data["close"].rolling(window=21).mean())
        EMA_9 = np.array(data["close"].ewm(span=9, adjust=False, min_periods=9).mean())
        EMA_21 = np.array(data["close"].ewm(span=21, adjust=False, min_periods=9).mean())

        ### Relative Strength Index (RSI) (14) ###
        RSI = self.Relative_strength_index()

        ### ATR (Average True Range) (14) ###
        ATR = self.Average_true_range()
        
        ### ADX (Average Directional Index) (14) ###
        ADX = self.Average_directional_index()
        
        ### predictors ###
        # "MiddleBand":BD["MiddleBand"],"UpperBand":BD["UpperBand"],"LowerBand":BD["LowerBand"],"MACD":MACD,"ATR":ATR,"ADX":ADX, 
        self.predictors = {"timestamp":self.data["timestamp"],"SMA 9":np.round(SMA_9,2),"SMA 21":np.round(SMA_21,2),"EMA 9":np.round(EMA_9,2),"EMA 21":np.round(EMA_21,2),
                             "RSI":np.round(RSI,2),"ATR":np.round(ATR,2),"+DI":np.round(ADX["+DI"],2),"-DI":np.round(ADX["-DI"],2),"ADX":np.round(ADX["ADX"],2)}
        
    def get_predictors(self):
        return(self.predictors)

# (2) Data Preprocessing / Cleaning
This code is used to:
- Data Loading
- Data Preprocessing / Cleaning:
    - Calculate standard bars (Tick bars), based in minute statistics for the thresholds, calculate exponential the moving averages for tresholds.

In [355]:
files_tickdata = os.listdir("Data")
files_tickdata = [f for f in files_tickdata if f != '.ipynb_checkpoints']
print(files_tickdata[2:])

['MatchTrades 2025-08-18 23:37:27 to 2025-08-21 13:36:43.json']


In [299]:
import json
import pandas as pd
import numpy as np
import os
from bson import json_util
from datetime import datetime
import asyncio
import nest_asyncio
import ipaddress
import pymongo
from datetime import datetime, timedelta
from statistics import mean, stdev

## --------------------------------------------------------- ##
##  (1)                 get all the files in the folder      ##
## --------------------------------------------------------- ##

files_tickdata = os.listdir("Data")
files_tickdata = [f for f in files_tickdata if f != '.ipynb_checkpoints']
files_tickdata = (files_tickdata[2:])
print(files_tickdata)
#print("\n")

## --------------------------------------------------------- ##
##  End (1)                                                  ##
## --------------------------------------------------------- ##

for file in files_tickdata:

    ## ----------------------------------------------------------- ##
    ##  (2)  Open files and Enforce Column Types through pandas    ##
    ## ----------------------------------------------------------- ##
    
    # Load the data
    # amount = float32
    # price = float64
    
    with open("Data/"+file) as f:
        MatchTrades_tickdata_df = pd.read_json(f).astype({"amount":"float32","price":"float64"})
        MatchTrades_tickdata_df["timestamp"] = pd.to_datetime(MatchTrades_tickdata_df["timestamp"], unit='ms')

    #MatchTrades_tickdata_df = MatchTrades_tickdata_df.iloc[0:1000][["timestamp","amount","price"]]
    #print(MatchTrades_tickdata_df)
    
    ## ----------------------------------------------------------- ##
    ##     End (2)                                                 ##
    ## ----------------------------------------------------------- ##
    
    ## ------------------------------------------------- ##
    ##  (3)  Calculate standard bars (Tick bars)         ##
    ## ------------------------------------------------- ##
    
    ## Convert timestamp from milliseconds to datetime
    df = MatchTrades_tickdata_df
    
    # create the minute aggregated data
    # volume = float32
    # n_ticks = int16
    
    minute_tick_bars = (
        df
        .set_index('timestamp')
        .resample('1min')
        .agg({'amount': 'sum'})
        .rename(columns={'amount': 'volume'})
        .assign(n_ticks=lambda x: df.set_index('timestamp').resample('1min').size())
        .astype({
            "volume": "float32",
            "n_ticks": "int16"
        })
    ).reset_index()
    
    minute_tick_bars["timestamp"] = pd.to_datetime(minute_tick_bars["timestamp"], unit='s')
    
    ## calculate exponential the moving averages for tresholds
    # volume_per_min = float32
    # n_ticks_per_min = int16
    
    volume_per_min = (minute_tick_bars["volume"]).ewm(span=60,adjust=False).mean().round(5).astype("float32")
    n_ticks_per_min = (minute_tick_bars["n_ticks"]).ewm(span=60,adjust=False).mean().round(0).astype("int16")
    
    ## minute data for creating the tick bars
    df_ema = pd.DataFrame({
        "timestamp": minute_tick_bars["timestamp"],
        "volume_per_min": volume_per_min,
        "n_ticks_per_min": n_ticks_per_min
    })
    #print(df_ema)
    
    ## create dataframes for creating Tick  bars 
    Tickdata_MatchTrades_TB = pd.DataFrame() # Tick data for the Tick Bar (TB)
    TickBars_df = pd.DataFrame()
    
    for row in MatchTrades_tickdata_df.itertuples(index=True, name="Trade"):
    
        timestamp = row.timestamp
        
        try:
            if timestamp.replace(second=0, microsecond=0) == df_ema.iloc[0]["timestamp"]:
                n_ticks_per_min = df_ema.iloc[0]["n_ticks_per_min"]
                df_ema = df_ema.drop(0).reset_index(drop=True)
        except:
            pass
    
        Tickdata_MatchTrades_TB = pd.concat([Tickdata_MatchTrades_TB, pd.DataFrame([row])], ignore_index=True)
        
        if len(Tickdata_MatchTrades_TB) >= int(n_ticks_per_min/5):
            TickBar = generate_tick_bars(Tickdata_MatchTrades_TB)
            TickBar["ticks"] = len(Tickdata_MatchTrades_TB)
            TickBars_df = pd.concat([TickBars_df, TickBar], ignore_index=True)
            Tickdata_MatchTrades_TB = pd.DataFrame()

    TickBars_df["ticks"] = TickBars_df["volume"].astype("int16")
    TickBars_df["timestamp"] =  TickBars_df["timestamp"].apply(lambda x: int(x.timestamp()* 1000))
    
    ## ------------------------------------------------- ##
    ##  (3)  End                                         ##
    ## ------------------------------------------------- ##
    
    ## ------------------------------------------------- ##
    ##  (4)  Save data                                   ##
    ## ------------------------------------------------- ##
    
    raw_data = TickBars_df.to_dict(orient="records")
    json_raw_data = json_util.dumps(raw_data)
    
    end_ts = float(TickBars_df.iloc[0]["timestamp"])/1000
    start_ts = float(TickBars_df.iloc[-1]["timestamp"])/1000
    
    with open("TickBars/"+"TickBars_df "+ datetime.fromtimestamp(end_ts).strftime("%Y-%m-%d %H:%M:%S") + " to " + datetime.fromtimestamp(start_ts).strftime("%Y-%m-%d %H:%M:%S") +".json", "w") as f:
                f.write(json_raw_data)
    
    ## ------------------------------------------------- ##
    ##  (4)  End                                         ##
    ## ------------------------------------------------- ##


['MatchTrades 2025-08-18 23:37:27 to 2025-08-21 13:36:43.json']


# (3) Feature Preparation

## This code is used to:
- Calculate the properties from Tick bars:
    - Relative strength index (RSI).
    - Average true range (ATR).
    - Average directional index (ADI).
    - Simple moving averages (SMA).
    - Exponential moving averages (EMA).
- Create signals for entry and exiting the market:
    - Simple moving averages (SMA) (9,21).
    - Exponential moving averages (EMA) (9,21).
    - Relative strength index (RSI) (30,70).

In [356]:
files_tickbars = os.listdir("TickBars")
files_tickbars = [f for f in files_tickbars if f != '.ipynb_checkpoints']
files_tickbars = files_tickbars[:1]
print(files_tickbars)

['TickBars_df 2025-08-18 23:37:27 to 2025-08-21 13:36:28.json']


In [357]:
for file in files_tickbars:

    # Load the data
    with open("TickBars/"+file) as f:
        TickBars_df = pd.read_json(f)
        TickBars_df["timestamp"] = pd.to_datetime(TickBars_df["timestamp"], unit='ms')
        
    # calculating the predictors for 
    calc_pred = predictors(TickBars_df)
    pred_df = pd.DataFrame(calc_pred.get_predictors())
    pred_df["timestamp"] =  pred_df["timestamp"].apply(lambda x: int(x.timestamp()* 1000))
    
    ##########################################
    ##              Save data               ##
    
    raw_data = pred_df.to_dict(orient="records")
    json_raw_data = json_util.dumps(raw_data)
    
    end_ts = float(pred_df.iloc[0]["timestamp"])/1000
    start_ts = float(pred_df.iloc[-1]["timestamp"])/1000
    
    with open("Features/"+"Features df "+ datetime.fromtimestamp(end_ts).strftime("%Y-%m-%d %H:%M:%S") + " to " + datetime.fromtimestamp(start_ts).strftime("%Y-%m-%d %H:%M:%S") +".json", "w") as f:
                f.write(json_raw_data)
    
    ##########################################
    ##########################################
    
    # removing the firsts rows with Nan values
    signal_df = pred_df.iloc[26:]

    # -------------------------------------------------------------------------------- #
    #             creating a signal to buy and sell from the predictors                #
    # -------------------------------------------------------------------------------- #
    

    # SMA 9,21 signals
    
    # 1 = SMA 9 crosses above SMA 21 (bullish)
    signal_df['Signal SMA'] = ((signal_df['SMA 9'] > signal_df['SMA 21']) & 
                   (signal_df['SMA 9'].shift(1) <= signal_df['SMA 21'].shift(1))).astype(int)
    
    # -1 = SMA 9 crosses below SMA 21 (bearish)
    signal_df['Signal SMA'] -= ((signal_df['SMA 9'] < signal_df['SMA 21']) & 
                    (signal_df['SMA 9'].shift(1) >= signal_df['SMA 21'].shift(1))).astype(int)
    
    # Bullish crossover: EMA 9 crosses ABOVE EMA 21
    signal_df.loc[
        (signal_df['EMA 9'] > signal_df['EMA 21']) & 
        (signal_df['EMA 9'].shift(1) <= signal_df['EMA 21'].shift(1)),
        "Signal EMA"
    ] = 1
    
    # Bearish crossover: EMA 9 crosses BELOW EMA 21
    signal_df.loc[
        (signal_df['EMA 9'] < signal_df['EMA 21']) & 
        (signal_df['EMA 9'].shift(1) >= signal_df['EMA 21'].shift(1)),
        "Signal EMA"
    ] = -1
    
    # Fill the rest with 0 (no signal)
    signal_df["Signal EMA"] = signal_df["Signal EMA"].fillna(0).astype(int)
    
    # Generate signals for RSI
    signal_df['Signal RSI'] = np.where(signal_df['RSI'] < 30, 1, np.where(signal_df['RSI'] > 70, -1, 0))
    
    ##########################################
    ##          Save signals data           ##
    
    raw_data = signal_df.to_dict(orient="records")
    json_raw_data = json_util.dumps(raw_data)
    
    end_ts = float(signal_df.iloc[0]["timestamp"])/1000
    start_ts = float(signal_df.iloc[-1]["timestamp"])/1000
    
    with open("Signals/"+"Signals_df "+ datetime.fromtimestamp(end_ts).strftime("%Y-%m-%d %H:%M:%S") + " to " + datetime.fromtimestamp(start_ts).strftime("%Y-%m-%d %H:%M:%S") +".json", "w") as f:
                f.write(json_raw_data)

    ##########################################
    ##########################################
    
    # -------------------------------------------------------------------------------- #
    #                                                                                  #
    # -------------------------------------------------------------------------------- #
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signal_df['Signal SMA'] = ((signal_df['SMA 9'] > signal_df['SMA 21']) &
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signal_df['Signal SMA'] -= ((signal_df['SMA 9'] < signal_df['SMA 21']) &
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signal_df.loc[
A value is trying to be set on a copy of a sl

In [362]:
type(pred_df.iloc[26]["ADX"])

numpy.float64