# Install dependencies

In [33]:
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


# Imports

In [34]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import random
import os
from datetime import datetime, timedelta
import tradermade
import yfinance as yf
from typing import List, TypedDict, Union, Dict

# Constants

In [35]:
# TRADERMADE_API_KEY = os.getenv("TRADERMADE_API_KEY")
# CURRENCY = "XAUUSD"
# REQUIRED_COLUMNS = ['Open', 'High', 'Low', 'Close']
# TICKER = "SPY"
# END_DATE = datetime.now()
# START_DATE = END_DATE - timedelta(days=365)
# INTERVAL = "1d"

# Time window for the mock trades; these are the default start_time / end_time
# arguments of generate_mock_trades.
START_TIME = "2025-07-14 09:30:00"
END_TIME = "2025-07-14 16:00:00"

# Initialization

In [36]:
# tradermade.set_rest_api_key(TRADERMADE_API_KEY)

# Get unstructured data (tick level)

## Get from yfinance (bars)

In [37]:
# df = yf.download(TICKER, start=START_DATE, end=END_DATE, interval=INTERVAL, auto_adjust=False, ignore_tz=True)

### Preprocessing data

#### Handle MultiIndex columns

In [38]:
# if isinstance(df.columns, pd.MultiIndex):
#     df = df.xs(TICKER, axis=1, level=1)
#     df.columns = [col.title() for col in df.columns]
# else:
#     df.columns = [col.title() for col in df.columns]

#### Convert required columns to numeric and drop rows with NaN

In [39]:
# df[REQUIRED_COLUMNS] = df[REQUIRED_COLUMNS].apply(pd.to_numeric, errors='coerce')
# df = df.dropna(subset=REQUIRED_COLUMNS)
# df.index = df.index.tz_localize(None)

## Mock data

In [40]:
def generate_random_trade_times(n_trades, start_time, end_time):
    """Draw ``n_trades`` distinct, chronologically sorted timestamps.

    Timestamps fall on whole-second offsets in ``[start_time, end_time)``.
    Offsets are sampled without replacement, so no two trades share a
    timestamp and ``random.sample`` raises ``ValueError`` when ``n_trades``
    exceeds the number of whole seconds in the window.

    Args:
        n_trades: Number of timestamps to generate.
        start_time: Start of the window (anything ``pd.to_datetime`` accepts).
        end_time: End of the window, exclusive.

    Returns:
        List of ``pd.Timestamp`` in ascending order.
    """
    window_open = pd.to_datetime(start_time)
    window_close = pd.to_datetime(end_time)
    window_seconds = int((window_close - window_open).total_seconds())

    # Sample second offsets without replacement, then order them in time
    offsets = random.sample(range(window_seconds), n_trades)
    offsets.sort()

    return [window_open + timedelta(seconds=offset) for offset in offsets]


def generate_mock_trades(num_trades=1000, start_price=100.0, start_time=START_TIME, end_time=END_TIME):
    """Generate a mock tick-level trade tape.

    Prices follow a random walk with normally distributed steps (mean 0,
    standard deviation 0.05), rounded to 2 decimals after every step.
    Volumes are uniform random integers in ``[1, 1000)``.

    Args:
        num_trades: Number of trades (rows) to generate.
        start_price: Price of the first trade.
        start_time: Start of the window the trade timestamps are drawn from.
        end_time: End of the window the trade timestamps are drawn from.

    Returns:
        DataFrame with columns ``timestamp``, ``price`` and ``volume``,
        one row per trade, in chronological order.
    """
    # Use num_trades here (it used to be hard-coded to 1000) so the three
    # columns always have the same length.
    timestamps = generate_random_trade_times(
        n_trades=num_trades,
        start_time=start_time,
        end_time=end_time
    )

    # Seed the walk only when at least one trade is requested, so that
    # num_trades=0 yields an empty frame instead of a length mismatch.
    prices = [start_price] if num_trades > 0 else []
    for _ in range(1, num_trades):
        # Simulate small price changes
        change = np.random.normal(loc=0, scale=0.05)
        prices.append(round(prices[-1] + change, 2))

    volumes = np.random.randint(1, 1000, size=num_trades)

    return pd.DataFrame({
        'timestamp': timestamps,
        'price': prices,
        'volume': volumes
    })

# `[::-1]` reverses the rows, so `df` is ordered latest trade first.
df = generate_mock_trades()[::-1]
df

Unnamed: 0,timestamp,price,volume
999,2025-07-14 15:59:47,101.19,52
998,2025-07-14 15:59:31,101.25,668
997,2025-07-14 15:59:23,101.13,606
996,2025-07-14 15:59:15,101.18,250
995,2025-07-14 15:59:10,101.11,686
...,...,...,...
4,2025-07-14 09:31:59,99.95,259
3,2025-07-14 09:31:29,99.96,251
2,2025-07-14 09:30:51,99.93,926
1,2025-07-14 09:30:43,99.96,250


# Convert unstructured data to bars

In [41]:
def convert_to_bars(trade_groups: List[pd.DataFrame]) -> pd.DataFrame: # start_time, end_time, open, close, high, low, volume
    """Collapse each group of trades into one OHLCV bar.

    Args:
        trade_groups: List of DataFrames, one per bar. Each needs the columns
            ``timestamp``, ``price`` and ``volume``, with rows already in the
            order that defines the bar's first (open) and last (close) trade.
            Empty groups are skipped.

    Returns:
        DataFrame with one row per non-empty group and the columns
        ``start_time``, ``end_time``, ``open``, ``close``, ``high``, ``low``,
        ``volume``. The columns are present even when no bars are produced.
    """
    columns = ["start_time", "end_time", "open", "close", "high", "low", "volume"]

    bars = [
        {
            "start_time": group["timestamp"].iloc[0],
            "end_time": group["timestamp"].iloc[-1],
            "open": group["price"].iloc[0],
            "close": group["price"].iloc[-1],
            "high": group["price"].max(),
            "low": group["price"].min(),
            "volume": group["volume"].sum()
        }
        for group in trade_groups
        if len(group) > 0  # .iloc[0] on an empty group would raise IndexError
    ]

    # Passing `columns` keeps the schema stable for an empty result
    return pd.DataFrame(bars, columns=columns)

## Tick bars

In [42]:
def generate_tick_bars(unstructured_data, sampling_rate: int=5) -> pd.DataFrame: 
    """Build tick bars: one bar per ``sampling_rate`` consecutive trades.

    Trades are ordered by ``timestamp`` and cut into consecutive chunks of
    ``sampling_rate`` rows; the last chunk may be shorter. Each chunk is
    turned into a bar by ``convert_to_bars``.

    Args:
        unstructured_data: Trades with ``timestamp``, ``price`` and ``volume`` columns.
        sampling_rate: Number of trades per bar.

    Returns:
        DataFrame of bars as produced by ``convert_to_bars``.
    """
    ordered_trades = unstructured_data.sort_values("timestamp").reset_index(drop=True)

    # Positional start of every chunk: 0, sampling_rate, 2 * sampling_rate, ...
    chunk_starts = range(0, len(ordered_trades), sampling_rate)
    trade_groups = [
        ordered_trades.iloc[start:start + sampling_rate]
        for start in chunk_starts
    ]

    return convert_to_bars(trade_groups)

tick_bars = generate_tick_bars(df)

## Time bars

In [43]:
def generate_time_bars(unstructured_data, sampling_rate: int=60, fill_empty: bool=False) -> pd.DataFrame:    
    """Build time bars by resampling trades into fixed ``sampling_rate``-second bins.

    Args:
        unstructured_data: Trades with ``timestamp``, ``price`` and ``volume``
            columns. The input frame is not modified.
        sampling_rate: Bar length in seconds.
        fill_empty: When True, bars without a close price take the previous
            bar's close as their open/high/low/close, and missing volumes
            become 0. When False, such bars keep NaN prices.

    Returns:
        DataFrame with columns ``start_time``, ``end_time``, ``open``,
        ``high``, ``low``, ``close``, ``volume``. ``end_time`` is one
        millisecond before the start of the next bin.
    """
    # assign() returns a new frame, so the caller's data is left untouched
    trades = unstructured_data.assign(
        timestamp=pd.to_datetime(unstructured_data["timestamp"])
    ).set_index("timestamp")

    # OHLC from the price column, total traded volume per bin
    ohlcv = trades.resample(f"{sampling_rate}s").agg(
        {"price": ["first", "max", "min", "last"], "volume": "sum"}
    )
    ohlcv.columns = ["open", "high", "low", "close", "volume"]

    # The bin label produced by resample is the bar's start time
    bars = ohlcv.reset_index().rename(columns={"timestamp": "start_time"})
    bar_span = timedelta(seconds=sampling_rate)
    bars["end_time"] = bars["start_time"] + bar_span - timedelta(milliseconds=1)

    if fill_empty:
        # Flag empty bars before the forward fill hides them
        is_empty = bars["close"].isna()
        bars["close"] = bars["close"].ffill()
        for price_col in ("open", "high", "low"):
            bars[price_col] = bars[price_col].mask(is_empty, bars["close"])
        bars["volume"] = bars["volume"].fillna(0)

    # Time columns first, then the OHLCV columns
    return bars[["start_time", "end_time", "open", "high", "low", "close", "volume"]]

time_bars = generate_time_bars(df)

## Volume bars

In [44]:
def generate_volume_bars(unstructured_data, sampling_rate: int=1000) -> pd.DataFrame:
    """Build volume bars: a bar closes once its cumulative volume reaches ``sampling_rate``.

    Trades are ordered by timestamp and accumulated. The trade that brings the
    running volume to ``sampling_rate`` or beyond is the last trade of the
    bar, after which the running total restarts from zero. Trades remaining
    after the last complete bar form one final, partial bar.

    Args:
        unstructured_data: Trades with ``timestamp``, ``price`` and ``volume``
            columns. The input frame is not modified.
        sampling_rate: Volume threshold that closes a bar.

    Returns:
        DataFrame with columns ``start_time``, ``end_time``, ``open``,
        ``close``, ``high``, ``low``, ``volume``. An empty input yields an
        empty frame with these columns.
    """
    cols = ["start_time", "end_time", "open", "close", "high", "low", "volume"]

    trades = unstructured_data.copy()
    trades["timestamp"] = pd.to_datetime(trades["timestamp"])
    trades = trades.sort_values("timestamp").reset_index(drop=True)

    # Pass 1: walk the volumes once and record the [start, stop) row range of every bar
    boundaries = []
    bar_start = 0
    cum_volume = 0
    for pos, volume in enumerate(trades["volume"].tolist()):
        cum_volume += volume
        if cum_volume >= sampling_rate:
            boundaries.append((bar_start, pos + 1))
            # reset for next bar
            bar_start = pos + 1
            cum_volume = 0

    # Leftover trades that never reached the threshold become a partial bar
    if bar_start < len(trades):
        boundaries.append((bar_start, len(trades)))

    # Pass 2: aggregate each row range; rows are already in timestamp order
    bars = []
    for start, stop in boundaries:
        group = trades.iloc[start:stop]
        bars.append({
            "start_time": group["timestamp"].iloc[0],
            "end_time": group["timestamp"].iloc[-1],
            "open": group["price"].iloc[0],
            "close": group["price"].iloc[-1],
            "high": group["price"].max(),
            "low": group["price"].min(),
            "volume": group["volume"].sum()
        })

    # Passing `columns` fixes the column order and keeps the schema when there are no bars
    return pd.DataFrame(bars, columns=cols)

volume_bars = generate_volume_bars(df)

## Dollar bars

In [45]:
def generate_dollar_bars(unstructured_data, sampling_rate: int=100000) -> pd.DataFrame:
    """Build dollar bars: a bar closes once its cumulative traded value reaches ``sampling_rate``.

    The traded value of a trade is ``volume * price``. Trades are ordered by
    timestamp and accumulated. The trade that brings the running value to
    ``sampling_rate`` or beyond is the last trade of the bar, after which the
    running total restarts from zero. Trades remaining after the last complete
    bar form one final, partial bar.

    Args:
        unstructured_data: Trades with ``timestamp``, ``price`` and ``volume``
            columns. The input frame is not modified.
        sampling_rate: Traded-value threshold that closes a bar.

    Returns:
        DataFrame with columns ``start_time``, ``end_time``, ``open``,
        ``close``, ``high``, ``low``, ``volume``. An empty input yields an
        empty frame with these columns.
    """
    cols = ["start_time", "end_time", "open", "close", "high", "low", "volume"]

    trades = unstructured_data.copy()
    trades["timestamp"] = pd.to_datetime(trades["timestamp"])
    trades = trades.sort_values("timestamp").reset_index(drop=True)

    # Traded value of every trade, computed once up front
    dollars = (trades["volume"] * trades["price"]).tolist()

    # Pass 1: walk the traded values once and record the [start, stop) row range of every bar
    boundaries = []
    bar_start = 0
    cum_dollars = 0
    for pos, traded_value in enumerate(dollars):
        cum_dollars += traded_value
        if cum_dollars >= sampling_rate:
            boundaries.append((bar_start, pos + 1))
            # reset for next bar
            bar_start = pos + 1
            cum_dollars = 0

    # Leftover trades that never reached the threshold become a partial bar
    if bar_start < len(trades):
        boundaries.append((bar_start, len(trades)))

    # Pass 2: aggregate each row range; rows are already in timestamp order
    bars = []
    for start, stop in boundaries:
        group = trades.iloc[start:stop]
        bars.append({
            "start_time": group["timestamp"].iloc[0],
            "end_time": group["timestamp"].iloc[-1],
            "open": group["price"].iloc[0],
            "close": group["price"].iloc[-1],
            "high": group["price"].max(),
            "low": group["price"].min(),
            "volume": group["volume"].sum()
        })

    # Passing `columns` fixes the column order and keeps the schema when there are no bars
    return pd.DataFrame(bars, columns=cols)

dollar_bars = generate_dollar_bars(df)

# Plot bars

## Tick bars

### Bar Chart

In [46]:
# Tick bars as candlesticks, plotted against the bar number (DataFrame index)
fig = go.Figure()
fig.add_trace(
    go.Candlestick(
        name='Tick Bars',
        x=tick_bars.index,
        open=tick_bars['open'],
        high=tick_bars['high'],
        low=tick_bars['low'],
        close=tick_bars['close'],
    )
)

# update_layout returns the figure, so as the cell's last expression it renders the chart
fig.update_layout(
    title='Tick Bars',
    xaxis_title='Bar',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=True,
    showlegend=True,
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
)

### Time Chart

In [47]:
# Tick bars as candlesticks, plotted against each bar's start time
fig = go.Figure(data=[go.Candlestick(
    x=tick_bars['start_time'],
    open=tick_bars['open'],
    high=tick_bars['high'],
    low=tick_bars['low'],
    close=tick_bars['close'],
    name='Tick Bars'
)])

# The x-axis carries timestamps here, so it is labelled 'Time' (it was 'Bar',
# copied from the bar-index chart).
fig.update_layout(
    title='Tick Bars',
    yaxis_title='Price (USD)',
    xaxis_title='Time',
    xaxis_rangeslider_visible=True, 
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
    showlegend=True,
)

## Time bars

### Bar Chart

In [48]:
# Time bars as candlesticks, plotted against the bar number (DataFrame index)
fig = go.Figure()
fig.add_trace(
    go.Candlestick(
        name='Time Bars',
        x=time_bars.index,
        open=time_bars['open'],
        high=time_bars['high'],
        low=time_bars['low'],
        close=time_bars['close'],
    )
)

# update_layout returns the figure, so as the cell's last expression it renders the chart
fig.update_layout(
    title='Time Bars',
    xaxis_title='Bar',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=True,
    showlegend=True,
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
)

### Time Chart

In [49]:
# Time bars as candlesticks, plotted against each bar's start time
fig = go.Figure(data=[go.Candlestick(
    x=time_bars['start_time'],
    open=time_bars['open'],
    high=time_bars['high'],
    low=time_bars['low'],
    close=time_bars['close'],
    name='Time Bars'
)])

# The x-axis carries timestamps here, so it is labelled 'Time' (it was 'Bar',
# copied from the bar-index chart).
fig.update_layout(
    title='Time Bars',
    yaxis_title='Price (USD)',
    xaxis_title='Time',
    xaxis_rangeslider_visible=True, 
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
    showlegend=True,
)

## Volume bars

### Bar Chart

In [50]:
# Volume bars as candlesticks, plotted against the bar number (DataFrame index)
fig = go.Figure()
fig.add_trace(
    go.Candlestick(
        name='Volume Bars',
        x=volume_bars.index,
        open=volume_bars['open'],
        high=volume_bars['high'],
        low=volume_bars['low'],
        close=volume_bars['close'],
    )
)

# update_layout returns the figure, so as the cell's last expression it renders the chart
fig.update_layout(
    title='Volume Bars',
    xaxis_title='Bar',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=True,
    showlegend=True,
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
)

### Time Chart

In [51]:
# Volume bars as candlesticks, plotted against each bar's start time
fig = go.Figure(data=[go.Candlestick(
    x=volume_bars['start_time'],
    open=volume_bars['open'],
    high=volume_bars['high'],
    low=volume_bars['low'],
    close=volume_bars['close'],
    name='Volume Bars'
)])

# The x-axis carries timestamps here, so it is labelled 'Time' (it was 'Bar',
# copied from the bar-index chart).
fig.update_layout(
    title='Volume Bars',
    yaxis_title='Price (USD)',
    xaxis_title='Time',
    xaxis_rangeslider_visible=True, 
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
    showlegend=True,
)

## Dollar bars

### Bar Chart

In [52]:
# Dollar bars as candlesticks, plotted against the bar number (DataFrame index)
fig = go.Figure()
fig.add_trace(
    go.Candlestick(
        name='Dollar Bars',
        x=dollar_bars.index,
        open=dollar_bars['open'],
        high=dollar_bars['high'],
        low=dollar_bars['low'],
        close=dollar_bars['close'],
    )
)

# update_layout returns the figure, so as the cell's last expression it renders the chart
fig.update_layout(
    title='Dollar Bars',
    xaxis_title='Bar',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=True,
    showlegend=True,
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
)

### Time Chart

In [53]:
# Dollar bars as candlesticks, plotted against each bar's start time
fig = go.Figure(data=[go.Candlestick(
    x=dollar_bars['start_time'],
    open=dollar_bars['open'],
    high=dollar_bars['high'],
    low=dollar_bars['low'],
    close=dollar_bars['close'],
    name='Dollar Bars'
)])

# The x-axis carries timestamps here, so it is labelled 'Time' (it was 'Bar',
# copied from the bar-index chart).
fig.update_layout(
    title='Dollar Bars',
    yaxis_title='Price (USD)',
    xaxis_title='Time',
    xaxis_rangeslider_visible=True, 
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
    showlegend=True,
)

# Labeling Bars

## Fixed-time Horizon Labeling

In [67]:
def fixed_time_horizon_labeling(bars, threshold: float=0.1): # in percent
    """Label each bar by the direction of its open-to-close move.

    The move is ``(close / open - 1) * 100`` percent. A bar is labelled ``1``
    when the move is above ``threshold``, ``-1`` when it is below
    ``-threshold`` and ``0`` otherwise. Bars whose move is NaN (for example
    bars without prices) get ``0``.

    Note: the ``label`` column is written into ``bars`` itself; the same
    object is returned.

    Args:
        bars: DataFrame with ``open`` and ``close`` columns.
        threshold: Minimum absolute move, in percent, for a non-zero label.

    Returns:
        ``bars`` with an added (or overwritten) int64 ``label`` column.
    """
    percent_changed = ((bars['close'] / bars['open']) - 1) * 100 # Percentage of change
    passed_threshold = percent_changed.abs() > threshold

    # np.sign gives the direction directly; abs(x) / x would yield NaN for an
    # infinite move (open == 0). Moves within the threshold, and NaN moves
    # (which never pass the comparison), are set to 0 before the integer cast.
    direction = np.sign(percent_changed).where(passed_threshold, 0)
    bars['label'] = direction.astype("int64")
    return bars

fixed_time_horizon_labeling(time_bars, 0.04)

Unnamed: 0,start_time,end_time,open,high,low,close,volume,label
0,2025-07-14 09:30:00,2025-07-14 09:30:59.999,100.00,100.00,99.93,99.93,2057,-1
1,2025-07-14 09:31:00,2025-07-14 09:31:59.999,99.96,99.96,99.95,99.95,510,0
2,2025-07-14 09:32:00,2025-07-14 09:32:59.999,99.85,99.85,99.81,99.81,760,-1
3,2025-07-14 09:33:00,2025-07-14 09:33:59.999,,,,,0,0
4,2025-07-14 09:34:00,2025-07-14 09:34:59.999,99.81,99.90,99.81,99.90,760,1
...,...,...,...,...,...,...,...,...
385,2025-07-14 15:55:00,2025-07-14 15:55:59.999,100.73,100.76,100.73,100.76,812,0
386,2025-07-14 15:56:00,2025-07-14 15:56:59.999,100.86,100.91,100.86,100.91,102,1
387,2025-07-14 15:57:00,2025-07-14 15:57:59.999,100.89,101.09,100.89,101.09,3040,1
388,2025-07-14 15:58:00,2025-07-14 15:58:59.999,,,,,0,0
