# Install dependencies

In [195]:
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


# imports

In [196]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import random
import os
from datetime import datetime, timedelta
import tradermade
import yfinance as yf
from typing import List, TypedDict, Union, Dict

# Constants

In [197]:
# TRADERMADE_API_KEY = os.getenv("TRADERMADE_API_KEY")
# CURRENCY = "XAUUSD"
# REQUIRED_COLUMNS = ['Open', 'High', 'Low', 'Close']
# TICKER = "SPY"
# END_DATE = datetime.now()
# START_DATE = END_DATE - timedelta(days=365)
# INTERVAL = "1d"

START_TIME = "2025-07-14 09:30:00"
END_TIME = "2025-07-14 16:00:00"

# Initialization

In [198]:
# tradermade.set_rest_api_key(TRADERMADE_API_KEY)

# Get unstructured data (tick level)

## Get from yfinance (bars)

In [199]:
# df = yf.download(TICKER, start=START_DATE, end=END_DATE, interval=INTERVAL, auto_adjust=False, ignore_tz=True)

### Preprocessing data

#### Handle MultiIndex columns

In [200]:
# if isinstance(df.columns, pd.MultiIndex):
#     df = df.xs(TICKER, axis=1, level=1)
#     df.columns = [col.title() for col in df.columns]
# else:
#     df.columns = [col.title() for col in df.columns]

#### Convert required columns to numeric and drop rows with NaN

In [201]:
# df[REQUIRED_COLUMNS] = df[REQUIRED_COLUMNS].apply(pd.to_numeric, errors='coerce')
# df = df.dropna(subset=REQUIRED_COLUMNS)
# df.index = df.index.tz_localize(None)

## Mock data

In [202]:
def generate_random_trade_times(n_trades, start_time, end_time):
    start_ts = pd.to_datetime(start_time)
    end_ts = pd.to_datetime(end_time)

    total_seconds = int((end_ts - start_ts).total_seconds())

    # Generate n_trades random seconds within the time range
    random_seconds = sorted(random.sample(range(total_seconds), n_trades))
    
    return [start_ts + timedelta(seconds=s) for s in random_seconds]


def generate_mock_trades(num_trades=1000, start_price=100.0, start_time=START_TIME, end_time=END_TIME):
    timestamps = generate_random_trade_times(
        n_trades=1000,
        start_time=start_time,
        end_time=end_time
    )

    prices = [start_price]
    for _ in range(1, num_trades):
        # Simulate small price changes
        change = np.random.normal(loc=0, scale=0.05)
        prices.append(round(prices[-1] + change, 2))

    volumes = np.random.randint(1, 1000, size=num_trades)

    df = pd.DataFrame({
        'timestamp': timestamps,
        'price': prices,
        'volume': volumes
    })
    return df

df = generate_mock_trades()[::-1]
df

Unnamed: 0,timestamp,price,volume
999,2025-07-14 15:59:49,100.23,704
998,2025-07-14 15:59:29,100.22,385
997,2025-07-14 15:59:18,100.27,219
996,2025-07-14 15:59:17,100.26,43
995,2025-07-14 15:59:15,100.29,881
...,...,...,...
4,2025-07-14 09:30:36,100.14,508
3,2025-07-14 09:30:34,100.14,732
2,2025-07-14 09:30:29,100.19,322
1,2025-07-14 09:30:09,100.12,621


# Convert unstructured data to bars

In [203]:
def convert_to_bars(trade_groups: List[Dict[str, Union[datetime, float, int]]]) -> pd.DataFrame: # start_time, end_time, open, close, high, low, volume
    bars = [
        {
            "start_time": group["timestamp"].iloc[0],
            "end_time": group["timestamp"].iloc[-1],
            "open": group["price"].iloc[0],
            "close": group["price"].iloc[-1],
            "high": group["price"].max(),
            "low": group["price"].min(),
            "volume": group["volume"].sum()
        } 
    for group in trade_groups]
    
    return pd.DataFrame(bars)

## Tick bars

In [205]:
def generate_tick_bars(unstructured_data, sampling_rate: int=5) -> pd.DataFrame: 
    unstructured_data = unstructured_data.sort_values("timestamp").reset_index(drop=True)
    # Grouping
    trade_groups = [unstructured_data.iloc[i:i + sampling_rate] for i in range(0, len(unstructured_data), sampling_rate)]
    # Generating bars
    return convert_to_bars(trade_groups)

generate_tick_bars(df)

Unnamed: 0,start_time,end_time,open,close,high,low,volume
0,2025-07-14 09:30:01,2025-07-14 09:30:36,100.00,100.14,100.19,100.00,2956
1,2025-07-14 09:31:30,2025-07-14 09:32:20,100.17,100.27,100.27,100.11,2961
2,2025-07-14 09:32:58,2025-07-14 09:33:51,100.38,100.32,100.38,100.27,3004
3,2025-07-14 09:34:01,2025-07-14 09:35:47,100.31,100.40,100.40,100.28,2360
4,2025-07-14 09:36:05,2025-07-14 09:37:03,100.43,100.44,100.44,100.34,2790
...,...,...,...,...,...,...,...
195,2025-07-14 15:51:09,2025-07-14 15:53:21,100.19,100.23,100.26,100.18,2184
196,2025-07-14 15:53:24,2025-07-14 15:54:42,100.19,100.14,100.19,100.12,4429
197,2025-07-14 15:54:55,2025-07-14 15:56:21,100.18,100.20,100.22,100.17,2487
198,2025-07-14 15:56:22,2025-07-14 15:59:13,100.23,100.32,100.32,100.23,2398


## Time bars

In [204]:
def generate_time_bars(unstructured_data, sampling_rate: int=60, fill_empty: bool=False) -> pd.DataFrame:    
    unstructured_data_copy = unstructured_data.copy()
    unstructured_data_copy["timestamp"] = pd.to_datetime(unstructured_data_copy["timestamp"])
    unstructured_data_copy = unstructured_data_copy.set_index("timestamp")
    
    # Perform resampling and calculate OHLCV
    bars = unstructured_data_copy.resample(f"{sampling_rate}s").agg({
        "price": ["first", "max", "min", "last"],
        "volume": "sum"
    })

    bars.columns = ["open", "high", "low", "close", "volume"]
    bars = bars.reset_index()
    
    bars["start_time"] = bars["timestamp"]
    bars["end_time"] = bars["start_time"] + timedelta(seconds=sampling_rate) - timedelta(milliseconds=1)
    bars = bars.drop(columns=["timestamp"])

    # Fill empty time slots
    if fill_empty:
        empty_mask = bars["close"].isna()
        bars["close"] = bars["close"].ffill()

        for col in ["open", "high", "low"]:
            bars.loc[empty_mask, col] = bars.loc[empty_mask, "close"]

        bars["volume"] = bars["volume"].fillna(0)
        
    # Reorder columns
    cols = ["start_time", "end_time"] + [col for col in bars.columns if col not in ["start_time", "end_time"]]
    bars = bars[cols]
    return bars

generate_time_bars(df)

Unnamed: 0,start_time,end_time,open,high,low,close,volume
0,2025-07-14 09:30:00,2025-07-14 09:30:59.999,100.00,100.19,100.00,100.14,2956
1,2025-07-14 09:31:00,2025-07-14 09:31:59.999,100.17,100.17,100.11,100.11,729
2,2025-07-14 09:32:00,2025-07-14 09:32:59.999,100.17,100.38,100.17,100.38,2500
3,2025-07-14 09:33:00,2025-07-14 09:33:59.999,100.36,100.36,100.27,100.32,2736
4,2025-07-14 09:34:00,2025-07-14 09:34:59.999,100.31,100.31,100.28,100.28,756
...,...,...,...,...,...,...,...
385,2025-07-14 15:55:00,2025-07-14 15:55:59.999,100.20,100.20,100.17,100.17,956
386,2025-07-14 15:56:00,2025-07-14 15:56:59.999,100.22,100.27,100.20,100.27,1643
387,2025-07-14 15:57:00,2025-07-14 15:57:59.999,100.29,100.29,100.27,100.27,1057
388,2025-07-14 15:58:00,2025-07-14 15:58:59.999,,,,,0


## Volume bars

In [None]:
def generate_volume_bars(unstructured_data, sampling_rate: int=1000) -> pd.DataFrame:
    unstructured_data = unstructured_data.copy()
    unstructured_data["timestamp"] = pd.to_datetime(unstructured_data["timestamp"])
    unstructured_data = unstructured_data.sort_values("timestamp").reset_index(drop=True)
    
    bars = []
    cum_volume = 0
    bar_trades = []

    for idx, row in unstructured_data.iterrows():
        bar_trades.append(row)
        cum_volume += row["volume"]

        if cum_volume >= sampling_rate:
            group_df = pd.DataFrame(bar_trades)
            group_df = group_df.sort_values("timestamp")

            start_time = group_df["timestamp"].iloc[0]
            end_time = group_df["timestamp"].iloc[-1]

            bars.append({
                "start_time": start_time,
                "end_time": end_time,
                "open": group_df["price"].iloc[0],
                "close": group_df["price"].iloc[-1],
                "high": group_df["price"].max(),
                "low": group_df["price"].min(),
                "volume": group_df["volume"].sum()
            })

            # reset for next bar
            bar_trades = []
            cum_volume = 0

    # Handle leftover trades if any (optional)
    if bar_trades:
        group_df = pd.DataFrame(bar_trades)
        group_df = group_df.sort_values("timestamp")

        start_time = group_df["timestamp"].iloc[0]
        end_time = group_df["timestamp"].iloc[-1]

        bars.append({
            "start_time": start_time,
            "end_time": end_time,
            "open": group_df["price"].iloc[0],
            "close": group_df["price"].iloc[-1],
            "high": group_df["price"].max(),
            "low": group_df["price"].min(),
            "volume": group_df["volume"].sum()
        })

    bars_df = pd.DataFrame(bars)

    # Reorder columns
    cols = ["start_time", "end_time", "open", "close", "high", "low", "volume"]
    bars_df = bars_df[cols]
    return bars_df

generate_volume_bars(df)

Unnamed: 0,start_time,end_time,open,close,high,low,volume
0,2025-07-14 09:30:01,2025-07-14 09:30:09,100.00,100.12,100.12,100.00,1394
1,2025-07-14 09:30:29,2025-07-14 09:30:34,100.19,100.14,100.19,100.14,1054
2,2025-07-14 09:30:36,2025-07-14 09:31:54,100.14,100.11,100.17,100.11,1237
3,2025-07-14 09:32:01,2025-07-14 09:32:15,100.17,100.26,100.26,100.17,1254
4,2025-07-14 09:32:20,2025-07-14 09:32:58,100.27,100.38,100.38,100.27,1246
...,...,...,...,...,...,...,...
374,2025-07-14 15:54:55,2025-07-14 15:55:14,100.18,100.20,100.20,100.18,1488
375,2025-07-14 15:55:49,2025-07-14 15:56:22,100.17,100.23,100.23,100.17,1544
376,2025-07-14 15:56:58,2025-07-14 15:57:50,100.27,100.27,100.29,100.27,1435
377,2025-07-14 15:59:13,2025-07-14 15:59:15,100.32,100.29,100.32,100.29,1299


## Dollar bars

In [211]:
def generate_dollar_bars(unstructured_data, sampling_rate: int=100000) -> pd.DataFrame:
    unstructured_data = unstructured_data.copy()
    unstructured_data["timestamp"] = pd.to_datetime(unstructured_data["timestamp"])
    unstructured_data = unstructured_data.sort_values("timestamp").reset_index(drop=True)
    
    bars = []
    cum_dollars = 0
    bar_trades = []

    for idx, row in unstructured_data.iterrows():
        bar_trades.append(row)
        cum_dollars += row["volume"] * row['price']

        if cum_dollars >= sampling_rate:
            group_df = pd.DataFrame(bar_trades)
            group_df = group_df.sort_values("timestamp")

            start_time = group_df["timestamp"].iloc[0]
            end_time = group_df["timestamp"].iloc[-1]

            bars.append({
                "start_time": start_time,
                "end_time": end_time,
                "open": group_df["price"].iloc[0],
                "close": group_df["price"].iloc[-1],
                "high": group_df["price"].max(),
                "low": group_df["price"].min(),
                "volume": group_df["volume"].sum()
            })

            # reset for next bar
            bar_trades = []
            cum_dollars = 0

    # Handle leftover trades if any (optional)
    if bar_trades:
        group_df = pd.DataFrame(bar_trades)
        group_df = group_df.sort_values("timestamp")

        start_time = group_df["timestamp"].iloc[0]
        end_time = group_df["timestamp"].iloc[-1]

        bars.append({
            "start_time": start_time,
            "end_time": end_time,
            "open": group_df["price"].iloc[0],
            "close": group_df["price"].iloc[-1],
            "high": group_df["price"].max(),
            "low": group_df["price"].min(),
            "volume": group_df["volume"].sum()
        })

    bars_df = pd.DataFrame(bars)

    # Reorder columns
    cols = ["start_time", "end_time", "open", "close", "high", "low", "volume"]
    bars_df = bars_df[cols]
    return bars_df

generate_dollar_bars(df)

Unnamed: 0,start_time,end_time,open,close,high,low,volume
0,2025-07-14 09:30:01,2025-07-14 09:30:09,100.00,100.12,100.12,100.00,1394
1,2025-07-14 09:30:29,2025-07-14 09:30:34,100.19,100.14,100.19,100.14,1054
2,2025-07-14 09:30:36,2025-07-14 09:31:54,100.14,100.11,100.17,100.11,1237
3,2025-07-14 09:32:01,2025-07-14 09:32:15,100.17,100.26,100.26,100.17,1254
4,2025-07-14 09:32:20,2025-07-14 09:32:58,100.27,100.38,100.38,100.27,1246
...,...,...,...,...,...,...,...
377,2025-07-14 15:55:49,2025-07-14 15:56:21,100.17,100.20,100.22,100.17,999
378,2025-07-14 15:56:22,2025-07-14 15:57:35,100.23,100.29,100.29,100.23,1074
379,2025-07-14 15:57:50,2025-07-14 15:59:13,100.27,100.32,100.32,100.27,1324
380,2025-07-14 15:59:15,2025-07-14 15:59:18,100.29,100.27,100.29,100.26,1143


# Plot bars

## Time bars

In [207]:
fig = go.Figure(data=[go.Candlestick(
    x=df.index,
    open=df['Open'],
    high=df['High'],
    low=df['Low'],
    close=df['Close'],
    name='SPY'
)])

fig.update_layout(
    title=f'SPY (S&P 500 ETF) Candlestick Chart - Last 60 Days',
    yaxis_title='Price (USD)',
    xaxis_title='Date',
    xaxis_rangeslider_visible=True, 
    template='seaborn', # other options: plotly, plotly_white, plotly_dark, simple_white
    showlegend=True,
)

KeyError: 'Open'

## Volume bars

In [None]:
# Assume each row = 1 trade (approximation)
tick_size = 1000  # Create a bar every 1000 "ticks"
tick_bars = df.iloc[::tick_size]  # Slice every N rows

cumulative_volume = df['Volume'].cumsum()
volume_bar_size = 10000  # New bar every 10,000 shares traded

# Group by volume thresholds
volume_bars = df.groupby(cumulative_volume // volume_bar_size).agg({
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum'
})

import plotly.graph_objects as go

fig = go.Figure(data=[go.Candlestick(
    x=volume_bars.index,
    open=volume_bars['Open'],
    high=volume_bars['High'],
    low=volume_bars['Low'],
    close=volume_bars['Close']
)])

fig.update_layout(
    title="AAPL Volume Bars (10,000 shares/bar)",
    yaxis_title="Price",
    xaxis_title="Volume Bar Index",
    template="plotly_dark"
)
fig.show()