In [1]:
import poly_api


markets = [
    "0x0cedbcc216d551bbb704f18cbad8932ea427139760e3dcf715c4a134c875474e",
    "0x15dbb75d18291f1341ef0d33328a3bcc72cc0872b4dd478a9ce9b00b6dd9dd2c",
    "0x2118aa01cc2345896b160331291cf0e1d4146da1d9413f1aa214ffbdd0134937",
    "0x2dded342861dd48b5a239c51a9b7b1cfee20cb1641c875eec3388807cea843b8",
    "0x453157d57aa34741278cb3d6f20d3d01b138395c3ae9eb34e2b97137fe0f2eaa",
    "0x62157086c05e1f5ae7c72ae72c0c1fe62cf4d0b045ceefc18357802bcd005ef8",
    "0x7a18e4d613d7bf6afa06aa9eb1f9287ca81e841d77ae6d5fabdab37f0c0b5d6c",
    "0x9d84821a6c8b45fcd9dad9f50f1b0fc6cb76de7a68d7686bfefba697c32a6375",
    "0xa356cb30609f25eab2219e0aeeb64f9e5de471213427ff911264892a205e1c57",
    "0xad172e84e2a01b30406245a32cade80ad56eb6941ed9e5b22c73b1b206dc7e11",
    "0xc9501eac519c7b631d0425ea093a127f4552ad52b8fdf4e591cea89b31aad981",
    "0xedf6eed432b16b5473929350ee322fed560a4ba4a70785ed06331eac724e7826",
    "0xef3446eac0c8baefadf48c8007429643bdb3d81fbcee4074395600cc40a7c682",
    "0xf9d0a1390e11c9119cf084b3b86bd883052932a951bf933cf23bdd1c0700bebd",
]
markets_bodies = [poly_api.poly_client.get_market(m) for m in markets]

tokens: list[str] = []
for market in markets_bodies:
    for token in market["tokens"]:
        tokens.append(token["token_id"])

In [2]:
from datetime import datetime

from poly_dataset_loader import (
    query_token_timeseries_by_tokens,
    query_asks_and_bids_by_timeseries,
)

ts = query_token_timeseries_by_tokens(tokens=tokens[9:10])
ts = [(item[0], datetime.fromisoformat(item[1].split('.')[0]), item[2]) for item in ts]
ts[0:10]

[(136, datetime.datetime(2025, 4, 18, 10, 14, 1), 12),
 (164, datetime.datetime(2025, 4, 18, 10, 15, 2), 12),
 (192, datetime.datetime(2025, 4, 18, 10, 16, 2), 12),
 (220, datetime.datetime(2025, 4, 18, 10, 17, 2), 12),
 (248, datetime.datetime(2025, 4, 18, 10, 18, 2), 12),
 (276, datetime.datetime(2025, 4, 18, 10, 20, 7), 12),
 (304, datetime.datetime(2025, 4, 18, 10, 20, 13), 12),
 (332, datetime.datetime(2025, 4, 18, 10, 22, 8), 12),
 (360, datetime.datetime(2025, 4, 18, 10, 22, 28), 12),
 (388, datetime.datetime(2025, 4, 18, 10, 24, 8), 12)]

In [3]:
len(ts)

3653

In [4]:
str(min(ts, key=lambda x: x[1])[1]), str(max(ts, key=lambda x: x[1])[1])

('2025-04-18 10:14:01', '2025-04-20 23:27:01')

In [19]:
import sqlite3

import pandas as pd
from poly_dataset_loader import create_tables, get_database_dir


db_dir = get_database_dir()
conn = sqlite3.connect(db_dir / "order_book.db")
cursor = conn.cursor()
create_tables(cursor)

ts = pd.read_sql_query(
    """
    SELECT timeseries_id, timestamp, token_id
    FROM token_timeseries
    """,
    conn,
)
ts['timestamp'] = pd.to_datetime(ts['timestamp'])

asks = pd.read_sql_query(
    "SELECT timeseries_id, MAX(price) as ask FROM asks GROUP BY timeseries_id", conn
)
bids = pd.read_sql_query(
    "SELECT timeseries_id, MIN(price) as bid FROM bids GROUP BY timeseries_id", conn
)

conn.close()

asks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76557 entries, 0 to 76556
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timeseries_id  76557 non-null  int64  
 1   ask            76557 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.2 MB


In [20]:
ts.info(), asks.info(), bids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102408 entries, 0 to 102407
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   timeseries_id  102408 non-null  int64         
 1   timestamp      102408 non-null  datetime64[ns]
 2   token_id       102408 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 2.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76557 entries, 0 to 76556
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timeseries_id  76557 non-null  int64  
 1   ask            76557 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76557 entries, 0 to 76556
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timeseries_id  76557 non-null  int64  

(None, None, None)

In [21]:
ts_with_huge_delay = ts[ts["timestamp"].dt.second > 10]
ts_with_huge_delay.info()
ts_with_huge_delay.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2160 entries, 14 to 56095
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timeseries_id  2160 non-null   int64         
 1   timestamp      2160 non-null   datetime64[ns]
 2   token_id       2160 non-null   int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 67.5 KB


Unnamed: 0,timeseries_id,timestamp,token_id
14,15,2025-04-18 09:19:33.791081,1
15,16,2025-04-18 09:19:33.791081,2
62,63,2025-04-18 09:43:12.286864,1
63,64,2025-04-18 09:43:12.286864,2
292,293,2025-04-18 10:20:13.974474,1


In [26]:
ts_filtered = ts.loc[~ts.index.isin(ts_with_huge_delay.index)].copy()

ts_filtered['timestamp'] = ts_filtered['timestamp'].dt.floor('min')

ts_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100248 entries, 0 to 102407
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   timeseries_id  100248 non-null  int64         
 1   timestamp      100248 non-null  datetime64[ns]
 2   token_id       100248 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 3.1 MB


In [27]:
ts_a = ts_filtered.merge(asks, on='timeseries_id', how='left')
ts_ab = ts_a.merge(bids, on='timeseries_id', how='left')
ts_ab.info()
ts_ab.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100248 entries, 0 to 100247
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   timeseries_id  100248 non-null  int64         
 1   timestamp      100248 non-null  datetime64[ns]
 2   token_id       100248 non-null  int64         
 3   ask            74881 non-null   float64       
 4   bid            74881 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 3.8 MB


Unnamed: 0,timeseries_id,timestamp,token_id,ask,bid
0,1,2025-04-18 09:12:00,1,0.8,0.01
1,2,2025-04-18 09:12:00,2,0.99,0.2
2,3,2025-04-18 09:13:00,1,0.8,0.01
3,4,2025-04-18 09:13:00,2,0.99,0.2
4,5,2025-04-18 09:14:00,1,0.8,0.01


In [30]:
ts_ab_filtered = ts_ab.dropna()
ts_ab_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70970 entries, 0 to 100247
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timeseries_id  70970 non-null  int64         
 1   timestamp      70970 non-null  datetime64[ns]
 2   token_id       70970 non-null  int64         
 3   ask            70970 non-null  float64       
 4   bid            70970 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 3.2 MB


In [32]:
import numpy as np


delta = (ts_ab_filtered["ask"] - ts_ab_filtered["bid"])
np.mean(delta), np.std(delta), np.min(delta), np.max(delta)

(np.float64(0.02003883330984924),
 np.float64(0.03967914945818073),
 np.float64(0.003),
 np.float64(0.98))

In [31]:

df_by_token = [
    group.set_index('timestamp')
    for token_id, group in ts_ab_filtered.groupby('token_id')
]

print(f"Created {len(df_by_token)} groups")
df_by_token[0].info()
df_by_token[0].head()

Created 22 groups
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3636 entries, 2025-04-18 09:12:00 to 2025-04-20 23:27:00
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   timeseries_id  3636 non-null   int64  
 1   token_id       3636 non-null   int64  
 2   ask            3636 non-null   float64
 3   bid            3636 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 142.0 KB


Unnamed: 0_level_0,timeseries_id,token_id,ask,bid
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-04-18 09:12:00,1,1,0.8,0.01
2025-04-18 09:13:00,3,1,0.8,0.01
2025-04-18 09:14:00,5,1,0.8,0.01
2025-04-18 09:15:00,7,1,0.8,0.01
2025-04-18 09:16:00,9,1,0.8,0.01
