In [1]:
import clickhouse_connect
import os
import panel as pn
import polars as pl

# Initialise Panel with the plotly extension, the "material" template and
# stretch-to-width sizing as the default for components.
pn.extension("plotly", template="material", sizing_mode="stretch_width")
# Allow up to 200 characters per string cell so the 66-character hashes shown
# in the tables below are not truncated.
pl.Config.set_fmt_str_lengths(200)
# Use the "full" float format for displayed frames. This is the last
# expression of the cell, so its return value is what the cell displays.
pl.Config.set_fmt_float("full")

polars.config.Config

In [2]:
# ClickHouse read-format overrides, applied before any query is issued.
# Reference: https://clickhouse.com/docs/en/integrations/python#read-formats
from clickhouse_connect.datatypes.format import set_read_format

# (ClickHouse type pattern, read format) pairs, applied in this order:
#   IPv*        -> both IPv6 and IPv4 values come back as strings
#   FixedString -> binary values come back as strings
#   Int*        -> intended to read large ints as floats so converting to a
#                  polars dataframe does not hit large-int overflow errors
# NOTE(review): the frames displayed further down still show u8/u32/u64
# columns, so the "Int*" pattern presumably does not cover UInt* types —
# confirm that "float" is an accepted read format for this pattern.
for type_pattern, read_format in (
    ("IPv*", "string"),
    ("FixedString", "string"),
    ("Int*", "float"),
):
    set_read_format(type_pattern, read_format)

### ClickHouse Queries

In [3]:
# Create the ClickHouse client used by every query in this notebook.
# Connection details are read from the HOST, USERNAME and PASSWORD environment
# variables so that no credentials are stored in the notebook itself.
# os.environ.get returns None for an unset variable, so a missing variable is
# passed through to get_client as None rather than raising here.
# NOTE(review): USERNAME is often pre-set by the operating system or shell to
# the logged-in user — confirm it holds the ClickHouse user before running.
client = clickhouse_connect.get_client(
    host=os.environ.get("HOST"),
    username=os.environ.get("USERNAME"),
    password=os.environ.get("PASSWORD"),
    secure=True,
)

In [4]:
# Check mempool for when the blob first appears.
# Selects the last day of mainnet rows with type = 3 from mempool_transaction
# (presumably the EIP-4844 blob-carrying transaction type — confirm), together
# with the sidecar size columns and the number of blob hashes per transaction.
# The displayed sample below shows the same hash reported by several
# meta_client_name values, so one transaction can appear in multiple rows.
mempool_query = """ 
SELECT 
    event_date_time,
    type,
    blob_sidecars_size,
    blob_sidecars_empty_size,
    hash,
    nonce,
    meta_network_name,
    meta_client_name,
    length(blob_hashes) as blob_hashes_length
FROM mempool_transaction 
WHERE event_date_time > NOW() - INTERVAL '1 DAY'
AND type = 3 AND meta_network_name = 'mainnet'
"""
# Run the query; query_df returns the result as a pandas DataFrame (it is
# converted with pl.from_pandas in the next cell).
mempool_df = client.query_df(mempool_query)

In [5]:
# Convert the pandas query result into a Polars DataFrame for the analysis below.
mempool_df_pl: pl.DataFrame = pl.from_pandas(mempool_df)

In [6]:
# Preview: the five mempool rows with the smallest blob_sidecars_size.
(
    mempool_df_pl
    .sort("blob_sidecars_size")
    .head(5)
)

event_date_time,type,blob_sidecars_size,blob_sidecars_empty_size,hash,nonce,meta_network_name,meta_client_name,blob_hashes_length
datetime[ms],u8,u32,u32,str,u64,str,str,u64
2024-03-13 17:17:43.401,3,131072,0,"""0x22d98b4bd35317cd3f0b8cae52873241014762f2321b460ecfda661d36a205e1""",1218,"""mainnet""","""xatu-sentry-blr1-xatu-mimicry""",1
2024-03-13 17:17:43.512,3,131072,0,"""0x22d98b4bd35317cd3f0b8cae52873241014762f2321b460ecfda661d36a205e1""",1218,"""mainnet""","""xatu-sentry-tlv-xatu-mimicry""",1
2024-03-13 17:17:43.528,3,131072,0,"""0x22d98b4bd35317cd3f0b8cae52873241014762f2321b460ecfda661d36a205e1""",1218,"""mainnet""","""xatu-sentry-ams3-xatu-mimicry""",1
2024-03-14 01:59:23.144,3,131072,0,"""0x0bfed33829811c18bd095e3490396da96335006158f521c66af7df9446ca16db""",1448,"""mainnet""","""xatu-sentry-blr1-xatu-mimicry""",1
2024-03-14 01:59:23.422,3,131072,0,"""0x0bfed33829811c18bd095e3490396da96335006158f521c66af7df9446ca16db""",1448,"""mainnet""","""xatu-sentry-scl-xatu-mimicry""",1


In [7]:
# Last day of mainnet rows with type = 3 from
# canonical_beacon_block_execution_transaction (presumably blob-carrying
# transactions that were included in a block — confirm).
# NOTE(review): SELECT * pulls every column, including meta_labels, which the
# next cell drops before converting to Polars; listing only the columns used
# below would make the dependency on the table schema explicit.
beacon_execution_query = """
SELECT 
    *
FROM canonical_beacon_block_execution_transaction 
WHERE event_date_time > NOW() - INTERVAL '1 DAY'
AND type = 3 AND meta_network_name = 'mainnet'
"""

# Run the query; the result is a pandas DataFrame (see the conversion below).
beacon_df_df = client.query_df(beacon_execution_query)

In [8]:
# Convert the pandas result to Polars, leaving out the meta_labels column
# (presumably because its values do not convert cleanly — confirm).
beacon_without_labels = beacon_df_df.drop(columns=["meta_labels"])
beacon_df_df_pl = pl.from_pandas(beacon_without_labels)

In [9]:
# Per-slot summary: earliest slot start time, number of rows in the slot
# ("blob_length"), mean sidecar sizes and one example transaction hash.
slot_summary = beacon_df_df_pl.group_by("slot").agg(
    pl.col("slot_start_date_time").min(),
    pl.len().alias("blob_length"),
    pl.col("blob_sidecars_size").mean(),
    pl.col("blob_sidecars_empty_size").mean(),
    pl.col("hash").first(),
)

# NOTE(review): "blob_util_percent" is computed as empty size / total size,
# i.e. a 0-1 ratio that is 1 when empty size equals total size. If the column
# is meant to express utilisation this looks inverted — confirm the intended
# meaning of blob_sidecars_empty_size.
empty_to_total = pl.col("blob_sidecars_empty_size") / pl.col("blob_sidecars_size")

# Highest ratio first; as the last expression this frame is the cell output.
slot_summary.with_columns(empty_to_total.alias("blob_util_percent")).sort(
    "blob_util_percent", descending=True
)

slot,slot_start_date_time,blob_length,blob_sidecars_size,blob_sidecars_empty_size,hash,blob_util_percent
u32,datetime[ms],u32,f64,f64,str,f64
8626277,2024-03-13 14:15:47,1,524288,524288,"""0x289348c0dde54961840ed6a49c5c873625858ed1cabc249356a34572853cfe4b""",1
8626298,2024-03-13 14:19:59,1,262144,262144,"""0x5d4e98a5d845a86fcd1739872379f0574cdcfd9676a6dc57bff01c1c0edb6d13""",1
8626248,2024-03-13 14:09:59,1,131072,131072,"""0x46b13e19dff432cbb44f9509bf42953797ee522d12a61ae3a5ed5cb13e29ab63""",1
8626254,2024-03-13 14:11:11,1,786432,786432,"""0x586f74a0efcc5c20aa33c406b3566c5a04acd9342094bf0d0847cbacd0e05c14""",1
8626263,2024-03-13 14:12:59,1,786432,786432,"""0x3cdaa359b0f9bdbe7e1d7a196d495c261ac0a35cd7d4191288e97b07a0fcae4e""",1
…,…,…,…,…,…,…
8629114,2024-03-13 23:43:11,1,131072,0,"""0x9f7a822402ab7f24ee16e4918a5ee077276f92901fb798359fceee389c1b6e45""",0
8629629,2024-03-14 01:26:11,1,131072,0,"""0x894dbe12a461a8a9bfc40a1bc6b367ab38d8ac6960f56d184b13aed1094042d8""",0
8629370,2024-03-14 00:34:23,1,131072,0,"""0x75c654c0854e21464dafab2dd919e74c385ff4f2bab0625eeffe9e4451c792c1""",0
8629760,2024-03-14 01:52:23,1,131072,0,"""0xe4944ea39d8f8746f504a9f87aa6ab17753ca97abc75e2d83aa5cfb1b74b4fe1""",0


In [10]:
# Histogram of the number of rows per slot ("blob_length").
# Only the per-slot row count feeds the histogram, so no size aggregates,
# ratio column or sort are computed here: a histogram does not depend on row
# order or on columns other than the one being binned.
beacon_df_df_pl.group_by("slot").agg(
    pl.len().alias("blob_length"),
).plot.hist("blob_length")

In [11]:
# Scatter plot of the per-slot empty/size ratio against the slot number.
# NOTE(review): "blob_util_percent" is empty size divided by total size (a 0-1
# ratio, not a percentage); confirm this is the intended "utilisation" metric.
slot_ratio_frame = (
    beacon_df_df_pl.group_by("slot")
    .agg(
        pl.col("slot_start_date_time").min(),
        pl.len().alias("blob_length"),
        pl.col("blob_sidecars_size").mean(),
        pl.col("blob_sidecars_empty_size").mean(),
    )
    .with_columns(
        blob_util_percent=pl.col("blob_sidecars_empty_size")
        / pl.col("blob_sidecars_size")
    )
    .sort("blob_util_percent", descending=True)
)

slot_ratio_frame.plot.scatter(
    x="slot",
    y="blob_util_percent",
    title="blob util percent by slot",
)

In [12]:
# block + blob size
# Seven days of canonical beacon blocks (mainnet and holesky), LEFT JOINed to
# the blob sidecars of the same slot so that blocks without blobs are kept.
# The join key includes meta_network_name as well as slot: both subqueries
# span two networks, and slot numbers are only unique within a network, so
# joining on slot alone could attach one network's blobs to another network's
# block. meta_network_name_block / meta_network_name_blob are both returned so
# the pairing can be checked in the result.
block_size_query = """
SELECT
    a.event_date_time,
    a.slot,
    a.slot_start_date_time,
    a.epoch,
    a.epoch_start_date_time,
    a.block_total_bytes,
    a.block_total_bytes_compressed,
    a.block_root AS block_root_block,
    a.execution_payload_block_number,
    a.execution_payload_transactions_count,
    a.execution_payload_transactions_total_bytes,
    a.execution_payload_transactions_total_bytes_compressed,
    b.blob_index,
    b.blob_size,
    a.meta_network_name AS meta_network_name_block,
    b.meta_network_name AS meta_network_name_blob
FROM
    (SELECT
        event_date_time,
        slot,
        slot_start_date_time,
        epoch,
        epoch_start_date_time,
        block_total_bytes,
        block_total_bytes_compressed,
        block_root,
        execution_payload_block_number,
        execution_payload_transactions_count,
        execution_payload_transactions_total_bytes,
        execution_payload_transactions_total_bytes_compressed,
        meta_network_name
     FROM canonical_beacon_block
     WHERE slot_start_date_time > NOW() - INTERVAL '7 DAYS'
     AND meta_network_name IN ('mainnet', 'holesky')
    ) AS a
LEFT JOIN
    (SELECT
        event_date_time,
        slot,
        slot_start_date_time,
        epoch,
        block_root,
        blob_index,
        blob_size,
        meta_network_name
     FROM canonical_beacon_blob_sidecar
     WHERE slot_start_date_time > NOW() - INTERVAL '7 DAYS'
     AND meta_network_name IN ('mainnet', 'holesky')
    ) AS b
ON a.slot = b.slot
AND a.meta_network_name = b.meta_network_name
"""

In [13]:
# blob sidecar propagation
# Seven days of rows from beacon_api_eth_v1_events_blob_sidecar for mainnet and
# holesky. Each row carries propagation_slot_start_diff (charted further down
# with a "time (ms)" axis label) together with the reporting meta_client_name,
# so a slot can appear once per reporting client.
blob_propagation_slot: str = """
SELECT
    event_date_time,
    slot_start_date_time,
    propagation_slot_start_diff,
    slot,
    epoch,
    meta_network_name,
    meta_client_name
FROM beacon_api_eth_v1_events_blob_sidecar
WHERE slot_start_date_time > NOW() - INTERVAL '7 DAYS'
AND meta_network_name IN ('mainnet', 'holesky')
"""

In [14]:
# Run the block + blob size query and keep a de-duplicated Polars copy.
# block_size_df is the raw pandas result; block_size_df_pl is the Polars frame
# with exact duplicate rows removed.
block_size_df = client.query_df(block_size_query)
block_size_df_pl: pl.DataFrame = pl.from_pandas(block_size_df)
block_size_df_pl = block_size_df_pl.unique()

In [15]:
# Run the blob propagation query and keep a de-duplicated Polars copy.
# Despite its name, blob_propagation_slot_query holds the pandas result of the
# query, not the query text.
blob_propagation_slot_query = client.query_df(blob_propagation_slot)
blob_propagation_df_pl: pl.DataFrame = (
    pl.from_pandas(blob_propagation_slot_query)
    .unique()
)

In [16]:
# block blob size table: one row per (network, block root).
block_size_df_table = (
    block_size_df_pl.drop("event_date_time")
    # De-duplicate again: rows that differed only in event_date_time are
    # identical once that column is dropped.
    .unique()
    .group_by("meta_network_name_block", "block_root_block")
    .agg(
        # Block-level columns repeat on every joined blob row, so the first
        # value per group is taken; .first() keeps the column name.
        pl.col("slot_start_date_time").first(),
        pl.col("block_total_bytes").first(),
        # how do beacon blocks get compressed? Does the blob part get compressed as well?
        # https://github.com/ethereum/consensus-specs/blob/45b1026cb65dc7f5f45aa38cb89ed63294048484/specs/phase0/p2p-interface.md#compressionencoding
        pl.col("block_total_bytes_compressed").first(),
        pl.col("execution_payload_transactions_total_bytes").first(),
        # Count the joined blob rows rather than taking max(blob_index): the
        # maximum index is one less than the count when indices start at 0,
        # and it cannot tell a block without blobs (unmatched LEFT JOIN row,
        # blob_size null or 0) from a block with a single blob at index 0.
        # Assumes every real sidecar row has blob_size > 0 — TODO confirm.
        (pl.col("blob_size") > 0).sum().alias("blob_count"),
        pl.col("blob_size").sum().alias("blob_size_sum"),
        pl.col("slot").first(),
        pl.col("epoch").first(),
    )
)

In [17]:
# blob propagation table: one row per (network, slot) with the mean
# propagation_slot_start_diff over all reports for that slot and the number
# of reports ("count").
# Starts from blob_propagation_df_pl, the already converted and de-duplicated
# frame, instead of repeating the pandas -> Polars conversion and the dedup.
# The network is part of the grouping key because the query returns both
# mainnet and holesky rows and slot numbers are only unique within a network;
# grouping by slot alone would blend the networks wherever slot numbers
# coincide.
blob_propagation_table = blob_propagation_df_pl.group_by(
    "meta_network_name", "slot"
).agg(
    pl.col("slot_start_date_time").first(),
    pl.col("propagation_slot_start_diff")
    .mean()
    .round(2)
    .alias("mean_propagation_slot_start_diff"),
    pl.len().alias("count"),
    pl.col("epoch").first(),
)

### Panel Charts

In [28]:
def block_size_chart(network: str, group_by_epoch: bool = True):
    """Create a line chart comparing block size and summed blob size over time.

    Reads the module-level ``block_size_df_table``. Both series are divided by
    1000 before plotting. The block series uses ``block_total_bytes`` (not
    ``block_total_bytes_compressed``) and the blob series uses
    ``blob_size_sum``.

    Args:
        network (str): The network name to filter data by
            (``meta_network_name_block``).
        group_by_epoch (bool, optional): If truthy, plot the mean of each
            series per epoch for a cleaner chart; otherwise plot one point per
            block. Defaults to True.

    Returns:
        The plot object produced by ``DataFrame.plot.line``.
    """
    # Filter the dataframe for the specified network
    network_blocks = block_size_df_table.filter(
        pl.col("meta_network_name_block") == network
    )

    # A plain truthiness test is used instead of ``match``/``case True``:
    # literal True/False patterns compare by identity, so a non-bool flag
    # (e.g. 1) would match neither case and the function would return None.
    if group_by_epoch:
        granularity = "per epoch"
        sizes = network_blocks.group_by("epoch").agg(
            # min() gives each epoch a deterministic x position (its earliest
            # slot start) regardless of row order in the input.
            pl.col("slot_start_date_time").min(),
            (pl.col("block_total_bytes").mean() / 1000).alias("block"),
            (pl.col("blob_size_sum").mean() / 1000).alias("blob"),
        )
    else:
        granularity = "per slot"
        sizes = network_blocks.with_columns(
            (pl.col("block_total_bytes") / 1000).alias("block"),
            (pl.col("blob_size_sum") / 1000).alias("blob"),
        )

    # group_by does not guarantee row order, and a line chart connects points
    # in row order, so sort chronologically before plotting.
    return sizes.sort("slot_start_date_time").plot.line(
        x="slot_start_date_time",
        y=["block", "blob"],
        title=f"Block vs Blob size ({network}, {granularity})",
        xlabel="Date",
        ylabel="Size (in kb)",
        line_width=2,
        alpha=0.8,
    )

In [29]:
def blob_propagation_chart(network: str, group_by_epoch: bool = True):
    """Create a line chart of mean blob propagation time for one network.

    Reads the module-level ``blob_propagation_table`` and plots
    ``mean_propagation_slot_start_diff`` against ``slot_start_date_time``.

    Args:
        network (str): The network name to filter data by
            (``meta_network_name``).
        group_by_epoch (bool, optional): If truthy, average the per-slot means
            within each epoch for a cleaner chart; otherwise plot one point
            per slot. Defaults to True.

    Returns:
        The plot object produced by ``DataFrame.plot.line``.
    """
    # Filter the dataframe for the specified network
    network_slots = blob_propagation_table.filter(
        pl.col("meta_network_name") == network
    )

    # A plain truthiness test is used instead of ``match``/``case True``:
    # literal True/False patterns compare by identity, so a non-bool flag
    # (e.g. 1) would match neither case and the function would return None.
    if group_by_epoch:
        granularity = "per epoch"
        propagation = network_slots.group_by("epoch").agg(
            # min() gives each epoch a deterministic x position (its earliest
            # slot start) regardless of row order in the input.
            pl.col("slot_start_date_time").min(),
            pl.col("mean_propagation_slot_start_diff").mean(),
        )
    else:
        granularity = "per slot"
        propagation = network_slots

    # Sort chronologically: a line chart connects points in row order.
    return propagation.sort(by="slot_start_date_time").plot.line(
        x="slot_start_date_time",
        y="mean_propagation_slot_start_diff",
        # Both variants share one well-formed title; the per-slot variant was
        # previously missing its opening parenthesis and said "per epoch".
        title=f"Propagation Time ({network}, {granularity})",
        xlabel="Date",
        ylabel="time (ms)",
        line_width=2,
        alpha=0.8,
    )

### Panel Dashboard

In [30]:
# Row of per-epoch block-vs-blob size charts, mainnet first, then holesky.
block_size = pn.Row(
    *(
        block_size_chart(network_name, group_by_epoch=True)
        for network_name in ("mainnet", "holesky")
    )
)

# One row per network holding its per-epoch blob propagation chart.
holesky_chart = blob_propagation_chart("holesky", group_by_epoch=True)
holesky_blob_prop = pn.Row(holesky_chart)

mainnet_chart = blob_propagation_chart("mainnet", group_by_epoch=True)
mainnet_blob_prop = pn.Row(mainnet_chart)

In [31]:
# Main dashboard column: heading, block-vs-blob size row, then the mainnet
# blob propagation row. This is the object served by the final cell.
dashboard_heading = pn.pane.Markdown(
    "# Blob Dashboard WIP - https://esp.ethereum.foundation/data-challenge-4844"
)
combined_panel = pn.Column(dashboard_heading, block_size, mainnet_blob_prop)

# mainnet doesn't work because there is no data yet
# NOTE(review): mainnet_prop nests combined_panel (which already contains
# mainnet_blob_prop) and then adds mainnet_blob_prop again; it is not shown by
# the final cell, and holesky_blob_prop is not placed in either column —
# confirm the intended layout.
mainnet_notice = pn.pane.Markdown(
    "Mainnet blob propagation data is not available yet. Check back later. Charts are average data over epoch"
)
mainnet_prop = pn.Column(mainnet_notice, combined_panel, mainnet_blob_prop)

In [32]:
# Serve the dashboard: show() launches a local Panel server and opens the
# dashboard in a browser (see the "Launching server at ..." output below).
combined_panel.show()

Launching server at http://localhost:40285


<panel.io.server.Server at 0x7361b4af3110>

Gtk-Message: 23:01:59.355: Failed to load module "canberra-gtk-module"
Gtk-Message: 23:01:59.356: Failed to load module "canberra-gtk-module"


Opening in existing browser session.
