In [1]:
import clickhouse_connect
import os
import panel as pn
import polars as pl

# Load Panel's Plotly extension with the "material" template; components
# stretch to the available width by default.
pn.extension("plotly", template="material", sizing_mode="stretch_width")
# Show up to 200 characters of string columns so the long transaction and blob
# hashes are not truncated in rendered tables.
pl.Config.set_fmt_str_lengths(200)
# Use polars' "full" float display format. This is the cell's last expression,
# so its return value is what the cell displays.
pl.Config.set_fmt_float("full")

polars.config.Config

In [2]:
# Clickhouse Format Settings - https://clickhouse.com/docs/en/integrations/python#read-formats
from clickhouse_connect.datatypes.format import set_read_format

# Read-format overrides, applied in order:
#   IPv*        -> string  (IPv4 and IPv6 values are returned as strings)
#   FixedString -> string  (binary values are returned as strings)
#   Int*        -> float   (intended to avoid large-int overflow errors when
#                           converting to a polars dataframe)
for type_pattern, read_format in (
    ("IPv*", "string"),
    ("FixedString", "string"),
    ("Int*", "float"),
):
    set_read_format(type_pattern, read_format)

### ClickHouse Queries

In [3]:
# Open a secure ClickHouse connection. Host and credentials come from the
# HOST, USERNAME and PASSWORD environment variables (None when unset).
client = clickhouse_connect.get_client(
    secure=True,
    host=os.getenv("HOST"),
    username=os.getenv("USERNAME"),
    password=os.getenv("PASSWORD"),
)

In [4]:
# Mempool blob transactions (type = 3) observed on mainnet in the last 7 days.
# Rows are grouped by transaction hash together with the other selected
# columns, and MIN(event_date_time) keeps the earliest observation per group.
# fill_percentage = 100 - (blob_sidecars_empty_size / blob_sidecars_size) * 100,
# rounded to two decimals.
mempool_query = """ 
SELECT 
    MIN(event_date_time) AS earliest_event_date_time,
    type,
    blob_sidecars_size,
    blob_sidecars_empty_size,
    hash,
    blob_hashes,
    nonce,
    meta_network_name,
    length(blob_hashes) as blob_hashes_length,
    ROUND(100 - (blob_sidecars_empty_size / blob_sidecars_size) * 100, 2) AS fill_percentage,
    from,
    to
FROM mempool_transaction 
WHERE event_date_time > NOW() - INTERVAL '7 DAYS'
AND type = 3 AND meta_network_name = 'mainnet'
GROUP BY hash, type, blob_sidecars_size, blob_sidecars_empty_size, blob_hashes, nonce, meta_network_name, blob_hashes_length, fill_percentage, to, from
"""
# Run the query; the result is a pandas DataFrame (converted to polars in the next cell).
mempool_df = client.query_df(mempool_query)

In [5]:
# Convert the pandas query result into a polars DataFrame for the rest of the analysis.
mempool_df_pl = pl.from_pandas(mempool_df)

In [6]:
# Total number of blob hashes across all mempool blob transactions
mempool_df_pl.select(pl.col("blob_hashes_length").sum())

blob_hashes_length
u64
17783


In [7]:
# Preview the five rows with the smallest blob sidecar size
mempool_df_pl.sort("blob_sidecars_size").head(n=5)

earliest_event_date_time,type,blob_sidecars_size,blob_sidecars_empty_size,hash,blob_hashes,nonce,meta_network_name,blob_hashes_length,fill_percentage,from,to
datetime[ms],u8,u32,u32,str,list[str],u64,str,u64,f64,str,str
2024-03-14 17:32:11.415,3,131072,0,"""0x004ac1bb3700dc99de497b58772274ce7a984783262f2c12ed59b33cd109614d""","[""0x0107778a745713908134c9ff0270d6e59803cf4e37e4ce35126e3ff64d2c30eb""]",614895,"""mainnet""",1,100,"""0x2C169DFe5fBbA12957Bdd0Ba47d9CEDbFE260CA7""","""0xc662c410C0ECf747543f5bA90660f6ABeBD9C8c4"""
2024-03-14 19:32:41.755,3,131072,0,"""0x18502fe8565420b9cd176ffd222bac3279b4d718d228a28ce1b9ab270bae3dca""","[""0x016651e1194d89d64243201a8b19a43d983deb3a7e2e6981370cedb1d28c3bff""]",615779,"""mainnet""",1,100,"""0x2C169DFe5fBbA12957Bdd0Ba47d9CEDbFE260CA7""","""0xc662c410C0ECf747543f5bA90660f6ABeBD9C8c4"""
2024-03-14 22:47:56.085,3,131072,0,"""0x06420b09a0de78541eeb730cf33600bf3129733419f6fe4a79ca1a7887e395f5""","[""0x0140970d5ea35e38c934cd0daf0fe9a09a4a04c56440391196fc21e8484fe2de""]",2246,"""mainnet""",1,100,"""0x0D3250c3D5FAcb74Ac15834096397a3Ef790ec99""","""0xa8CB082A5a689E0d594d7da1E2d72A3D63aDc1bD"""
2024-03-14 15:15:15.805,3,131072,0,"""0xd6378fe334bbdde3a4eb297948f90c2dace72b637f24a6ae1156c510ddbe63e5""","[""0x01827b14616512c3e06054f047cb4b6a77ce3c546a99987fb9c6374bb362e64d""]",613808,"""mainnet""",1,100,"""0x2C169DFe5fBbA12957Bdd0Ba47d9CEDbFE260CA7""","""0xc662c410C0ECf747543f5bA90660f6ABeBD9C8c4"""
2024-03-14 15:19:33.212,3,131072,0,"""0x3b0f64d86b7c9cda1bb5b0578b48f776a4ffba64553788ca88b587ba8bdd4dae""","[""0x01ba1b41431d951287d46917e09de4f1b56cb1de984e41eeca398cb2ba86e732""]",613847,"""mainnet""",1,100,"""0x2C169DFe5fBbA12957Bdd0Ba47d9CEDbFE260CA7""","""0xc662c410C0ECf747543f5bA90660f6ABeBD9C8c4"""


In [8]:
# Blob transactions (type = 3) included in canonical mainnet beacon blocks,
# newest slot first, with the number of blob hashes per transaction.
# NOTE(review): this filters on event_date_time while the other canonical_*
# queries in this notebook filter on slot_start_date_time — confirm which
# column is intended for the 7-day window.
beacon_execution_query = """
SELECT 
slot_start_date_time,
    slot,
    hash,
    blob_hashes,
    length(blob_hashes) as blob_hashes_length,
    type
FROM canonical_beacon_block_execution_transaction 
WHERE event_date_time > NOW() - INTERVAL '7 DAYS'
AND type = 3 AND meta_network_name = 'mainnet'
ORDER BY slot DESC
"""
# Run the query; the result is a pandas DataFrame (converted to polars in the next cell).
beacon_df_df = client.query_df(beacon_execution_query)

In [9]:
# Convert the pandas query result into a polars DataFrame.
beacon_df_df_pl = pl.from_pandas(beacon_df_df)

In [10]:
# Attach mempool observations to the on-chain blob transactions by matching on
# the transaction hash. The inner join keeps only transactions present in both
# frames; any clashing mempool column name receives the "_mempool" suffix.
mempool_beacon_df = beacon_df_df_pl.join(
    mempool_df_pl.select(
        [
            "hash",
            "earliest_event_date_time",
            "fill_percentage",
            "nonce",
            "meta_network_name",
            "from",
            "to",
        ]
    ),
    how="inner",
    on="hash",
    suffix="_mempool",
)

In [11]:
# mempool_beacon_time_diff is the slot start time minus the time the transaction
# was first seen in the mempool.
#   positive -> the tx was already in the mempool before the slot started
#   negative -> the tx was first seen after the slot had started
# blocks_from_mempool_to_beacon expresses that wait as a number of 12-second
# slots, rounded up.
blob_censorship = (
    mempool_beacon_df.with_columns(
        (
            pl.col("slot_start_date_time") - pl.col("earliest_event_date_time")
        ).alias("mempool_beacon_time_diff")
    )
    # Separate with_columns call: this expression reads the column created above.
    .with_columns(
        (pl.col("mempool_beacon_time_diff").dt.total_seconds() / 12)
        .ceil()
        .alias("blocks_from_mempool_to_beacon")
    )
    # Order rows by the x-axis column so the line is drawn chronologically,
    # instead of relying on the plotting backend to re-order rows that were
    # sorted by the time difference.
    .sort(by="slot_start_date_time")
    .plot.line(
        x="slot_start_date_time",
        y="blocks_from_mempool_to_beacon",
        title="Blob Censorship",
        xlabel="Date",
        ylabel="# of blocks before blob is confirmed",
    )
)

In [12]:
# Block + blob size: canonical beacon blocks from the last 7 days (mainnet and
# holesky), left-joined to their blob sidecars so that blocks without blobs are
# kept. Each block row repeats once per matching blob sidecar.
#
# Both subqueries contain rows for two networks, so the join key includes the
# network name as well as the slot; joining on the slot alone would pair a
# block with blob sidecars of the other network whenever slot numbers coincide.
block_size_query = """
SELECT 
    a.event_date_time,
    a.slot,
    a.slot_start_date_time,
    a.epoch,
    a.epoch_start_date_time,
    a.block_total_bytes,
    a.block_total_bytes_compressed,
    a.block_root AS block_root_block,
    a.execution_payload_block_number,
    a.execution_payload_transactions_count,
    a.execution_payload_transactions_total_bytes,
    a.execution_payload_transactions_total_bytes_compressed,
    b.blob_index,
    b.blob_size,
    a.meta_network_name AS meta_network_name_block,
    b.meta_network_name AS meta_network_name_blob
FROM 
    (SELECT 
        event_date_time,
        slot,
        slot_start_date_time,
        epoch,
        epoch_start_date_time,
        block_total_bytes,
        block_total_bytes_compressed,
        block_root,
        execution_payload_block_number,
        execution_payload_transactions_count,
        execution_payload_transactions_total_bytes,
        execution_payload_transactions_total_bytes_compressed,
        meta_network_name
     FROM canonical_beacon_block
     WHERE slot_start_date_time > NOW() - INTERVAL '7 DAYS'
     AND meta_network_name IN ('mainnet', 'holesky')
    ) AS a
LEFT JOIN 
    (SELECT 
        event_date_time,
        slot,
        slot_start_date_time,
        epoch,
        block_root,
        blob_index,
        blob_size,
        meta_network_name
     FROM canonical_beacon_blob_sidecar
     WHERE slot_start_date_time > NOW() - INTERVAL '7 DAYS'
     AND meta_network_name IN ('mainnet', 'holesky')
    ) AS b
ON a.slot = b.slot
AND a.meta_network_name = b.meta_network_name
"""

In [13]:
# Blob sidecar propagation: one row per blob-sidecar event from the last
# 7 days on mainnet and holesky, including the reporting client name and the
# propagation_slot_start_diff value that the propagation charts average.
blob_propagation_slot: str = """
SELECT
    event_date_time,
    slot_start_date_time,
    propagation_slot_start_diff,
    slot,
    epoch,
    meta_network_name,
    meta_client_name
FROM beacon_api_eth_v1_events_blob_sidecar
WHERE slot_start_date_time > NOW() - INTERVAL '7 DAYS'
AND meta_network_name IN ('mainnet', 'holesky')
"""

In [14]:
# Run the block/blob size query; query_df returns a pandas DataFrame.
block_size_df = client.query_df(block_size_query)
# Convert to polars and drop fully duplicated rows.
block_size_df_pl: pl.DataFrame = pl.from_pandas(block_size_df).unique()

In [15]:
# Run the blob propagation query. Despite its name, blob_propagation_slot_query
# holds the query *result* (a pandas DataFrame), not SQL text.
blob_propagation_slot_query = client.query_df(blob_propagation_slot)
# Convert to polars and drop fully duplicated rows.
blob_propagation_df_pl: pl.DataFrame = pl.from_pandas(
    blob_propagation_slot_query
).unique()

In [16]:
# Block / blob size table: one row per (network, block root).
# event_date_time is dropped first so that rows differing only in that column
# collapse under unique(). Block-level columns repeat on every joined blob
# row, so their first value is taken; blob columns are aggregated per block.
block_size_df_table = (
    block_size_df_pl.drop("event_date_time")
    .unique()
    .group_by("meta_network_name_block", "block_root_block")
    .agg(
        [
            pl.col("slot_start_date_time").first(),
            pl.col("block_total_bytes").first(),
            # how do beacon blocks get compressed? Does the blob part get compressed as well?
            # https://github.com/ethereum/consensus-specs/blob/45b1026cb65dc7f5f45aa38cb89ed63294048484/specs/phase0/p2p-interface.md#compressionencoding
            pl.col("block_total_bytes_compressed").first(),
            pl.col("execution_payload_transactions_total_bytes").first(),
            # blob_index is zero-based, so max(blob_index) is one less than the
            # number of blobs. Count the distinct indices of real blob rows
            # instead. Rows produced by the LEFT JOIN for blob-less blocks carry
            # a null or zero blob_size (depending on the server's join-null
            # setting) and are excluded, giving a count of 0 for those blocks.
            # NOTE(review): assumes every real blob sidecar has blob_size > 0.
            pl.col("blob_index")
            .filter(pl.col("blob_size") > 0)
            .n_unique()
            .alias("blob_count"),
            pl.col("blob_size").sum().alias("blob_size_sum"),
            pl.col("slot").first(),
            pl.col("epoch").first(),
        ]
    )
)

In [17]:
# Blob propagation table: one row per (network, slot) with the mean
# propagation_slot_start_diff (rounded to 2 decimals) and the number of
# observations.
# The source frame holds both mainnet and holesky rows, so the network is part
# of the grouping key; grouping by slot alone would average observations from
# different networks together whenever their slot numbers coincide.
# Reuses the already de-duplicated polars frame instead of converting the
# pandas result a second time.
blob_propagation_table = blob_propagation_df_pl.group_by(
    "meta_network_name", "slot"
).agg(
    [
        pl.col("slot_start_date_time").first(),
        pl.col("propagation_slot_start_diff")
        .mean()
        .round(2)
        .alias("mean_propagation_slot_start_diff"),
        pl.len().alias("count"),
        pl.col("epoch").first(),
    ]
)

### Panel Charts

In [18]:
def block_size_chart(network: str, group_by_epoch: bool = True):
    """Create a line chart comparing compressed block size and blob size.

    Reads the module-level ``block_size_df_table``, keeps the rows of one
    network and plots two series over time: ``block`` (compressed block bytes
    divided by 1000) and ``blob`` (summed blob bytes divided by 1000).

    Args:
        network (str): The network name to filter data by
            (compared against ``meta_network_name_block``).
        group_by_epoch (bool, optional): If True, average both series per
            epoch for a cleaner chart; otherwise plot one point per block.
            Any truthy/falsy value is accepted. Defaults to True.

    Returns:
        The plot object produced by ``DataFrame.plot.line``.
    """
    network_blocks = block_size_df_table.filter(
        pl.col("meta_network_name_block") == network
    )

    if group_by_epoch:
        granularity = "per epoch"
        chart_data = network_blocks.group_by("epoch").agg(
            [
                # Earliest slot of the epoch: the table's row order is not
                # defined, so min() gives a stable x position where first()
                # would pick an arbitrary slot.
                pl.col("slot_start_date_time").min(),
                (pl.col("block_total_bytes_compressed").mean() / 1000).alias("block"),
                (pl.col("blob_size_sum").mean() / 1000).alias("blob"),
            ]
        )
    else:
        granularity = "per slot"
        chart_data = network_blocks.with_columns(
            [
                (pl.col("block_total_bytes_compressed") / 1000).alias("block"),
                (pl.col("blob_size_sum") / 1000).alias("blob"),
            ]
        )

    # group_by does not guarantee row order; sort on the x-axis column so the
    # line is drawn chronologically (same as blob_propagation_chart).
    return chart_data.sort(by="slot_start_date_time").plot.line(
        x="slot_start_date_time",
        y=["block", "blob"],
        title=f"(Compressed) Block vs Blob size ({network}, {granularity})",
        xlabel="Date",
        ylabel="Size (in kb)",
        line_width=2,
        alpha=0.8,
    )

In [19]:
def blob_propagation_chart(network: str, group_by_epoch: bool = True):
    """Create a line chart of the mean blob sidecar propagation time for a network.

    Reads the module-level ``blob_propagation_table``, keeps the rows of one
    network and plots ``mean_propagation_slot_start_diff`` over time.

    Args:
        network (str): The network name to filter data by
            (compared against ``meta_network_name``).
        group_by_epoch (bool, optional): If True, average the per-slot means
            per epoch for a cleaner chart; otherwise plot one point per slot.
            Any truthy/falsy value is accepted. Defaults to True.

    Returns:
        The plot object produced by ``DataFrame.plot.line``.
    """
    network_slots = blob_propagation_table.filter(
        pl.col("meta_network_name") == network
    )

    if group_by_epoch:
        granularity = "per epoch"
        # Note: this is an unweighted mean of the per-slot means.
        chart_data = network_slots.group_by("epoch").agg(
            [
                # Earliest slot of the epoch: the table's row order is not
                # defined, so min() gives a stable x position where first()
                # would pick an arbitrary slot.
                pl.col("slot_start_date_time").min(),
                pl.col("mean_propagation_slot_start_diff").mean(),
            ]
        )
    else:
        granularity = "per slot"
        chart_data = network_slots

    # Sort on the x-axis column so the line is drawn chronologically.
    return chart_data.sort(by="slot_start_date_time").plot.line(
        x="slot_start_date_time",
        y="mean_propagation_slot_start_diff",
        title=f"Blob Sidecar Propagation Time ({network}, {granularity})",
        xlabel="Date",
        ylabel="time (ms)",
        line_width=2,
        alpha=0.8,
    )

### Panel Dashboard

In [20]:
# First dashboard row: the mainnet block-vs-blob size chart next to the mainnet
# blob propagation chart, both aggregated per epoch. Despite its name,
# block_size holds the whole row (both charts).
block_size = pn.Row(
    block_size_chart("mainnet", group_by_epoch=True),
    blob_propagation_chart("mainnet", group_by_epoch=True),
)

In [21]:
# Full dashboard layout, top to bottom: a Markdown title, the row of size and
# propagation charts, then the blob censorship chart.
combined_panel = pn.Column(
    pn.pane.Markdown(
        "# Blob Dashboard WIP - https://esp.ethereum.foundation/data-challenge-4844"
    ),
    block_size,
    blob_censorship,
)

In [22]:
# Serve the dashboard: this launches a local Panel server and opens it in a
# browser session (see the cell output for the chosen localhost port).
combined_panel.show()

Launching server at http://localhost:35697


<panel.io.server.Server at 0x7d39fc14dc50>

Gtk-Message: 11:23:40.198: Failed to load module "canberra-gtk-module"
Gtk-Message: 11:23:40.199: Failed to load module "canberra-gtk-module"


Opening in existing browser session.
