See readme.md for ideal fields and descriptions

In [1]:
# ! pip install pandas pyarrow
# ! pip install polars

In [2]:
import subprocess
import os
from datetime import datetime, timedelta, timezone
import time
import pyarrow.parquet as pq
import pandas as pd


In [3]:
import polars as pl

In [4]:
# Init timestamps
trailing_days = 1

current_date_utc = datetime.utcnow().date()
# Convert the current date to a Unix timestamp (remaining in UTC)
current_timestamp_utc = datetime(current_date_utc.year, current_date_utc.month, current_date_utc.day, tzinfo=timezone.utc).timestamp()
previous_date_utc = current_date_utc - timedelta(days=trailing_days)
previous_timestamp_utc = datetime(previous_date_utc.year, previous_date_utc.month, previous_date_utc.day, tzinfo=timezone.utc).timestamp()

print('end: ' + str(current_date_utc))
print('start: ' + str(previous_date_utc))

end: 2023-12-17
start: 2023-12-16


In [5]:
# Test doing t24h for Lyra

fields = 'blocks txs'# traces'
rpc_url = 'https://rpc.lyra.finance/'
chain_name = 'lyra'
start_timestamp = int(previous_timestamp_utc)
end_timestamp = int(current_timestamp_utc)

dry_run = 0
requests_per_second_max = -1 # -1 means ignore

In [6]:
# Generate Command
if dry_run == 1:
    dry_txt = '--dry'
else:
    dry_txt = ''
if requests_per_second_max > -1:
    rps_txt = 'requests-per-second ' + str(requests_per_second_max)
else:
    rps_txt = ''

command = f"cryo {fields} --rpc {rpc_url} --timestamps {start_timestamp}:{end_timestamp} --subdirs datatype --label {chain_name} {rps_txt} {dry_txt}"
print(command)

cryo blocks txs --rpc https://rpc.lyra.finance/ --timestamps 1702684800:1702771200 --subdirs datatype --label lyra  


In [7]:
start_time = time.time()
# Run the command using subprocess.run and capture the output
result = subprocess.run(
    command, 
    shell=True, 
    stdout=subprocess.PIPE,  # Capture standard output
    stderr=subprocess.PIPE,  # Capture standard error
    text=True  # Capture output as text (Python 3.7+)
)

# Display the captured output
if result.returncode == 0:
    print("Command succeeded. Output:")
    print(result.stdout)
# else:
#     print("Command failed. Error output:")
#     print(result.stderr)

end_time = time.time()

In [8]:
# Calculate the elapsed time
elapsed_time = end_time - start_time
# Print the elapsed time in seconds
print(f"Elapsed time: {elapsed_time:.4f} seconds")

Elapsed time: 93.2133 seconds


In [9]:
# # Read parquet files
txs = pl.scan_parquet('transactions__' + chain_name + '/*.parquet')
blocks = pl.scan_parquet('blocks__' + chain_name + '/*.parquet')

# Rename the 'gas_used' column to 'block_gas_used' in the 'blocks' DataFrame
blocks = blocks.rename({"gas_used": "block_gas_used"})

# Perform the join on 'block_number' and 'chain_id'
joined_df = blocks.join(
    txs,
    on=["block_number", "chain_id"],
    how="inner"  # You can specify the type of join you want (inner, outer, left, right)
)

# Convert Unix timestamp to datetime and create a new column 'timestamp_dt'
joined_df = joined_df.with_columns(
    pl.from_epoch("timestamp", time_unit="s").alias("timestamp_dt")
)

# Truncate the 'timestamp_dt' column to the day and create a new column 'timestamp_date'
joined_df = joined_df.with_columns(
    pl.col("timestamp_dt").dt.truncate("1d").alias("timestamp_date")
)

In [10]:
# print(blocks.schema)
# print(txs.schema)
print(joined_df.schema)

#test output
joined_pd = joined_df.collect().to_pandas()
joined_pd.tail(5)

print('num blocks: ' + str(joined_pd['block_number'].nunique()))

OrderedDict([('block_hash', Binary), ('author', Binary), ('block_number', UInt32), ('block_gas_used', UInt64), ('extra_data', Binary), ('timestamp', UInt32), ('base_fee_per_gas', UInt64), ('chain_id', UInt64), ('transaction_index', UInt64), ('transaction_hash', Binary), ('nonce', UInt64), ('from_address', Binary), ('to_address', Binary), ('value_binary', Binary), ('value_string', Utf8), ('value_f64', Float64), ('input', Binary), ('gas_limit', UInt64), ('gas_used', UInt64), ('gas_price', UInt64), ('transaction_type', UInt32), ('max_priority_fee_per_gas', UInt64), ('max_fee_per_gas', UInt64), ('success', Boolean), ('timestamp_dt', Datetime(time_unit='us', time_zone=None)), ('timestamp_date', Datetime(time_unit='us', time_zone=None))])
num blocks: 26201


In [17]:
# Create a reference list of consecutive block numbers
reference_block_numbers = list(range(joined_pd['block_number'].min(), joined_pd['block_number'].max() + 1))

# Find the missing block numbers
missing_block_numbers = set(reference_block_numbers) - set(joined_pd['block_number'])

print("Missing block numbers:", missing_block_numbers)

# Convert the missing_block_numbers set to a list and sort it
missing_block_numbers_list = sorted(list(missing_block_numbers))
is_consecutive = all(missing_block_numbers_list[i] == missing_block_numbers_list[i - 1] + 1 for i in range(1, len(missing_block_numbers_list)))

if is_consecutive:
    print("The missing block numbers are consecutive.")
else:
    print("The missing block numbers are not consecutive.")

Missing block numbers: {1351592, 1351593, 1351594, 1351595, 1351596, 1351597, 1351598, 1351599, 1351600, 1351601, 1351602, 1351603, 1351604, 1351605, 1351606, 1351607, 1351608, 1351609, 1351610, 1351611, 1351612, 1351613, 1351614, 1351615, 1351616, 1351617, 1351618, 1351619, 1351620, 1351621, 1351622, 1351623, 1351624, 1351625, 1351626, 1351627, 1351628, 1351629, 1351630, 1351631, 1351632, 1351633, 1351634, 1351635, 1351636, 1351637, 1351638, 1351639, 1351640, 1351641, 1351642, 1351643, 1351644, 1351645, 1351646, 1351647, 1351648, 1351649, 1351650, 1351651, 1351652, 1351653, 1351654, 1351655, 1351656, 1351657, 1351658, 1351659, 1351660, 1351661, 1351662, 1351663, 1351664, 1351665, 1351666, 1351667, 1351668, 1351669, 1351670, 1351671, 1351672, 1351673, 1351674, 1351675, 1351676, 1351677, 1351678, 1351679, 1351680, 1351681, 1351682, 1351683, 1351684, 1351685, 1351686, 1351687, 1351688, 1351689, 1351690, 1351691, 1351692, 1351693, 1351694, 1351695, 1351696, 1351697, 1351698, 1351699, 1351

TypeError: 'set' object is not subscriptable

In [12]:
# Assuming you have a DataFrame named 'joined_df' with the required columns

result_df = joined_df.group_by([pl.col("timestamp_date"), pl.col("chain_id")]).agg(
    num_blocks=pl.col("block_number").n_unique(),
    min_block_number=pl.col("block_number").min(),
    max_block_number=pl.col("block_number").max(),
    min_block_time=pl.col("timestamp").min(),
    max_block_time=pl.col("timestamp").max(),

    num_user_transactions=
        pl.when(pl.col("gas_price") > 0).then(pl.col("transaction_hash")).count(),
    num_success_user_transactions=
        pl.when((pl.col("gas_price") > 0) & pl.col("success")).then(pl.col("transaction_hash")).count(),
    num_senders=pl.col("from_address").filter(pl.col("gas_price") > 0).n_unique(),

    total_gas_used=pl.col("gas_used").sum(),
    user_gas_used=pl.col("gas_used").filter(pl.col("gas_price") > 0).sum(),
    
    l2_fees_base_fees_eth=(pl.col("base_fee_per_gas") * pl.col("gas_used")).sum() / 1e18,
    l2_fees_priority_fees_eth=pl.when(pl.col("gas_price") > 0).then((pl.col("gas_price") - pl.col("base_fee_per_gas")) * pl.col("gas_used")).sum() / 1e18,
    l2_fees_total_fees_eth=(pl.col("gas_price") * pl.col("gas_used")).sum() / 1e18,
)
result_df

In [13]:
#Execute and turn to pandas
result_df = result_df.collect().to_pandas()


In [14]:
# Filter
result_df = result_df[result_df['min_block_time']>= start_timestamp] #seems like 1 block before gets pulled. yolo.

In [15]:
result_df['min_block_time_dt'] = pd.to_datetime(result_df['min_block_time'], unit='s')
result_df['max_block_time_dt'] = pd.to_datetime(result_df['max_block_time'], unit='s')
display(result_df.sort_values(by='timestamp_date',ascending=False))

Unnamed: 0,timestamp_date,chain_id,num_blocks,min_block_number,max_block_number,min_block_time,max_block_time,num_user_transactions,num_success_user_transactions,num_senders,total_gas_used,user_gas_used,l2_fees_base_fees_eth,l2_fees_priority_fees_eth,l2_fees_total_fees_eth,min_block_time_dt,max_block_time_dt
0,2023-12-16,957,26200,1331593,1374792,1702684801,1702771199,71,71,12.0,1398972149,84850397,6.994861e-08,0.001432,0.001432,2023-12-16 00:00:01,2023-12-16 23:59:59
