In [1]:
import polars as pl
import os

# display config: let printed frames show up to 200 characters of each string value
pl.Config.set_fmt_str_lengths(200)

polars.config.Config

In [3]:
# read every parquet file under data/univ3 into a single DataFrame
univ3_parquet_glob = 'data/univ3/*.parquet'
data = pl.read_parquet(univ3_parquet_glob)

In [7]:
# how many distinct block numbers appear in the swaps
data.get_column('blockNumber').n_unique()

12624485

In [None]:
# drop swaps with a buggy USD value: keep only rows whose amountInUSD is
# strictly below the cap
max_amount_in_usd = 150000000
is_below_cap = pl.col('amountInUSD') < max_amount_in_usd
data = data.filter(is_below_cap)

In [6]:
# estimated size of the loaded DataFrame, reported in 'mb'
data.estimated_size(unit='mb')

9228.19331073761

In [10]:
# number of swaps per blockchain (one row per distinct 'blockchain' value)
blockchain_count = data.groupby('blockchain').count()

# an assignment renders nothing; end the cell with the frame so the table is displayed
blockchain_count

blockchain,count
str,u32
"""optimism""",5773664
"""ethereum""",4816330
"""polygon""",6032242
"""arbitrum""",11521040


In [None]:
# count swaps per dex; this supplies the 'count' column selected below.
# NOTE(review): mev_swap_count was not defined in any earlier cell, so the join
# failed on a fresh kernel. Counting rows of `data` per dex is assumed to be the
# intended definition -- confirm it matches the subset meant by "labeled".
mev_swap_count = data.groupby('dex').count()

# sum swap volume in USD grouped by dex
mev_swap_vol_usd = data.groupby('dex').agg(pl.col('amountInUSD').sum())

# merge mev_swap_count and mev_swap_vol_usd on their shared 'dex' key
mev_swap_agg = mev_swap_count.join(mev_swap_vol_usd, on='dex')

# keep the key and rename the two aggregates to their labeled names
mev_swap_agg = mev_swap_agg.select(
    pl.col('dex'),
    pl.col('count').alias('count_labeled'),
    pl.col('amountInUSD').alias('sum_amountInUSD_labeled')
)

### Token Filtering

In [None]:
def filter_tokens(token_symbol: str, dex_df: pl.DataFrame) -> pl.DataFrame:
    """
    Filter dex swaps for specific token swaps

    Keeps every row of ``dex_df`` in which ``token_symbol`` appears on either
    side of the swap, i.e. in ``tokenIn_symbol`` or in ``tokenOut_symbol``.

    Args:
        token_symbol: symbol to look for; compared for exact equality.
        dex_df: swaps frame containing the ``tokenIn_symbol`` and
            ``tokenOut_symbol`` columns.

    Returns:
        The matching rows, each at most once and in their ``dex_df`` order.
        A row whose two symbol columns both equal ``token_symbol`` is returned
        a single time (concatenating two separate filters would return it
        twice and inflate any count or sum computed from the result).
    """
    # one pass over the frame: a row matches if either side carries the symbol
    is_token_in = pl.col('tokenIn_symbol') == token_symbol
    is_token_out = pl.col('tokenOut_symbol') == token_symbol

    return dex_df.filter(is_token_in | is_token_out)