In [1]:
import polars as pl
import os

In [2]:
# Directory layout, resolved relative to where the kernel was started.
# cwd is already absolute, so a purely lexical normalisation is enough below.
cwd = os.path.abspath(os.getcwd())

# One level above the working directory.
parent_dir = os.path.normpath(os.path.join(cwd, os.pardir))

# The 'data' folder that sits two levels above the working directory.
target_dir = os.path.normpath(os.path.join(cwd, os.pardir, os.pardir, 'data'))

In [3]:
# Load the Balancer v2 and Curve Finance swap files from the data directory and
# concatenate each protocol into a single frame. Uniswap v3 is handled in later cells.
# The directory is listed once and sorted: os.listdir returns names in arbitrary
# order, which would otherwise make the row order of the result depend on the filesystem.
data_files = sorted(os.listdir(target_dir))

# every file whose name contains 'balancer-v2'
balancer_df = pl.concat([pl.read_parquet(os.path.join(target_dir, f)) for f in data_files if 'balancer-v2' in f])

# every file whose name contains 'curve-finance'
curve_df = pl.concat([pl.read_parquet(os.path.join(target_dir, f)) for f in data_files if 'curve-finance' in f])

In [4]:
# Stack the Balancer and Curve swaps vertically into one frame.
swaps_df = pl.concat([balancer_df, curve_df], how="vertical")

In [5]:
# Columns kept for every protocol, in the order used for the combined dataset.
master_swaps_col_list = [
    # transaction identity
    'swaps_hash',
    'swaps_from',
    'swaps_blockNumber',
    'swaps_timestamp',
    # input side of the swap
    'swaps_tokenIn_id',
    'swaps_amountIn',
    'swaps_amountInUSD',
    # output side of the swap
    'swaps_tokenOut_id',
    'swaps_amountOut',
    'swaps_amountOutUSD',
    # pool and data source
    'swaps_pool_id',
    'endpoint',
]

In [6]:
# Keep only the master-list columns, in master-list order.
swaps_df = swaps_df.select(master_swaps_col_list)

The Messari Uniswap v3 subgraph data has different swap schema columns. The polygon swap schema has the same columns as Balancer and Curve. 
However, the ethereum, arbitrum, and optimism swap schemas have different columns. The most glaring difference is that while Balancer and Curve
swap schemas have swaps_to and swaps_from, the non-conforming Uniswap v3 schemas only have swaps_account_id, which is equivalent to the swaps_from column.

In [7]:
# Load the Uniswap v3 swaps one chain at a time, because the per-chain schemas differ.
# The directory is listed once and sorted: os.listdir returns names in arbitrary
# order, which would otherwise make the row order of each frame depend on the filesystem.
uniswap_files = sorted(f for f in os.listdir(target_dir) if 'uniswap-v3' in f)

# df1 = arbitrum, df2 = ethereum, df3 = polygon, df4 = optimism
df1, df2, df3, df4 = (
    pl.concat([pl.read_parquet(os.path.join(target_dir, f)) for f in uniswap_files if chain_tag in f])
    for chain_tag in (
        'uniswap-v3-arbitrum',
        'uniswap-v3-ethereum',
        'uniswap-v3-polygon',
        'uniswap-v3-optimism',
    )
)

In [8]:
# The arbitrum (df1), ethereum (df2) and optimism (df4) frames name the sender
# column swaps_account_id; rename it to swaps_from so it matches the master column list.
# df3 (polygon) is not renamed here.
account_to_from = {'swaps_account_id': 'swaps_from'}
df1, df2, df4 = (frame.rename(account_to_from) for frame in (df1, df2, df4))

In [9]:
# Reduce every Uniswap frame to the master-list columns, in master-list order.
df1, df2, df3, df4 = (
    frame.select(master_swaps_col_list) for frame in (df1, df2, df3, df4)
)

In [10]:
# Stack the four per-chain Uniswap frames vertically into one frame.
uniswap_frames = [df1, df2, df3, df4]
uniswap_df = pl.concat(uniswap_frames, how="vertical")

In [11]:
# Append the Uniswap swaps to the swaps already held in swaps_df.
# Gotcha: swaps_df is overwritten with the result, so running this cell again
# without re-running the earlier cells appends uniswap_df a second time.
swaps_df = pl.concat([swaps_df, uniswap_df], how="vertical")

In [12]:
# Convert swaps_timestamp from Unix epoch seconds to a polars datetime column.
# The time unit is passed positionally because its keyword name differs between
# polars releases (`unit` in older versions, `time_unit` in newer ones); the
# positional form works with both.
swaps_df = swaps_df.with_columns(
    pl.from_epoch("swaps_timestamp", "s")
)

In [13]:
# Sanity check on the combined frame: (row count, column count).
swaps_df.shape

(1958328, 12)

In [14]:
# Preview the first five rows of the combined dataset used for the analysis.
swaps_df.head(n=5)

swaps_hash,swaps_from,swaps_blockNumber,swaps_timestamp,swaps_tokenIn_id,swaps_amountIn,swaps_amountInUSD,swaps_tokenOut_id,swaps_amountOut,swaps_amountOutUSD,swaps_pool_id,endpoint
str,str,i64,datetime[μs],str,f64,f64,str,f64,f64,str,str
"""0xa60f173d822a...","""0xf7995b6b0511...",16816549,2023-03-13 03:59:59,"""0x6b175474e890...",2.9991e+22,28643.363471,"""0xae37d54ae477...",2.9918e+22,0.0,"""0xae37d54ae477...","""balancer-v2-et..."
"""0x59d3ff71f65a...","""0xf7995b6b0511...",16816533,2023-03-13 03:56:47,"""0xa0b86991c621...",19994000000.0,19947.7118,"""0x82698aecc9e2...",1.9934e+22,0.0,"""0x82698aecc9e2...","""balancer-v2-et..."
"""0x06cff3367d7c...","""0x1b84c738c33c...",16816532,2023-03-13 03:56:35,"""0xc02aaa39b223...",1.7152e+20,253851.18999,"""0x7f39c581f595...",1.542e+20,276727.804267,"""0x32296969ef14...","""balancer-v2-et..."
"""0x09eff2771522...","""0x1b84c738c33c...",16816531,2023-03-13 03:56:23,"""0xba100000625a...",4.8415e+19,312.568058,"""0xc02aaa39b223...",1.9273e+17,285.239959,"""0x3ebf48cd7586...","""balancer-v2-et..."
"""0x59aee0705b39...","""0x9b88a69886d0...",16816525,2023-03-13 03:55:11,"""0xdac17f958d2e...",75000000000.0,78013.8,"""0x2f4eb100552e...",7.436e+22,0.0,"""0x2f4eb100552e...","""balancer-v2-et..."


Since we only have the swaps_from values, we will focus on the amountInUSD column for analysis. For simplicity, we will drop all rows whose amountInUSD value is zero; these make up about 11.28% of all data points.

In [15]:
# Count the rows whose USD valuation is exactly zero on the input side and on the
# output side of the swap, alongside the total number of rows.
amountInZeroCount = swaps_df.filter(pl.col('swaps_amountInUSD') == 0).height
amountOutZeroCount = swaps_df.filter(pl.col('swaps_amountOutUSD') == 0).height
swaps_df_size = swaps_df.height

for label, value in (
    ('amountInZeroCount', amountInZeroCount),
    ('amountOutZeroCount', amountOutZeroCount),
    ('swaps_df size', swaps_df_size),
):
    print(f'{label}: {value}')

amountInZeroCount: 220858
amountOutZeroCount: 250749
swaps_df size: 1958328


In [16]:
# Share of rows whose swaps_amountInUSD is exactly 0.
# amountInUSDPercent itself stays a fraction between 0 and 1; it is only rendered
# as a percentage in the printed message, which previously showed the raw fraction
# under the label "percent".
amountInUSDPercent = swaps_df.filter(pl.col('swaps_amountInUSD') == 0).height / swaps_df.height
print(f'The percent of swaps_amountInUSD values that are 0 is {amountInUSDPercent:.2%}')

The percent of swaps_amountInUSD values that are 0 is 0.1127788603339175


In [18]:
# Persist the combined swaps dataset as a parquet file inside the data directory.
output_path = f"{target_dir}/swaps_df.parquet"
swaps_df.write_parquet(output_path)