In [1]:
import pandas as pd
import numpy as np

# DEX data preprocessing

In [2]:
# dex data load
dex_eth_swaps = pd.read_csv("uniswap_v3_swaps_ETH_USDC_005.csv")
dex_pepe_swaps = pd.read_csv("uniswap_v3_swaps_PEPE_ETH_03.csv")
dex_usdt_swaps = pd.read_csv("uniswap_v3_swaps_USDC_USDT_001.csv")

In [3]:
dex_eth_swaps.head()

Unnamed: 0,amount0,amount1,amountUSD,id,sqrtPriceX96,tick,timestamp
0,9085.964025,-3.04325,9078.584902,0x9c77ea47fa2468406e1f46a11eabbf22c76befedd4d7...,1450328374817326622785287097948453,196309,1714514447
1,-790.379136,0.265,790.140621,0xea2375e7cec161003ea4b672ae5d818f82b227e529a4...,1450362739124707994226887478035384,196309,1714514447
2,-1977.359867,0.662947,1976.723465,0x4fd2c4bd76d5c34ce8c2204c09694d24a58a06738014...,1450335857054142188306263863701813,196309,1714514459
3,999.395828,-0.334732,998.5762,0x1a2ee88c75fd18372757ccb3d3af3f7c118b12da435c...,1450332077268479479461493629261257,196309,1714514483
4,3003.197644,-1.005864,3000.719004,0x31574290baf7b22dc866c4c9c7247fd9cdb5adc2d74b...,1450320719081264830680607229823237,196309,1714514483


In [4]:
# function to calculate price from sqrtPriceX96
# returns price of token1 in token0
from decimal import Decimal
def sqrtPriceX96_to_price(sqrt_price_x96: int, decimals_token0: int, decimals_token1: int) -> float:
    sqrt_price_decimal = Decimal(sqrt_price_x96) / (1 << 96)
    price = sqrt_price_decimal ** 2
    return float((1 / price) * (Decimal(10) ** (decimals_token1 - decimals_token0)))

In [5]:
# declaration of decimals
decimals_eth = 18
decimals_usdc = 6
decimals_pepe = 18
decimals_usdt = 6

In [6]:
# create normal prices 

# how many usdc does 1 eth cost
dex_eth_swaps["price"] = dex_eth_swaps["sqrtPriceX96"].apply(lambda x: sqrtPriceX96_to_price(x, decimals_usdc, decimals_eth))
# how many pepe does 1 eth cost
dex_pepe_swaps["price"] = dex_pepe_swaps["sqrtPriceX96"].apply(lambda x: sqrtPriceX96_to_price(x, decimals_pepe, decimals_eth))
# how many usdc does 1 usdt cost
dex_usdt_swaps["price"] = dex_usdt_swaps["sqrtPriceX96"].apply(lambda x: sqrtPriceX96_to_price(x, decimals_usdc, decimals_usdt))

In [7]:
# convert timestamp to normal date (UTC)
dex_eth_swaps["timestamp_norm"] = pd.to_datetime(dex_eth_swaps["timestamp"], unit = "s")
dex_pepe_swaps["timestamp_norm"] = pd.to_datetime(dex_pepe_swaps["timestamp"], unit = "s")
dex_usdt_swaps["timestamp_norm"] = pd.to_datetime(dex_usdt_swaps["timestamp"], unit = "s")

In [9]:
# volume in eth
dex_eth_swaps["volume"] = dex_eth_swaps["amount1"].abs()
# volume in pepe
dex_pepe_swaps["volume"] = dex_pepe_swaps["amount0"].abs()
# volume in usdc
dex_usdt_swaps["volume"] = dex_usdt_swaps["amount0"].abs()

In [10]:
# save only necessary columns
dex_eth = dex_eth_swaps[["timestamp_norm", "price", "volume"]]
dex_pepe = dex_pepe_swaps[["timestamp_norm", "price", "volume"]]
dex_usdt = dex_usdt_swaps[["timestamp_norm", "price", "volume"]]

In [11]:
dex_eth = dex_eth.rename(columns={"timestamp_norm":"timestamp"})
dex_pepe = dex_pepe.rename(columns={"timestamp_norm":"timestamp"})
dex_usdt = dex_usdt.rename(columns={"timestamp_norm":"timestamp"})

In [12]:
dex_eth

Unnamed: 0,timestamp,price,volume
0,2024-04-30 22:00:47,2984.190063,3.043250
1,2024-04-30 22:00:47,2984.048653,0.265000
2,2024-04-30 22:00:59,2984.159273,0.662947
3,2024-04-30 22:01:23,2984.174827,0.334732
4,2024-04-30 22:01:23,2984.221568,1.005864
...,...,...,...
2286277,2025-04-30 21:58:47,1792.540918,51.245525
2286278,2025-04-30 21:58:59,1792.272124,16.745647
2286279,2025-04-30 21:59:23,1792.271945,0.011165
2286280,2025-04-30 21:59:35,1792.271897,0.002942


In [16]:
# add pair tag for simplicity
dex_eth["pair"] = "ETH/USDC"
dex_pepe["pair"] = "PEPE/ETH"
dex_usdt["pair"] = "USDT/USDC"

In [17]:
# group data, extract low, high, open, close and fill missing values with last available swap price
def group_data(df_orig):
    df = df_orig.copy()
    df.set_index("timestamp", inplace = True)
    df_ohlc = df["price"].resample("1min").ohlc()
    vol = df["volume"].resample("1min").sum()
    df_ohlc["volume"] = vol
    # take the last available close and pass it forward
    df_ohlc["close"] = df_ohlc["close"].ffill()
    # to make data continuous: take previous close as further open
    df_ohlc["open"] = df_ohlc["close"].shift(1)
    df_ohlc["open"] = df_ohlc["open"].fillna(df_ohlc["close"])
    df_ohlc["high"] = df_ohlc[["open", "close", "high", "low"]].max(axis=1)
    df_ohlc["low"] = df_ohlc[["open", "close", "high", "low"]].min(axis=1)
    df_ohlc = df_ohlc.reset_index()
    return df_ohlc



In [18]:
# transform uniswap data to ohlc data
dex_eth_ohlc = group_data(dex_eth)
dex_pepe_ohlc = group_data(dex_pepe)
dex_usdt_ohlc = group_data(dex_usdt)

In [19]:
dex_eth_ohlc

Unnamed: 0,timestamp,open,high,low,close,volume
0,2024-04-30 22:00:00,2984.159273,2984.190063,2984.048653,2984.159273,3.971197
1,2024-04-30 22:01:00,2984.159273,2986.640644,2984.159273,2985.328841,81.584859
2,2024-04-30 22:02:00,2985.328841,2988.374860,2985.328841,2988.357893,66.262816
3,2024-04-30 22:03:00,2988.357893,2988.357893,2988.333778,2988.333778,16.361038
4,2024-04-30 22:04:00,2988.333778,2988.333778,2988.302089,2988.319292,0.680855
...,...,...,...,...,...,...
525595,2025-04-30 21:55:00,1796.065419,1796.065419,1794.406660,1794.406660,80.272641
525596,2025-04-30 21:56:00,1794.406660,1794.417050,1794.351817,1794.417050,5.812522
525597,2025-04-30 21:57:00,1794.417050,1794.420906,1794.334799,1794.334799,4.370486
525598,2025-04-30 21:58:00,1794.334799,1794.334799,1792.272124,1792.272124,119.610989


In [20]:
dex_eth_ohlc.to_csv("ETH_USDC_1m_ohlc.csv", index=False)
dex_pepe_ohlc.to_csv("PEPE_ETH_1m_ohlc.csv", index=False)
dex_usdt_ohlc.to_csv("USDT_USDC_1m_ohlc.csv", index=False)