In [None]:
import json
import logging
import os
import time
import traceback
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from decimal import Decimal
from threading import Lock
import duckdb
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from requests.exceptions import HTTPError
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from web3 import Web3
from web3.exceptions import (
    Web3RPCError,
    TransactionNotFound,
    BlockNotFound,
    Web3Exception,
)
from web3.providers.rpc.utils import (
    ExceptionRetryConfiguration,
    REQUEST_RETRY_ALLOWLIST,
)
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()
# Display pandas floats in a 20-character field with thousands separators and
# 4 decimal places.
pd.options.display.float_format = "{:20,.4f}".format

# Send INFO-and-above log records to the console with a timestamp prefix.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.StreamHandler()],
)

# One entry per account: an Infura endpoint plus an Etherscan key.
# Each account has its own INFURA_URL_* variable; all three entries read the
# same ETHERSCAN_API_KEY variable.
ETHERSCAN_API_KEY_DICT = {
    "hearthquake": {
        "INFURA_URL": os.getenv("INFURA_URL_HEARTHQUAKE"),
        "ETHERSCAN_API_KEY": os.getenv("ETHERSCAN_API_KEY"),
    },
    "opensee": {
        "INFURA_URL": os.getenv("INFURA_URL_OPENSEE"),
        "ETHERSCAN_API_KEY": os.getenv("ETHERSCAN_API_KEY"),
    },
    "eco": {
        "INFURA_URL": os.getenv("INFURA_URL_ECO"),
        "ETHERSCAN_API_KEY": os.getenv("ETHERSCAN_API_KEY"),
    },
}

# Defaults used by the module-level `w3` client and the Etherscan helpers.
INFURA_URL = ETHERSCAN_API_KEY_DICT["hearthquake"]["INFURA_URL"]
ETHERSCAN_API_KEY = ETHERSCAN_API_KEY_DICT["hearthquake"]["ETHERSCAN_API_KEY"]
# NOTE(review): presumably the Uniswap V2 factory address - confirm.
UNISWAP_V2_CONTRACT = "0x5C69bEe701ef814a2B6a3EDD4B1652CB9cc5aA6f"

# Output / state locations, all under out/V2.
OUTPUT_FILE = "out/V2/V2_final_tx.jsonl"
STATE_FILE = "out/V2/V2_final_scan_state.json"
TOKEN_NAME_FILE = "out/V2/V2_token_name.json"
V2_EVENT_BY_CONTRACTS = "out/V2/uniswap_v2_pairs_events.json"
DB_PATH = "out/V2/uniswap_v2_events.duckdb"
# In-memory token metadata cache, pre-populated from TOKEN_NAME_FILE when that
# file already exists.
GLOBAL_DICT_TOKEN_SYMBOL = {}
if os.path.exists(TOKEN_NAME_FILE):
    with open(TOKEN_NAME_FILE, "r") as f:
        GLOBAL_DICT_TOKEN_SYMBOL = json.load(f)

# Default Web3 client: 30 s request timeout, and a retry configuration of
# 5 retries on connection / HTTP / timeout errors for allowlisted methods.
w3 = Web3(
    Web3.HTTPProvider(
        endpoint_uri=INFURA_URL,
        request_kwargs={"timeout": 30},
        exception_retry_configuration=ExceptionRetryConfiguration(
            errors=(ConnectionError, HTTPError, TimeoutError),
            retries=5,
            backoff_factor=1,
            method_allowlist=REQUEST_RETRY_ALLOWLIST,
        ),
    )
)

# Fail fast when the endpoint is unreachable, then report the current head block.
assert w3.is_connected(), "Web3 provider connection failed"
print(f"✓ Connected to Ethereum. Latest block: {w3.eth.block_number:,}")

In [None]:
# --------------------
# Helper Function: Get ABI from Etherscan or Disk
# --------------------
def get_abi(contract_address: str, api_key: str) -> list:
    """
    Retrieve the ABI for a contract, using a local on-disk cache.

    The ABI is first looked up in the local 'ABI' folder
    (``ABI/<contract_address>.json``). On a cache miss it is fetched from the
    Etherscan V2 API (chain id 1) and written to disk for future calls.

    Parameters:
        contract_address (str): The contract address (checksum not required here).
        api_key (str): Your Etherscan API key.

    Returns:
        list: The ABI loaded as a Python list.

    Raises:
        RuntimeError: If the HTTP request fails, the response is not valid
            JSON, or Etherscan does not return an ABI for the contract.
    """
    # Ensure the ABI folder exists.
    abi_folder = "ABI"
    os.makedirs(abi_folder, exist_ok=True)

    # The cache file is named after the contract address exactly as given.
    filename = os.path.join(abi_folder, f"{contract_address}.json")

    # Cache hit: no network access needed.
    if os.path.exists(filename):
        with open(filename, "r") as file:
            return json.load(file)

    # Local import: only `requests.exceptions` is imported at module level.
    import requests

    url = f"https://api.etherscan.io/v2/api?chainid=1&module=contract&action=getabi&address={contract_address}&apikey={api_key}"
    try:
        response = requests.get(url, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        raise RuntimeError(
            f"Error fetching ABI for contract {contract_address}: {e}"
        ) from e

    if not isinstance(data, dict) or data.get("status") != "1":
        # On failure Etherscan puts the reason in "result".
        reason = data.get("result") if isinstance(data, dict) else data
        raise RuntimeError(
            f"Error fetching ABI for contract {contract_address}: {reason}"
        )

    try:
        # "result" carries the ABI as a JSON-encoded string.
        abi = json.loads(data["result"])
    except (KeyError, TypeError, ValueError) as e:
        raise RuntimeError(
            f"Error fetching ABI for contract {contract_address}: {e}"
        ) from e

    # Save the ABI for later use.
    with open(filename, "w") as file:
        json.dump(abi, file)
    return abi


# -----------------------
# Helper: Convert event to dict
# -----------------------
def event_to_dict(event):
    """Return a plain-dict copy of ``event`` with nested fields flattened.

    ``args`` (when present) is converted with ``dict()``; ``transactionHash``
    and ``blockHash`` (when present) are replaced by the result of calling
    ``.hex()`` on them. The input mapping itself is not modified.
    """
    flattened = dict(event)
    if "args" in flattened:
        flattened["args"] = dict(flattened["args"])
    for hash_field in ("transactionHash", "blockHash"):
        if hash_field in flattened:
            flattened[hash_field] = flattened[hash_field].hex()
    return flattened


class Web3JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes byte strings as hex text.

    Any ``bytes`` / ``bytearray`` value is emitted as the result of its own
    ``.hex()`` method. ``HexBytes`` is a ``bytes`` subclass, so it is covered
    by the same check without this module having to import the ``HexBytes``
    name. Every other unsupported type falls through to the base class,
    which raises ``TypeError``.
    """

    def default(self, obj):
        # bytes-like (including HexBytes) → hex string
        if isinstance(obj, (bytes, bytearray)):
            return obj.hex()
        # Peel off any other web3-specific types here as needed...
        return super().default(obj)


# -----------------------
# ETHERSCAN VERSION
# Finds the block in which a contract was deployed.
# May be useful later; the result should eventually be stored in JSON.
# -----------------------
def get_contract_creation_block_etherscan(
    contract_address: str, etherscan_api_key: str
) -> int:
    """
    Retrieve the block number in which a contract was created, via Etherscan.

    Queries the Etherscan V2 API (chain id 1, the same endpoint family that
    ``get_abi`` uses) with ``action=getcontractcreation``.

    Parameters:
        contract_address (str): Address of the contract to look up.
        etherscan_api_key (str): Etherscan API key.

    Returns:
        int: The creation block number.

    Raises:
        Exception: If Etherscan reports an error status or returns no
            creation record for the address.
    """
    # Local import: only `requests.exceptions` is imported at module level.
    import requests

    url = (
        "https://api.etherscan.io/v2/api?chainid=1&module=contract&action=getcontractcreation"
        f"&contractaddresses={contract_address}&apikey={etherscan_api_key}"
    )
    response = requests.get(url, timeout=30)
    data = response.json()

    if data.get("status") != "1":
        # Format rather than concatenate: "result" is not guaranteed to be a str.
        raise Exception(
            f"Error fetching creation block: {data.get('result', 'Unknown error')}"
        )

    results = data.get("result") or []
    if not results:
        raise Exception("No contract creation data found.")
    return int(results[0]["blockNumber"])


# -----------------------
# Used to find the block in which a contract was deployed.
# May be useful later; the results are written to a JSON file at the end.
# -----------------------
def get_contract_creation_block_custom(
    start_block=0,
    end_block=100000,
    max_workers=8,
    output_file="contract_deployments.json",
):
    """
    Scan a block range for contract deployments and dump them to a JSON file.

    Every block in ``[start_block, end_block]`` is fetched with full
    transactions through the module-level ``w3`` client. A transaction whose
    ``to`` is ``None`` is treated as a deployment; its receipt supplies the
    contract address.

    Parameters:
        start_block (int): First block to scan (inclusive).
        end_block (int): Last block to scan (inclusive).
        max_workers (int): Number of threads fetching blocks concurrently.
        output_file (str): Path of the JSON file to write.

    Returns:
        None. The list of ``{"block_number", "contract_address"}`` records is
        written to ``output_file``, sorted by block number.
    """

    def process_block(block_number):
        block = w3.eth.get_block(block_number, full_transactions=True)
        found = []
        for tx in block.transactions:
            if tx.to is not None:
                continue
            try:
                receipt = w3.eth.get_transaction_receipt(tx.hash)
                contract_address = receipt.contractAddress
            except Exception as exc:
                # Best effort: one bad receipt must not abort the whole scan,
                # but Ctrl+C (KeyboardInterrupt) is no longer swallowed.
                logging.warning(
                    "Could not read receipt for tx %s in block %s: %s",
                    tx.hash.hex(),
                    block_number,
                    exc,
                )
                continue
            if contract_address:
                found.append(
                    {
                        "block_number": block_number,
                        "contract_address": contract_address,
                    }
                )
        return found

    deployments = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_block, bn)
            for bn in range(start_block, end_block + 1)
        ]
        for future in as_completed(futures):
            deployments.extend(future.result())

    # Futures complete in arbitrary order; sort so the output is reproducible.
    # The sort is stable, so per-block transaction order is kept.
    deployments.sort(key=lambda record: record["block_number"])

    # Save the results to a JSON file
    with open(output_file, "w") as f:
        json.dump(deployments, f, indent=4)


# -- Step 2: Reconstruct an Event’s Signature --
def get_event_signature(event_name: str, abi: list) -> str:
    """
    Return the hex-encoded keccak hash of an event's signature string.

    The first ABI entry of type "event" named ``event_name`` is used. Its
    input types are joined into a signature such as
    "Transfer(address,address,uint256)", which is then hashed.

    Raises:
        ValueError: If the ABI contains no event with that name.
    """
    from eth_utils import keccak, encode_hex

    candidates = (
        entry
        for entry in abi
        if entry.get("type") == "event" and entry.get("name") == event_name
    )
    event_def = next(candidates, None)
    if event_def is None:
        raise ValueError(f"Event {event_name} not found in ABI.")

    # Build the signature string: "Transfer(address,address,uint256)"
    arg_types = ",".join(param["type"] for param in event_def.get("inputs", []))
    return encode_hex(keccak(text=f"{event_name}({arg_types})"))


def block_to_utc(block_number):
    """
    Convert a block number into its UTC timestamp.

    The block is fetched through the module-level ``w3`` client.

    Parameters:
        block_number (int): The block number

    Returns:
        str: The block timestamp in UTC, formatted as an ISO 8601 string
    """
    # Local import: datetime/timezone are not imported at module level.
    from datetime import datetime, timezone

    block = w3.eth.get_block(block_number)
    timestamp = block["timestamp"]
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).isoformat()


def read_and_sort_jsonl(file_path):
    """
    Load a JSONL file and return its records ordered by ``blockNumber``.

    Blank lines are ignored. Lines that are not valid JSON, or that have no
    ``blockNumber`` field, are reported on stdout and skipped. The sort key
    is ``int(blockNumber)``, ascending; records with equal block numbers keep
    their file order.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                try:
                    parsed = json.loads(text)
                except json.JSONDecodeError as err:
                    # Report the offending line, then move on.
                    print(text)
                    print(f"Skipping bad JSON line: {err}")
                else:
                    if "blockNumber" in parsed:
                        records.append(parsed)
                    else:
                        print(f"Skipping line with no blockNumber: {parsed}")

    # blockNumber may be stored as int or as a numeric string; int() handles both.
    records.sort(key=lambda rec: int(rec["blockNumber"]))
    return records


def get_address_abi_contract(contract_address, etherscan_api_key=ETHERSCAN_API_KEY):
    """
    Resolve an address into (checksum address, ABI, web3 contract object).

    Parameters:
        contract_address (str): Contract address in any hex casing.
        etherscan_api_key (str): Key forwarded to ``get_abi`` for ABI lookup.

    Returns:
        tuple: ``(address, contract_abi, contract)`` where ``address`` is the
        checksummed form and ``contract`` is bound to that same address.
    """
    address = w3.to_checksum_address(contract_address)
    contract_abi = get_abi(address, etherscan_api_key)
    # Bind to the checksummed address, not the raw input: web3 rejects
    # non-checksummed hex addresses when constructing a contract.
    contract = w3.eth.contract(address=address, abi=contract_abi)

    return address, contract_abi, contract


# Look up the amount of a token held by an address at one specific block_number.
# It relies on the Etherscan API (to go further: explore reconstructing the balance
# from all the Transfer events, but that is slow).
# Not very useful for the moment.
def get_erc20_balance_at_block(user_address, token_address, block_number):
    """
    Query ERC-20 balance of an address at a specific block.

    Example:
        user_address = "0xe2dFC8F41DB4169A24e7B44095b9E92E20Ed57eD"
        token_address = "0x514910771AF9Ca656af840dff83E8264EcF986CA"
        block_number = 23405236
        balance = get_erc20_balance_at_block(user_address, token_address, block_number)

    Parameters:
        user_address: string, account to check
        token_address: string, address of the ERC-20 token contract
        block_number: int, historical block

    Returns:
        int: raw token balance as returned by ``balanceOf``
        None if the contract's ``name()`` or ``symbol()`` call fails
        (the original note attributes this to proxy contracts)
    """
    token_address, _token_abi, token_contract = get_address_abi_contract(token_address)
    user_address = w3.to_checksum_address(user_address)
    try:
        # name() is called only as a probe; its value is not used.
        token_contract.functions.name().call()
        token_symbol = token_contract.functions.symbol().call()
    except Exception as e:
        print(f"Error {e}")
        print(f"{token_address}")
        return None
    balance = token_contract.functions.balanceOf(user_address).call(
        block_identifier=block_number
    )
    # Single quotes inside the f-string keep this line valid before Python 3.12.
    # NOTE(review): from_wei(..., 'ether') assumes 18 decimals, so the printed
    # amount is mis-scaled for tokens with a different `decimals`; the returned
    # value is the raw balance either way.
    print(
        f"Address {user_address} had {w3.from_wei(balance, 'ether')} of {token_symbol} at block {block_number}"
    )
    return balance


def _save_token_cache(cache, path):
    """Write the token cache to ``path`` atomically; warn instead of raising."""
    # Local import: tempfile is not imported at module level.
    import tempfile

    tmp = None
    try:
        dirn = os.path.dirname(path) or "."
        os.makedirs(dirn, exist_ok=True)
        fd, tmp = tempfile.mkstemp(dir=dirn, text=True)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(cache, f, indent=2, ensure_ascii=False)
        # Same-directory rename, so readers never see a half-written file.
        os.replace(tmp, path)
    except Exception as e:
        print(f"Warning: failed to save token cache: {e}")
        # Do not leave the temporary file behind after a failed write.
        if tmp is not None and os.path.exists(tmp):
            try:
                os.remove(tmp)
            except OSError:
                pass


def _decode_token_text(raw):
    """Turn a name()/symbol() return value into str (handles bytes32-style values)."""
    if isinstance(raw, (bytes, bytearray)):
        return raw.decode("utf-8", errors="ignore").rstrip("\x00")
    return str(raw)


def get_token_name_by_contract(
    token_address,
    TOKEN_NAME_FILE=TOKEN_NAME_FILE,
    proxy_address=None,
    global_cache=GLOBAL_DICT_TOKEN_SYMBOL,
):
    """
    Return token metadata for ``token_address``, using a JSON-backed cache.

    On a cache miss the token contract's ``name()`` and ``symbol()`` are
    called, the result is stored in ``global_cache`` and the whole cache is
    written to ``TOKEN_NAME_FILE``. Failures are cached too, as an all-``None``
    entry, so a failing address is not queried again.

    Parameters:
        token_address: Token address. When ``proxy_address`` is given, this is
            the value passed to the proxy's ``getToken``.
        TOKEN_NAME_FILE: Path of the JSON cache file.
        proxy_address: Optional contract whose ``getToken(token_address)``
            resolves the real token address.
        global_cache: Dict used as the in-memory cache. The default is the
            shared module-level dict, and it is mutated in place.

    Returns:
        dict: ``{"name": ..., "symbol": ..., "address": ...}``; all values are
        ``None`` when the lookup failed.
    """
    cache = global_cache

    if token_address in cache:
        return cache[token_address]

    # Remember the caller's spelling: lookups use it, while the contract
    # reports the checksummed form.
    requested_address = token_address
    try:
        if proxy_address:
            _, _, proxy_contract = get_address_abi_contract(proxy_address)
            token_address = proxy_contract.functions.getToken(token_address).call()
        token_address, _, token_contract = get_address_abi_contract(token_address)
        name = _decode_token_text(token_contract.functions.name().call())
        symbol = _decode_token_text(token_contract.functions.symbol().call())
        address = token_contract.address
    except Exception as e:
        print(f"Error fetching token name/symbol for {token_address}: {e}")
        for key in (requested_address, token_address):
            if key:
                cache[key] = {
                    "name": None,
                    "symbol": None,
                    "address": None,
                }
        _save_token_cache(cache, TOKEN_NAME_FILE)
        return {
            "name": None,
            "symbol": None,
            "address": None,
        }

    # Store under the resolved address and under the requested key, so the
    # membership test at the top hits on the next call.
    entry = {
        "name": name,
        "symbol": symbol,
        "address": address,
    }
    cache[address] = entry
    if requested_address:
        cache[requested_address] = entry

    _save_token_cache(cache, TOKEN_NAME_FILE)

    return cache[address]


def decode_topics(log):
    """
    Decode a raw log into its event name and arguments using the emitter's ABI.

    The ABI of ``log["address"]`` is looked up, and the log's first topic is
    compared with the keccak hash of each ABI event signature. The first
    match is decoded with ``process_log``.

    Parameters:
        log: A log entry mapping with "address" and "topics" keys.

    Returns:
        dict: ``{"event": <name>, "args": {...}}`` for a matching event, or
        ``{}`` when the log has no topics or no ABI event matches.
    """
    topics = log.get("topics")
    if not topics:
        # No topic 0 to match against; also avoids a pointless ABI lookup.
        return {}
    log_signature = topics[0].hex()  # computed once instead of per ABI entry

    _, abi, contract = get_address_abi_contract(log["address"])
    # Try matching this log against the ABI events
    for item in abi:
        if item.get("type") != "event":
            continue
        event_signature = (
            f'{item["name"]}({",".join(i["type"] for i in item["inputs"])})'
        )
        if w3.keccak(text=event_signature).hex() == log_signature:
            # Found matching event
            decoded = contract.events[item["name"]]().process_log(log)
            return {
                "event": item["name"],
                "args": dict(decoded["args"]),
            }

    return {}  # no matching event in ABI


def release_list(a):
    """Empty ``a`` in place, so every reference to the same list sees it cleared."""
    del a[:]

In [None]:
# Cell 1: Database Functions (FIXED - VARCHAR for large numbers)

import signal
from threading import Lock, Event as ThreadEvent

# Path of the DuckDB file; stays None until setup_database() stores it.
_db_path = None
# Held by the database helpers while they open and use a connection.
_connection_lock = Lock()
# Set by the SIGINT handler; read through is_shutdown_requested().
_shutdown_event = ThreadEvent()


def signal_handler(signum, frame):
    """Signal callback: log a warning and raise the module's shutdown flag.

    ``signum`` and ``frame`` are part of the handler signature and are unused.
    """
    logging.warning("\n⚠️  Shutdown signal received - finishing current ranges...")
    _shutdown_event.set()


# Route SIGINT (Ctrl+C) to signal_handler so the shutdown flag is set instead of
# the default KeyboardInterrupt being raised.
signal.signal(signal.SIGINT, signal_handler)


def is_shutdown_requested() -> bool:
    """Report whether the module's shutdown flag has been set."""
    flag_is_set = _shutdown_event.is_set()
    return flag_is_set


def _create_schema(conn):
    """Create the six event tables, their indexes and the processing_state table."""
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS transfer (
            transaction_hash VARCHAR NOT NULL,
            block_number BIGINT NOT NULL,
            log_index INTEGER NOT NULL,
            pair_address VARCHAR NOT NULL,
            from_address VARCHAR NOT NULL,
            to_address VARCHAR NOT NULL,
            value VARCHAR NOT NULL,
            inserted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (transaction_hash, log_index)
        )
    """
    )

    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS swap (
            transaction_hash VARCHAR NOT NULL,
            block_number BIGINT NOT NULL,
            log_index INTEGER NOT NULL,
            pair_address VARCHAR NOT NULL,
            sender VARCHAR NOT NULL,
            to_address VARCHAR NOT NULL,
            amount0_in VARCHAR NOT NULL,
            amount1_in VARCHAR NOT NULL,
            amount0_out VARCHAR NOT NULL,
            amount1_out VARCHAR NOT NULL,
            inserted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (transaction_hash, log_index)
        )
    """
    )

    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS mint (
            transaction_hash VARCHAR NOT NULL,
            block_number BIGINT NOT NULL,
            log_index INTEGER NOT NULL,
            pair_address VARCHAR NOT NULL,
            sender VARCHAR NOT NULL,
            amount0 VARCHAR NOT NULL,
            amount1 VARCHAR NOT NULL,
            inserted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (transaction_hash, log_index)
        )
    """
    )

    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS burn (
            transaction_hash VARCHAR NOT NULL,
            block_number BIGINT NOT NULL,
            log_index INTEGER NOT NULL,
            pair_address VARCHAR NOT NULL,
            sender VARCHAR NOT NULL,
            to_address VARCHAR NOT NULL,
            amount0 VARCHAR NOT NULL,
            amount1 VARCHAR NOT NULL,
            inserted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (transaction_hash, log_index)
        )
    """
    )

    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS sync (
            transaction_hash VARCHAR NOT NULL,
            block_number BIGINT NOT NULL,
            log_index INTEGER NOT NULL,
            pair_address VARCHAR NOT NULL,
            reserve0 VARCHAR NOT NULL,
            reserve1 VARCHAR NOT NULL,
            inserted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (transaction_hash, log_index)
        )
    """
    )

    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS approval (
            transaction_hash VARCHAR NOT NULL,
            block_number BIGINT NOT NULL,
            log_index INTEGER NOT NULL,
            pair_address VARCHAR NOT NULL,
            owner VARCHAR NOT NULL,
            spender VARCHAR NOT NULL,
            value VARCHAR NOT NULL,
            inserted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (transaction_hash, log_index)
        )
    """
    )

    # Two indexes per event table: one on block_number, one on pair_address.
    for table in ("transfer", "swap", "mint", "burn", "sync", "approval"):
        conn.execute(
            f"CREATE INDEX IF NOT EXISTS idx_{table}_block ON {table}(block_number)"
        )
        conn.execute(
            f"CREATE INDEX IF NOT EXISTS idx_{table}_pair ON {table}(pair_address)"
        )

    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS processing_state (
            start_block BIGINT NOT NULL,
            end_block BIGINT NOT NULL,
            status VARCHAR NOT NULL,
            worker_id VARCHAR,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (start_block, end_block)
        )
    """
    )


def setup_database(db_path):
    """
    Create the DuckDB schema and remember ``db_path`` for the other DB helpers.

    Creates (if missing) the transfer, swap, mint, burn, sync and approval
    tables, their block/pair indexes, and the processing_state table. Amount
    columns are VARCHAR so large integer values are stored as text.

    Parameters:
        db_path (str): Path of the DuckDB database file.

    Side effects:
        Sets the module-level ``_db_path``, creates the parent directory of
        ``db_path`` when it is missing, and prints a confirmation on success.
    """
    global _db_path
    _db_path = db_path

    # duckdb.connect cannot create missing parent folders; ":memory:" and bare
    # file names have no parent component and are skipped.
    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Same locking and close-in-finally discipline as the other DB helpers, so
    # the connection is released even when a DDL statement fails.
    with _connection_lock:
        conn = duckdb.connect(db_path)
        try:
            _create_schema(conn)
        finally:
            conn.close()

    print("✓ Database schema created successfully")


def batch_insert_events(events, worker_id="main"):
    """
    Sort decoded events into per-type rows and insert them into DuckDB.

    Each event dict is routed by its "event" field (Transfer, Swap, Mint,
    Burn, Sync, Approval); any other type is ignored. Address-like arguments
    default to "" and numeric arguments are stored as ``str(...)`` with a
    default of 0. Rows whose (transaction_hash, log_index) already exist are
    left untouched by the ``ON CONFLICT ... DO NOTHING`` clause.

    Parameters:
        events: Iterable of event dicts. A falsy value returns 0 immediately.
        worker_id (str): Label used in the log line.

    Returns:
        int: Number of rows submitted across all tables. Rows skipped by the
        conflict clause are still counted.
    """
    if not events:
        return 0

    transfers, swaps, mints, burns, syncs, approvals = [], [], [], [], [], []

    # (event name, destination bucket, address-like arg keys, numeric arg keys)
    layouts = (
        ("Transfer", transfers, ("from", "to"), ("value",)),
        (
            "Swap",
            swaps,
            ("sender", "to"),
            ("amount0In", "amount1In", "amount0Out", "amount1Out"),
        ),
        ("Mint", mints, ("sender",), ("amount0", "amount1")),
        ("Burn", burns, ("sender", "to"), ("amount0", "amount1")),
        ("Sync", syncs, (), ("reserve0", "reserve1")),
        ("Approval", approvals, ("owner", "spender"), ("value",)),
    )

    for e in events:
        event_type = e.get("event")
        args = e.get("args", {})

        for name, bucket, text_keys, amount_keys in layouts:
            if event_type == name:
                # Common prefix shared by every table, then the per-type columns.
                row = [
                    e["transactionHash"],
                    e["blockNumber"],
                    e.get("logIndex", 0),
                    e["address"],
                ]
                row.extend(args.get(key, "") for key in text_keys)
                row.extend(str(args.get(key, 0)) for key in amount_keys)
                bucket.append(tuple(row))
                break

    with _connection_lock:
        conn = duckdb.connect(_db_path)

        try:
            if transfers:
                conn.executemany(
                    """
                    INSERT INTO transfer (transaction_hash, block_number, log_index, pair_address, 
                                         from_address, to_address, value)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (transaction_hash, log_index) DO NOTHING
                """,
                    transfers,
                )

            if swaps:
                conn.executemany(
                    """
                    INSERT INTO swap (transaction_hash, block_number, log_index, pair_address,
                                     sender, to_address, amount0_in, amount1_in, amount0_out, amount1_out)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (transaction_hash, log_index) DO NOTHING
                """,
                    swaps,
                )

            if mints:
                conn.executemany(
                    """
                    INSERT INTO mint (transaction_hash, block_number, log_index, pair_address,
                                     sender, amount0, amount1)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (transaction_hash, log_index) DO NOTHING
                """,
                    mints,
                )

            if burns:
                conn.executemany(
                    """
                    INSERT INTO burn (transaction_hash, block_number, log_index, pair_address,
                                     sender, to_address, amount0, amount1)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (transaction_hash, log_index) DO NOTHING
                """,
                    burns,
                )

            if syncs:
                conn.executemany(
                    """
                    INSERT INTO sync (transaction_hash, block_number, log_index, pair_address,
                                     reserve0, reserve1)
                    VALUES (?, ?, ?, ?, ?, ?)
                    ON CONFLICT (transaction_hash, log_index) DO NOTHING
                """,
                    syncs,
                )

            if approvals:
                conn.executemany(
                    """
                    INSERT INTO approval (transaction_hash, block_number, log_index, pair_address,
                                         owner, spender, value)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT (transaction_hash, log_index) DO NOTHING
                """,
                    approvals,
                )

            total = sum(len(bucket) for _, bucket, _, _ in layouts)
            logging.info(
                f"[{worker_id}] Inserted {total} events (T:{len(transfers)} S:{len(swaps)} M:{len(mints)} B:{len(burns)} Sy:{len(syncs)} A:{len(approvals)})"
            )
            return total

        finally:
            conn.close()


def mark_range_completed(start_block, end_block, worker_id="main"):
    """Upsert the (start_block, end_block) row of processing_state as 'completed'.

    The row is inserted, or updated when that range already exists, with the
    given ``worker_id`` and a fresh ``updated_at``.
    """
    # worker_id is bound twice: once for the INSERT, once for the conflict UPDATE.
    params = (start_block, end_block, worker_id, worker_id)
    with _connection_lock:
        db = duckdb.connect(_db_path)
        try:
            db.execute(
                """
                INSERT INTO processing_state (start_block, end_block, status, worker_id, updated_at)
                VALUES (?, ?, 'completed', ?, NOW())
                ON CONFLICT (start_block, end_block) 
                DO UPDATE SET 
                    status = 'completed', 
                    worker_id = ?,
                    updated_at = NOW()
            """,
                params,
            )
        finally:
            db.close()


def mark_range_processing(start_block, end_block, worker_id="main"):
    """Upsert the (start_block, end_block) row of processing_state as 'processing'.

    The row is inserted, or updated when that range already exists, with the
    given ``worker_id`` and a fresh ``updated_at``.
    """
    # worker_id is bound twice: once for the INSERT, once for the conflict UPDATE.
    params = (start_block, end_block, worker_id, worker_id)
    with _connection_lock:
        db = duckdb.connect(_db_path)
        try:
            db.execute(
                """
                INSERT INTO processing_state (start_block, end_block, status, worker_id, updated_at)
                VALUES (?, ?, 'processing', ?, NOW())
                ON CONFLICT (start_block, end_block) 
                DO UPDATE SET 
                    status = 'processing',
                    worker_id = ?,
                    updated_at = NOW()
            """,
                params,
            )
        finally:
            db.close()


def get_completed_ranges():
    """Return the set of (start_block, end_block) pairs whose status is 'completed'."""
    with _connection_lock:
        db = duckdb.connect(_db_path)
        try:
            rows = db.execute(
                """
                SELECT start_block, end_block 
                FROM processing_state 
                WHERE status = 'completed'
            """
            ).fetchall()
            return {(row[0], row[1]) for row in rows}
        finally:
            db.close()


def get_database_stats():
    """Return row counts for every event table plus the number of completed ranges.

    Keys: total_transfers, total_swaps, total_mints, total_burns, total_syncs,
    total_approvals, completed_ranges.
    """
    # (result key, table name), in the order the counts are reported.
    counted_tables = (
        ("total_transfers", "transfer"),
        ("total_swaps", "swap"),
        ("total_mints", "mint"),
        ("total_burns", "burn"),
        ("total_syncs", "sync"),
        ("total_approvals", "approval"),
    )

    with _connection_lock:
        db = duckdb.connect(_db_path)
        try:
            completed_count = db.execute(
                """
                SELECT COUNT(*) FROM processing_state WHERE status = 'completed'
            """
            ).fetchone()[0]

            stats = {}
            for key, table in counted_tables:
                stats[key] = db.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            stats["completed_ranges"] = completed_count
            return stats
        finally:
            db.close()


# Visual confirmation that this cell's definitions were executed.
print("✓ Database functions loaded")

In [None]:
# Cell 2: Web3 Pool Functions

# Connected Web3 clients, and their names at the same positions.
_providers = []
_provider_names = []
# Position of the next provider handed out by get_provider().
_current_index = 0
# Guards the rotation of _current_index in get_provider().
_provider_lock = Lock()


def setup_web3_pool(api_key_dict):
    """
    (Re)build the pool of Web3 providers used by ``get_provider``.

    One client is created per entry of ``api_key_dict`` and kept only if it
    reports a live connection. The pool is replaced rather than appended to,
    so re-running this function (e.g. re-executing the notebook cell) does not
    register the same provider twice, and the rotation index is reset.

    Parameters:
        api_key_dict (dict): ``{name: {"INFURA_URL": <endpoint>, ...}}``.

    Raises:
        Exception: If none of the configured providers connected. The
            existing pool is left untouched in that case.
    """
    global _current_index

    connected = []
    for name, config in api_key_dict.items():
        endpoint = config["INFURA_URL"]
        if not endpoint:
            # Unset environment variable: skip, so HTTPProvider does not
            # silently fall back to its own default endpoint.
            logging.warning(f"✗ Provider '{name}' has no endpoint URL configured")
            continue

        provider = Web3(
            Web3.HTTPProvider(
                endpoint_uri=endpoint,
                request_kwargs={"timeout": 30},
                exception_retry_configuration=ExceptionRetryConfiguration(
                    errors=(ConnectionError, HTTPError, TimeoutError),
                    retries=5,
                    backoff_factor=1,
                    method_allowlist=REQUEST_RETRY_ALLOWLIST,
                ),
            )
        )

        if provider.is_connected():
            connected.append((provider, name))
            logging.info(f"✓ Provider '{name}' connected")
        else:
            logging.warning(f"✗ Provider '{name}' failed to connect")

    if not connected:
        raise Exception("No providers connected!")

    # Swap the pool in under the lock so get_provider() never sees a
    # half-updated pool or a stale index. Slice assignment keeps the list
    # objects themselves.
    with _provider_lock:
        _providers[:] = [provider for provider, _ in connected]
        _provider_names[:] = [name for _, name in connected]
        _current_index = 0


def get_provider():
    """Return the next ``(provider, name)`` pair from the pool, round-robin.

    The shared index is read and advanced while holding ``_provider_lock``.
    """
    global _current_index

    with _provider_lock:
        slot = _current_index
        selected = (_providers[slot], _provider_names[slot])
        _current_index = (slot + 1) % len(_providers)
    return selected


# Visual confirmation that this cell's definitions were executed.
print("✓ Web3 pool functions loaded")

In [None]:
# Cell 4: Scanning Functions (updated to use decode_topics)


def fetch_logs_for_range(
    start_block, end_block, addresses, worker_id="main", retry_count=0, max_retries=5
):
    """Fetch the logs emitted by ``addresses`` in a block range as flat dicts.

    Args:
        start_block: first block passed as ``fromBlock``.
        end_block: last block passed as ``toBlock``.
        addresses: contract address(es) passed as the ``address`` filter.
        worker_id: label used only in log messages.
        retry_count: number of rate-limit retries already performed.
        max_retries: maximum number of rate-limit retries.

    Returns:
        A list with one dict per log holding ``transactionHash``,
        ``blockNumber``, ``logIndex``, ``address``, ``data``, whatever
        ``decode_topics(log)`` returns, and ``eventSignature`` (hex of the
        first topic, or ``""`` when the log has no topics).

    Raises:
        HTTPError: on HTTP failures; a 429 is retried with exponential backoff
            (each retry takes the next provider from the pool) until
            ``max_retries`` is exhausted.
        Web3RPCError: always re-raised; the "too many results" variant is
            re-raised without logging so the caller can split the range.
    """
    provider, provider_name = get_provider()

    try:
        logs = provider.eth.get_logs(
            {
                "fromBlock": start_block,
                "toBlock": end_block,
                "address": addresses,
            }
        )

        transactions = []
        for log in logs:
            transaction = {
                "transactionHash": provider.to_hex(log["transactionHash"]),
                "blockNumber": log["blockNumber"],
                "logIndex": log.get("logIndex", 0),
                "address": log["address"],
                "data": provider.to_hex(log["data"]),
            }
            transaction.update(decode_topics(log))

            # Set after the decoded topics so this key always wins.
            topics = log.get("topics")
            transaction["eventSignature"] = (
                provider.to_hex(topics[0]) if topics else ""
            )

            transactions.append(transaction)

        logging.info(
            f"[{worker_id}] [{provider_name}] Fetched {len(transactions)} events from blocks [{start_block:,}, {end_block:,}]"
        )
        return transactions

    except HTTPError as e:
        # An HTTPError may carry no response object; reading status_code
        # directly would then raise AttributeError and hide the real error.
        status_code = getattr(e.response, "status_code", None)

        if status_code == 429:
            if retry_count >= max_retries:
                logging.error(f"[{worker_id}] Max retries reached")
                raise

            wait_time = 2**retry_count
            logging.warning(
                f"[{worker_id}] [{provider_name}] Rate limit hit, waiting {wait_time}s..."
            )
            time.sleep(wait_time)
            return fetch_logs_for_range(
                start_block,
                end_block,
                addresses,
                worker_id,
                retry_count + 1,
                max_retries,
            )

        if status_code == 402:
            logging.critical(f"[{worker_id}] Payment required (402)")
        else:
            logging.error(f"[{worker_id}] HTTP error {status_code}: {e}")
        raise

    except Web3RPCError as e:
        message = str(e)
        too_many_results = "more than 10000 results" in message or "-32005" in message
        if not too_many_results:
            logging.error(f"[{worker_id}] Web3 RPC error: {e}")
        raise


def process_block_range(start_block, end_block, addresses, worker_id="main"):
    """Fetch one block range, store its events and mark the range completed.

    When the RPC reports too many results the range is halved and both halves
    are processed recursively. A range is only reported as unsplittable once
    it is down to a single block (previously a two-block range was rejected
    even though it can still be split into two single-block requests).

    Args:
        start_block: first block of the range (inclusive).
        end_block: last block of the range (inclusive).
        addresses: contract address(es) to fetch logs for.
        worker_id: label used in log messages and range bookkeeping.

    Returns:
        Number of events fetched for the range; ``0`` when the range was
        skipped (shutdown requested or already completed) or failed.
    """
    if is_shutdown_requested():
        logging.info(
            f"[{worker_id}] Shutdown requested - skipping range [{start_block:,}, {end_block:,}]"
        )
        return 0

    if (start_block, end_block) in get_completed_ranges():
        logging.info(
            f"[{worker_id}] Skipping already processed range [{start_block:,}, {end_block:,}]"
        )
        return 0

    mark_range_processing(start_block, end_block, worker_id)

    try:
        events = fetch_logs_for_range(start_block, end_block, addresses, worker_id)

        if is_shutdown_requested():
            # The data is already in hand, so it is still stored below.
            logging.warning(
                f"[{worker_id}] Shutdown requested after fetch - saving {len(events)} events before stopping"
            )

        batch_insert_events(events, worker_id)
        mark_range_completed(start_block, end_block, worker_id)

        logging.info(
            f"[{worker_id}] ✓ Processed [{start_block:,}, {end_block:,}] - {len(events)} events"
        )
        return len(events)

    except Web3RPCError as e:
        message = str(e)
        if not ("more than 10000 results" in message or "-32005" in message):
            logging.error(
                f"[{worker_id}] Failed to process [{start_block:,}, {end_block:,}]: {e}"
            )
            return 0

        # Only a single-block range cannot be halved any further.
        if start_block >= end_block:
            logging.error(
                f"[{worker_id}] Cannot split range [{start_block:,}, {end_block:,}] further"
            )
            return 0

        mid = (start_block + end_block) // 2
        logging.info(
            f"[{worker_id}] Splitting [{start_block:,}, {end_block:,}] at {mid:,}"
        )

        count1 = process_block_range(start_block, mid, addresses, worker_id)

        if is_shutdown_requested():
            logging.warning(
                f"[{worker_id}] Shutdown requested - skipping second half of split"
            )
            return count1

        count2 = process_block_range(mid + 1, end_block, addresses, worker_id)

        return count1 + count2

    except Exception as e:
        logging.error(f"[{worker_id}] Unexpected error: {e}")
        logging.error(traceback.format_exc())
        return 0


def generate_block_ranges(start_block, end_block, chunk_size):
    """Split ``[start_block, end_block]`` into chunks that still need scanning.

    Args:
        start_block: first block to cover (inclusive).
        end_block: last block to cover (inclusive).
        chunk_size: maximum number of blocks per range; must be at least 1.

    Returns:
        A list of ``(start, end)`` tuples, in ascending order, excluding every
        tuple already present in ``get_completed_ranges()``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1. Such a value never
            advances the cursor, so the loop below would not terminate.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    completed = get_completed_ranges()

    ranges = []
    current = start_block

    while current <= end_block:
        end = min(current + chunk_size - 1, end_block)

        if (current, end) in completed:
            logging.info(f"Skipping completed range [{current:,}, {end:,}]")
        else:
            ranges.append((current, end))

        current = end + 1

    return ranges


def scan_blockchain(addresses, start_block, end_block, chunk_size=10000, max_workers=3):
    """Scan a block window in parallel chunks using a thread pool.

    The window is cut into ranges by ``generate_block_ranges``; each range is
    handed to ``process_block_range`` on a pool of ``max_workers`` threads.
    Progress is logged as futures finish. When a shutdown has been requested
    the loop stops consuming results, pending futures are cancelled and the
    running ones are awaited. A ``KeyboardInterrupt`` cancels pending futures
    without waiting and is re-raised.

    Returns:
        None. Totals are only reported through logging.
    """
    pending = generate_block_ranges(start_block, end_block, chunk_size)
    if not pending:
        logging.info("No ranges to process - all already completed!")
        return

    total_ranges = len(pending)
    logging.info(f"Processing {total_ranges} block ranges with {max_workers} workers")

    total_events = 0
    completed_ranges = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map every submitted future back to the range it is working on.
        range_by_future = {}
        for position, (first, last) in enumerate(pending):
            task = executor.submit(
                process_block_range,
                first,
                last,
                addresses,
                f"worker-{position % max_workers}",
            )
            range_by_future[task] = (first, last)

        try:
            for finished in as_completed(range_by_future):
                if is_shutdown_requested():
                    logging.warning(
                        "Shutdown requested - waiting for active tasks to complete..."
                    )
                    break

                start, end = range_by_future[finished]

                try:
                    total_events += finished.result()
                except Exception as e:
                    logging.error(f"Range [{start:,}, {end:,}] failed: {e}")
                else:
                    completed_ranges += 1
                    progress = (completed_ranges / total_ranges) * 100
                    logging.info(
                        f"Progress: {completed_ranges}/{total_ranges} ({progress:.1f}%) | "
                        f"Total events: {total_events:,}"
                    )

            if is_shutdown_requested():
                logging.warning(
                    "Waiting for active workers to finish their current ranges..."
                )
                executor.shutdown(wait=True, cancel_futures=True)
                logging.info(
                    f"✓ Graceful shutdown complete. Processed {completed_ranges}/{total_ranges} ranges."
                )

        except KeyboardInterrupt:
            logging.warning(
                "\n⚠️  Additional Ctrl+C detected - forcing immediate shutdown..."
            )
            executor.shutdown(wait=False, cancel_futures=True)
            raise

    separator = "=" * 60
    logging.info(f"\n{separator}")
    logging.info("Scan completed!")
    logging.info(f"Total events fetched: {total_events:,}")
    logging.info(f"Ranges processed: {completed_ranges}/{total_ranges}")
    logging.info(f"{separator}\n")


print("✓ Scanning functions loaded")

In [None]:
# Cell 5: Main Scanning Function (UPDATED stats display)


def _log_event_counts(stats):
    """Log one line per event table from a ``get_database_stats()`` result."""
    for label, key in (
        ("Transfers", "total_transfers"),
        ("Swaps", "total_swaps"),
        ("Mints", "total_mints"),
        ("Burns", "total_burns"),
        ("Syncs", "total_syncs"),
        ("Approvals", "total_approvals"),
    ):
        logging.info(f"  {label}: {stats[key]:,}")


def scan_blockchain_to_duckdb(
    event_file=V2_EVENT_BY_CONTRACTS,
    db_path="out/V2/uniswap_v2_events.duckdb",
    start_block=10000001,
    end_block=20000000,
    chunk_size=10000,
    max_workers=3,
    token_filter=None,
):
    """Run a full scan: load addresses, set up providers and DB, then scan.

    Args:
        event_file: JSON file whose top-level keys are the contract addresses
            to scan.
        db_path: path handed to ``setup_database``.
        start_block: first block to scan (inclusive).
        end_block: last block to scan (inclusive).
        chunk_size: number of blocks per request range.
        max_workers: number of worker threads.
        token_filter: optional iterable of addresses; when given, only the
            addresses from ``event_file`` that also appear here are scanned.

    Returns:
        None. Database statistics are logged before and after the scan.
        ``KeyboardInterrupt`` and other exceptions raised by the scan itself
        are logged and not propagated.
    """
    logging.info("=" * 60)
    logging.info("BLOCKCHAIN SCANNER STARTING")
    logging.info("=" * 60)

    logging.info(f"Loading addresses from {event_file}...")
    with open(event_file, "r") as f:
        all_pairs = json.load(f)

    all_addresses = [Web3.to_checksum_address(addr) for addr in all_pairs]

    if token_filter:
        # A set keeps the membership test O(1) per address; the pair file can
        # hold far more entries than the filter.
        wanted = {Web3.to_checksum_address(addr) for addr in token_filter}
        addresses = [addr for addr in all_addresses if addr in wanted]
        logging.info(
            f"Filtered to {len(addresses)} addresses from {len(all_addresses)} total"
        )
    else:
        addresses = all_addresses
        logging.info(f"Using all {len(addresses)} addresses")

    logging.info("Setting up Web3 connection pool...")
    setup_web3_pool(ETHERSCAN_API_KEY_DICT)

    logging.info(f"Setting up database at {db_path}...")
    setup_database(db_path)

    stats = get_database_stats()
    logging.info("\nCurrent database stats:")
    _log_event_counts(stats)
    logging.info(f"  Completed ranges: {stats['completed_ranges']}")

    logging.info("\nStarting scan:")
    logging.info(f"  Block range: {start_block:,} to {end_block:,}")
    logging.info(f"  Chunk size: {chunk_size:,}")
    logging.info(f"  Workers: {max_workers}")
    logging.info(f"  Addresses: {len(addresses)}")
    logging.info("=" * 60 + "\n")

    try:
        scan_blockchain(addresses, start_block, end_block, chunk_size, max_workers)

        final_stats = get_database_stats()
        logging.info("\n" + "=" * 60)
        logging.info("SCAN COMPLETED!")
        logging.info("=" * 60)
        _log_event_counts(final_stats)
        logging.info("=" * 60)

    except KeyboardInterrupt:
        logging.info("\n\nInterrupted by user - progress saved to database")
        logging.info("You can restart to continue from where it left off")
    except Exception as e:
        logging.error(f"Fatal error: {e}", exc_info=True)


def query_database(db_path="out/V2/uniswap_v2_events.duckdb"):
    """Print summary queries over the events database.

    Opens ``db_path`` read-only, prints the row count of every event table,
    the ten pairs with the most swaps and the ten pairs with the largest
    summed ``amount0`` swap volume.

    Returns:
        The swap-volume DataFrame (the last query printed).
    """
    # (label shown in the report, table queried)
    count_targets = (
        ("Transfers", "transfer"),
        ("Swaps", "swap"),
        ("Mints", "mint"),
        ("Burns", "burn"),
        ("Syncs", "sync"),
        ("Approvals", "approval"),
    )
    rule = "=" * 60

    conn = duckdb.connect(db_path, read_only=True)

    try:
        print("\n" + rule)
        print("DATABASE QUERIES")
        print(rule)

        print("\n1. Event counts by type:")
        for label, table in count_targets:
            row_count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            print(f"  {label}: {row_count:,}")

        print("\n2. Most active pairs (by swaps):")
        most_active_sql = """
            SELECT pair_address, COUNT(*) as swap_count
            FROM swap
            GROUP BY pair_address
            ORDER BY swap_count DESC
            LIMIT 10
        """
        print(conn.execute(most_active_sql).fetchdf())

        print("\n3. Swap volume by pair:")
        volume_sql = """
            SELECT 
                pair_address,
                SUM(amount0_in + amount0_out) as total_amount0,
                SUM(amount1_in + amount1_out) as total_amount1
            FROM swap
            GROUP BY pair_address
            ORDER BY total_amount0 DESC
            LIMIT 10
        """
        volume_by_pair = conn.execute(volume_sql).fetchdf()
        print(volume_by_pair)

        return volume_by_pair

    finally:
        conn.close()


print("✓ Main functions loaded")

In [None]:
# Addresses the scan is restricted to: scan_blockchain_to_duckdb keeps only the
# keys of the event file whose checksummed form appears in this list.
token_filter = [
    "0xB4e16d0168e52d35CaCD2c6185b44281Ec28C9Dc",
    "0x3139Ffc91B99aa94DA8A2dc13f1fC36F9BDc98eE",
    "0x12EDE161c702D1494612d19f05992f43aa6A26FB",
    "0xA478c2975Ab1Ea89e8196811F51A7B7Ade33eB11",
    "0x07F068ca326a469Fc1d87d85d448990C8cBa7dF9",
    "0xAE461cA67B15dc8dc81CE7615e0320dA1A9aB8D5",
    "0xCe407CD7b95B39d3B4d53065E711e713dd5C5999",
    "0x33C2d48Bc95FB7D0199C5C693e7a9F527145a9Af",
]

# Block window handed to the scanner; both ends are scanned (inclusive).
START_BLOCK = 10000000
END_BLOCK = 10500000
# END_BLOCK = w3.eth.block_number

print(f"Scanning from block {START_BLOCK:,} to {END_BLOCK:,}")

# NOTE(review): scan_blockchain_to_duckdb has no return statement, so `db` is
# always None here; the scan's outcome is only visible through the log output
# and whatever the scanner wrote to the database at DB_PATH.
db = scan_blockchain_to_duckdb(
    event_file=V2_EVENT_BY_CONTRACTS,
    db_path=DB_PATH,
    start_block=START_BLOCK,
    end_block=END_BLOCK,
    chunk_size=10000,
    max_workers=3,
    token_filter=token_filter,
)

In [None]:
def get_logs_with_chunking(
    get_logs_fn,
    from_block,
    to_block,
    argument_filters=None,
    initial_chunk_size=10000,
    max_retries=5,
    base_delay=0.5,
):
    """Fetch logs over a block window with chunking and rate-limit handling.

    The window is walked in chunks of ``initial_chunk_size`` blocks. When a
    request fails with a "too many results" error (``-32005`` / "more than
    10000 results") its range is halved and both halves are fetched; this
    repeats down to single-block ranges.

    Args:
        get_logs_fn: callable invoked as
            ``get_logs_fn(from_block=..., to_block=..., argument_filters=...)``
            and returning a list of logs.
        from_block: first block to fetch (inclusive).
        to_block: last block to fetch (inclusive), or ``"latest"`` to use the
            module-level ``w3.eth.block_number``.
        argument_filters: forwarded to ``get_logs_fn``; defaults to ``{}``.
        initial_chunk_size: blocks per request; values below 1 are treated
            as 1 (a chunk size of 0 would otherwise never advance).
        max_retries: maximum number of retries after a rate-limit error.
        base_delay: seconds slept before every request; also the base of the
            exponential backoff after a rate-limit error.

    Returns:
        All logs of the window, concatenated in ascending block-range order.

    Raises:
        HTTPError: non rate-limit HTTP errors, or rate-limit errors once
            ``max_retries`` is exhausted.
        Exception: when a single block still returns too many results, or any
            other error raised by ``get_logs_fn``.
    """

    if argument_filters is None:
        argument_filters = {}

    # Convert 'latest' to actual block number
    if to_block == "latest":
        to_block = w3.eth.block_number

    def fetch_with_retry(start, end, retries=0):
        """Fetch logs with exponential backoff on rate limit errors."""
        try:
            time.sleep(base_delay)
            return get_logs_fn(
                from_block=start, to_block=end, argument_filters=argument_filters
            )
        except HTTPError as e:
            rate_limited = "429" in str(e) or "Too Many Requests" in str(e)
            if not rate_limited:
                raise
            if retries >= max_retries:
                print("  ✗ Max retries reached")
                raise

            wait_time = base_delay * (2**retries)
            print(
                f"  ⚠ Rate limit hit, waiting {wait_time:.1f}s... (retry {retries + 1}/{max_retries})"
            )
            time.sleep(wait_time)
            return fetch_with_retry(start, end, retries + 1)

    def fetch_range(start, end, chunk_size):
        """Recursive function to fetch a block range with dynamic chunking."""
        # Repeated halving must never reach 0: a zero-sized chunk would keep
        # the chunk loop below from advancing.
        chunk_size = max(1, chunk_size)

        if end - start > chunk_size:
            print(f"Splitting range {start}-{end} into chunks of {chunk_size}...")
            current = start
            logs = []

            while current <= end:
                chunk_end = min(current + chunk_size - 1, end)
                logs.extend(fetch_range(current, chunk_end, chunk_size))
                current = chunk_end + 1

            return logs

        try:
            print(f"Fetching blocks {start} to {end} ({end - start + 1} blocks)...")
            logs = fetch_with_retry(start, end)
            print(f"  ✓ Got {len(logs)} logs")
            return logs

        except HTTPError:
            # HTTP failures are handled (or given up on) by fetch_with_retry.
            raise

        except Exception as e:
            error_str = str(e)

            if not ("-32005" in error_str or "more than 10000 results" in error_str):
                print(f"  ✗ Error: {e}")
                raise

            print("  ⚠ Too many results, splitting...")

            # Only a single-block range cannot be halved any further; a
            # two-block range still splits into two single-block requests.
            if start >= end:
                print("  ⚠ Cannot split further")
                raise Exception(
                    f"Cannot split block range {start}-{end} further"
                ) from e

            mid = (start + end) // 2
            half = chunk_size // 2

            left_logs = fetch_range(start, mid, half)
            right_logs = fetch_range(mid + 1, end, half)

            return left_logs + right_logs

    print(f"\n{'='*60}")
    print(f"Fetching logs from block {from_block} to {to_block}")
    print(f"{'='*60}")

    all_logs = fetch_range(from_block, to_block, initial_chunk_size)

    print(f"\n{'='*60}")
    print(f"✓ Complete! Total logs fetched: {len(all_logs)}")
    print(f"{'='*60}\n")

    return all_logs


# ============================================================
# MAIN CODE
# Discover every pair created by the Uniswap V2 factory and write a
# {pair_address: {event_name: {}}} skeleton to disk.
# ============================================================

# 1. Get the factory contract
address, abi, contract = get_address_abi_contract(UNISWAP_V2_CONTRACT)

# Scan the whole chain; "latest" is resolved inside get_logs_with_chunking.
start_block = 0
end_block = "latest"

# 2. Fetch all PairCreated events
print("Fetching all PairCreated events from Uniswap V2 Factory...")
pair_created_logs = get_logs_with_chunking(
    get_logs_fn=contract.events.PairCreated().get_logs,
    from_block=start_block,
    to_block=end_block,
    argument_filters={},
    initial_chunk_size=10000,
    max_retries=5,
    base_delay=0.5,
)

print(f"\n{'='*60}")
print(f"Found {len(pair_created_logs)} pairs")
print(f"{'='*60}\n")

# 3. Get event names from one sample pair (all pairs have same interface)
if len(pair_created_logs) > 0:
    sample_pair_address = pair_created_logs[0].args.pair
    print(f"Getting event list from sample pair: {sample_pair_address}")

    pair_address, pair_abi, pair_contract = get_address_abi_contract(
        sample_pair_address
    )
    event_names = [ev.event_name for ev in pair_contract.events]

    print(f"All pairs have these events: {event_names}\n")

    # 4. Build the dictionary structure
    FULL_EVENT_BY_CONTRACTS = {}

    print(f"Building dictionary structure for {len(pair_created_logs)} pairs...")

    for idx, log in enumerate(pair_created_logs):
        pair_addr = log.args.pair

        # Create structure with empty dicts for each event
        FULL_EVENT_BY_CONTRACTS[pair_addr] = {event: {} for event in event_names}

        # Progress line every 100 pairs
        if (idx + 1) % 100 == 0:
            print(f"  Processed {idx + 1}/{len(pair_created_logs)} pairs...")

    print(f"  ✓ Completed all {len(pair_created_logs)} pairs\n")

    # 5. Save to disk
    # NOTE(review): this writes into the current working directory, while the
    # scanner reads V2_EVENT_BY_CONTRACTS ("out/V2/uniswap_v2_pairs_events.json").
    # Confirm whether the file is meant to be moved there by hand.
    output_file = "uniswap_v2_pairs_events.json"

    print(f"Saving to {output_file}...")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(FULL_EVENT_BY_CONTRACTS, f, ensure_ascii=False, indent=4)

    print(f"✓ Saved successfully!")

    # 6. Print summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Total pairs: {len(FULL_EVENT_BY_CONTRACTS)}")
    print(f"Events per pair: {len(event_names)}")
    print(f"Event types: {', '.join(event_names)}")
    print(f"Output file: {output_file}")
    print(f"{'='*60}\n")

    # Print first 3 pairs as sample
    print("Sample (first 3 pairs):")
    for idx, (pair_addr, events) in enumerate(
        list(FULL_EVENT_BY_CONTRACTS.items())[:3]
    ):
        print(f"\n{pair_addr}:")
        for event_name in events.keys():
            print(f"  - {event_name}: {{}}")

else:
    print("No pairs found!")

In [None]:
# Important code
# Load the Uniswap V2 factory and list its events (the V1 factory exposes only
# 'NewExchange', the V2 factory only 'PairCreated').
# Then fetch, from block 0 to the latest block, every pair created by it.
# (The event list below makes it easy to add or drop events when a contract
# exposes several and only some of them matter.)
address, abi, contract = get_address_abi_contract(
    UNISWAP_V2_CONTRACT
)  # Uniswap V2 factory
start_block = 0
end_block = "latest"
# list all event names
event_names = [ev.event_name for ev in contract.events]
print(event_names)
# define which events you want and filters directly
events_to_scan = [
    contract.events.PairCreated().get_logs,
    # contract.events.Transfer().get_logs,
    # contract.events.Approval().get_logs,
]
L_LOGS = []  # IMPORTANT
for get_logs_fn in events_to_scan:
    logs = get_logs_fn(
        from_block=start_block,
        to_block=end_block,
        argument_filters={},  # or {"from": some_address}, {"to": [addr1, addr2]}
    )
    L_LOGS.extend(logs)

# Important code, used together with the event list above.
# For every pair created by the factory, list the events of the pair contract
# and build the dictionary
#   {"pair_address_1": {"event_1": {}, "event_2": {}, "event_3": {}}, ...}
# which the scanner later uses as its list of contracts to fetch logs for.

FULL_EVENT_BY_CONTRACTS = {}  # IMPORTANT
for log in L_LOGS:
    # PairCreated carries the new contract in `pair`; `exchange` is the field
    # of the V1 NewExchange event and does not exist on these logs.
    add, abi, contract = get_address_abi_contract(log.args.pair)
    event_names = [ev.event_name for ev in contract.events]
    FULL_EVENT_BY_CONTRACTS[add] = {event: {} for event in event_names}
    time.sleep(1)  # presumably throttles the ABI lookups — TODO confirm

print(len(FULL_EVENT_BY_CONTRACTS))

# Write the mapping only when the file does not exist yet, so an existing
# file is never overwritten.
if not os.path.exists(V2_EVENT_BY_CONTRACTS):
    with open(V2_EVENT_BY_CONTRACTS, "w", encoding="utf-8") as f:
        # Dump the mapping itself; the path constant was being written before.
        json.dump(FULL_EVENT_BY_CONTRACTS, f, ensure_ascii=False, indent=4)

In [None]:
# Cell: Load ALL events from DuckDB into pandas DataFrame
#
# Every event table is projected onto the same five columns
# (block, address, event, provider, value) and stacked with UNION ALL:
#   - Transfer: provider is the non-zero side of a mint/burn transfer (the
#     sender otherwise); value is +amount for transfers from the zero address,
#     -amount for transfers to it and 0 for everything else.
#   - Mint / Burn: value is +/-(amount0 + amount1).
#   - Swap: value uses only the token0 leg (amount0_in + amount0_out).
#   - Sync: value is reserve0 + reserve1, provider is the pair itself.
#   - Approval: value is the approved amount.
# All amounts are divided by 1e18 (assumes 18-decimal tokens — TODO confirm).
events_sql = """
    SELECT 
        block_number as block,
        pair_address as address,
        'Transfer' as event,
        CASE 
            WHEN from_address = '0x0000000000000000000000000000000000000000' 
                THEN to_address
            WHEN to_address = '0x0000000000000000000000000000000000000000' 
                THEN from_address
            ELSE from_address
        END as provider,
        CASE 
            WHEN from_address = '0x0000000000000000000000000000000000000000' 
                THEN CAST(value AS DOUBLE) / 1e18
            WHEN to_address = '0x0000000000000000000000000000000000000000' 
                THEN -CAST(value AS DOUBLE) / 1e18
            ELSE 0
        END as value
    FROM transfer
    
    UNION ALL
    
    SELECT 
        block_number as block,
        pair_address as address,
        'Mint' as event,
        sender as provider,
        (CAST(amount0 AS DOUBLE) / 1e18 + CAST(amount1 AS DOUBLE) / 1e18) as value
    FROM mint
    
    UNION ALL
    
    SELECT 
        block_number as block,
        pair_address as address,
        'Burn' as event,
        sender as provider,
        -(CAST(amount0 AS DOUBLE) / 1e18 + CAST(amount1 AS DOUBLE) / 1e18) as value
    FROM burn
    
    UNION ALL
    
    SELECT 
        block_number as block,
        pair_address as address,
        'Swap' as event,
        sender as provider,
        (CAST(amount0_in AS DOUBLE) / 1e18 + CAST(amount0_out AS DOUBLE) / 1e18) as value
    FROM swap
    
    UNION ALL
    
    SELECT 
        block_number as block,
        pair_address as address,
        'Sync' as event,
        pair_address as provider,
        (CAST(reserve0 AS DOUBLE) / 1e18 + CAST(reserve1 AS DOUBLE) / 1e18) as value
    FROM sync
    
    UNION ALL
    
    SELECT 
        block_number as block,
        pair_address as address,
        'Approval' as event,
        owner as provider,
        CAST(value AS DOUBLE) / 1e18 as value
    FROM approval
    
    ORDER BY block, address
"""

# The context manager closes the connection even when the query raises.
with duckdb.connect(DB_PATH, read_only=True) as conn:
    df = conn.execute(events_sql).fetchdf()

# Compact dtypes: categorical event names, checksummed addresses, 32-bit block
# numbers and the smallest float type that pandas picks for the values.
df["event"] = df["event"].astype("category")
df["provider"] = df["provider"].apply(Web3.to_checksum_address)
df["address"] = df["address"].apply(Web3.to_checksum_address)
df["block"] = df["block"].astype(np.int32)
df["value"] = pd.to_numeric(df["value"], downcast="float")

# Placeholder symbol column; every row starts as "Unknown".
df["symbol"] = "Unknown"
df["symbol"] = df["symbol"].astype("category")

print(f"Total events: {len(df):,}")
print("\nEvent breakdown:")
print(df["event"].value_counts())

df

In [None]:
# 1ST GRAPH: cumulative `value` per pool address over blocks.
# NOTE(review): the original header referred to Uniswap V1 (UNI-V1); `df` is
# loaded from the V2 events database in the previous cell.
# NOTE(review): `value` is summed over every event type present in df
# (Transfer, Mint, Burn, Swap, Sync, Approval), not only liquidity changes.
# To compare pools meaningfully, "value" still needs to be linked to $ or
# another common unit.
# NEED: df
totals = (
    df.groupby(["block", "address"], as_index=False)["value"]
    .sum()
    .sort_values(["address", "block"])
)
# Running total per address, in block order (rows were sorted just above).
totals["cum_value"] = totals.groupby("address")["value"].cumsum()

# # 2) fill missing blocks only inside each address' span (min..max), then cumulate
# totals = totals.groupby("address", group_keys=False).apply(
#     lambda g: (
#         g.set_index("block")
#         .reindex(range(g["block"].min(), g["block"].max() + 1), fill_value=0)
#         .rename_axis("block")
#         .reset_index()
#         .assign(address=g.name)
#     )
# )
# totals = (
#     totals[["block", "address", "value"]]
#     .sort_values(["address", "block"])
#     .reset_index(drop=True)
# )

# Plot only the first three addresses of token_filter.
pools_of_interest = [
    w3.to_checksum_address(token_filter[0]),
    w3.to_checksum_address(token_filter[1]),
    w3.to_checksum_address(token_filter[2]),
]

# pools_of_interest = ["add_1","add_2","add_3"]
cum_long_sub = totals[totals["address"].isin(pools_of_interest)]

# NOTE(review): this area figure is replaced by the px.line figure assigned to
# `fig` a few lines below, so it is never shown.
fig = px.area(
    cum_long_sub,
    x="block",
    y="cum_value",
    color="address",
    line_group="address",
    title="Cumulative liquidity evolution per pool",
    labels={"cum_value": "Cumulative liquidity", "address": "Pool address"},
)

# Line chart variant (no fill); this is the figure that is actually displayed.
fig = px.line(cum_long_sub, x="block", y="cum_value", color="address",
              title="Cumulative liquidity per pool")
# You can also make it not stacked (i.e. overlayed) by doing:
# fig = px.area(
#     cum_long_sub,
#     x="block",
#     y="cum_value",
#     color="address",
#     line_group="address",
#     facet_col=None,
#     # maybe set `groupnorm=None` or other arguments
# )

fig.update_layout(legend_title="Pool address")
fig.show()

In [None]:
def build_block_filter(block_start=None, block_end=None):
    """Return an ``AND block_number ...`` SQL fragment for the given bounds.

    Both bounds are optional and inclusive. With no bound at all the result
    is an empty string, so the fragment can always be appended to a WHERE
    clause.
    """
    has_lower = block_start is not None
    has_upper = block_end is not None

    if has_lower and has_upper:
        clause = f"AND block_number BETWEEN {block_start} AND {block_end}"
    elif has_lower:
        clause = f"AND block_number >= {block_start}"
    elif has_upper:
        clause = f"AND block_number <= {block_end}"
    else:
        clause = ""

    return clause


def calculate_pool_liquidity_pure_sql(
    db_path,
    pair_address,
    block_start=None,
    block_end=None,
    min_share_pct=0.1,
    min_balance_threshold=1e-8,
):
    """Compute per-provider LP balances and pool shares for one pair.

    Balances are rebuilt from the ``transfer`` table: every transfer debits
    the sender and credits the receiver, with the zero address ignored on
    both sides. Values are the raw ``value`` column cast to DOUBLE. The block
    filter is applied before accumulating, so balances only reflect transfers
    inside the requested window.

    ``cum_pool`` is the running net sum over ALL providers up to each block,
    i.e. the LP supply held by non-zero addresses. It was previously the sum
    of the balances of only those providers that happened to have an event in
    the same block, which made a lone provider in a block show a 100% share.

    Args:
        db_path: DuckDB database file, opened read-only.
        pair_address: pair contract address (any casing; checksummed here).
        block_start: optional first block (inclusive).
        block_end: optional last block (inclusive).
        min_share_pct: rows with a smaller share (in percent) are dropped.
        min_balance_threshold: rows with a balance at or below this value are
            dropped.

    Returns:
        DataFrame with one row per (block, provider) at which the provider's
        balance changed: ``block``, ``provider``, ``cum_provider``,
        ``cum_pool``, ``share_pct`` (clamped to [0, 100]) and, when not empty,
        ``provider_label``.
    """
    pair_address = w3.to_checksum_address(pair_address)
    block_filter = build_block_filter(block_start, block_end)

    query = f"""
    WITH provider_events AS (
        SELECT 
            block_number AS block,
            from_address AS provider,
            -CAST(value AS DOUBLE) AS value
        FROM transfer
        WHERE pair_address = '{pair_address}'
            AND from_address != '0x0000000000000000000000000000000000000000'
        {block_filter}
        
        UNION ALL
        
        SELECT 
            block_number AS block,
            to_address AS provider,
            CAST(value AS DOUBLE) AS value
        FROM transfer
        WHERE pair_address = '{pair_address}'
            AND to_address != '0x0000000000000000000000000000000000000000'
        {block_filter}
    ),
    aggregated_events AS (
        SELECT 
            block,
            provider,
            SUM(value) AS value
        FROM provider_events
        GROUP BY block, provider
    ),
    cumulative_balances AS (
        SELECT 
            block,
            provider,
            SUM(value) OVER (PARTITION BY provider ORDER BY block) AS cum_provider
        FROM aggregated_events
    ),
    filtered_balances AS (
        SELECT *
        FROM cumulative_balances
        WHERE cum_provider > {min_balance_threshold}
    ),
    block_net AS (
        SELECT 
            block,
            SUM(value) AS net_value
        FROM aggregated_events
        GROUP BY block
    ),
    pool_totals AS (
        SELECT 
            block,
            SUM(net_value) OVER (ORDER BY block) AS cum_pool
        FROM block_net
    )
    SELECT 
        cb.block,
        cb.provider,
        cb.cum_provider,
        pt.cum_pool,
        CASE 
            WHEN pt.cum_pool < 1e-10 THEN 0.0
            ELSE LEAST(100.0, GREATEST(0.0, (cb.cum_provider / pt.cum_pool * 100)))
        END AS share_pct
    FROM filtered_balances cb
    JOIN pool_totals pt ON cb.block = pt.block
    WHERE (cb.cum_provider / NULLIF(pt.cum_pool, 0) * 100) >= {min_share_pct}
    ORDER BY cb.block, cb.provider
    """

    with duckdb.connect(db_path, read_only=True) as conn:
        df_result = conn.execute(query).fetch_df()

    if df_result.empty:
        return df_result

    df_result["provider_label"] = df_result["provider"].apply(create_provider_label)

    return df_result


def create_provider_label(address):
    """Return the checksummed address shortened to ``0xABCD...WXYZ`` form.

    The label keeps the first six and the last four characters of the
    checksummed address.
    """
    full = w3.to_checksum_address(address)
    head, tail = full[:6], full[-4:]
    return f"{head}...{tail}"


def add_million_block_markers(fig, min_block, max_block):
    """Draw a dashed vertical line at every multiple of 1,000,000 blocks.

    Only multiples inside ``[min_block, max_block]`` get a line; each one is
    annotated with its value in millions (for example ``"12M"``).
    """
    step = 1_000_000
    first_candidate = (min_block // step) * step
    stop = (max_block // step + 1) * step + 1

    markers = [
        block
        for block in range(first_candidate, stop, step)
        if min_block <= block <= max_block
    ]

    for block in markers:
        fig.add_vline(
            x=block,
            line_width=2,
            line_dash="dash",
            line_color="black",
            opacity=0.4,
            annotation_text=f"{block / 1_000_000:.0f}M",
            annotation_position="top",
            annotation_font_size=12,
        )


def plot_staircase_ownership(df):
    """Build a stacked step chart of each provider's pool share over blocks.

    ``df`` must provide the columns ``provider_label``, ``block`` and
    ``share_pct``. One trace is added per provider label (in sorted order)
    and million-block markers are drawn across the block span.

    Returns:
        The populated ``go.Figure``.
    """
    figure = go.Figure()
    hover = "<b>%{fullData.name}</b><br>Block: %{x}<br>Share: %{y:.4f}%<extra></extra>"
    labels = df["provider_label"]

    for label in sorted(labels.unique()):
        rows = df[labels == label].sort_values("block")
        trace = go.Scatter(
            x=rows["block"],
            y=rows["share_pct"],
            name=label,
            mode="lines",
            # "hv" draws horizontal-then-vertical steps between points.
            line={"width": 0.5, "shape": "hv"},
            stackgroup="one",
            groupnorm="",
            hovertemplate=hover,
        )
        figure.add_trace(trace)

    blocks = df["block"]
    add_million_block_markers(figure, blocks.min(), blocks.max())

    legend_layout = {
        "title": "Provider",
        "orientation": "v",
        "yanchor": "top",
        "y": 1,
        "xanchor": "left",
        "x": 1.02,
    }
    figure.update_layout(
        title="Pool Ownership Distribution (Staircase View)",
        hovermode="x",
        yaxis_title="Ownership Share (%)",
        xaxis_title="Block Number",
        legend=legend_layout,
        yaxis={"range": [0, 100]},
    )

    return figure


def plot_absolute_liquidity_staircase(df):
    """Build a stacked step chart of each provider's absolute liquidity.

    One trace is added per distinct ``provider_label`` (in sorted order),
    plotting ``cum_provider`` against ``block`` with horizontal-then-vertical
    steps.

    Args:
        df: DataFrame with ``provider_label``, ``block`` and ``cum_provider``
            columns.

    Returns:
        The assembled ``go.Figure``.
    """
    hover_text = (
        "<b>%{fullData.name}</b><br>Block: %{x}<br>Amount: %{y:.6f}<extra></extra>"
    )
    fig = go.Figure()

    for label in sorted(df["provider_label"].unique()):
        rows = df[df["provider_label"] == label].sort_values("block")
        trace = go.Scatter(
            x=rows["block"],
            y=rows["cum_provider"],
            name=label,
            mode="lines",
            line=dict(width=0.5, shape="hv"),
            stackgroup="one",
            hovertemplate=hover_text,
        )
        fig.add_trace(trace)

    first_block = df["block"].min()
    last_block = df["block"].max()
    add_million_block_markers(fig, first_block, last_block)

    # Legend sits just outside the right edge of the plotting area.
    legend_cfg = dict(
        title="Provider",
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02,
    )
    fig.update_layout(
        title="Pool Liquidity by Provider (Absolute Values)",
        hovermode="x",
        yaxis_title="Liquidity Amount (Token Units)",
        xaxis_title="Block Number",
        legend=legend_cfg,
    )

    return fig


def calculate_hhi_metrics(df):
    """Compute per-block concentration metrics from provider shares.

    Infinite and NaN ``share_pct`` values are treated as zero. For every
    block the function returns the sum of squared shares (``hhi``) and the
    number of shares strictly greater than 0.01 (``active_providers``).

    Args:
        df: DataFrame with ``block`` and ``share_pct`` columns. It is not
            modified.

    Returns:
        DataFrame with columns ``block``, ``hhi`` and ``active_providers``,
        one row per block.
    """
    raw_share = df["share_pct"]
    is_usable = ~np.isinf(raw_share) & ~np.isnan(raw_share)
    enriched = df.assign(share_pct_clean=np.where(is_usable, raw_share, 0))

    def _sum_of_squares(shares):
        return (shares**2).sum()

    def _count_active(shares):
        return (shares > 0.01).sum()

    per_block = enriched.groupby("block").agg(
        hhi=("share_pct_clean", _sum_of_squares),
        active_providers=("share_pct_clean", _count_active),
    )

    return per_block.reset_index()


def add_hhi_zones(fig):
    """Shade the three HHI bands on the primary y axis of ``fig``.

    Bands drawn (lower bound, upper bound, colour, label):
    0-1500 green "Competitive", 1500-2500 yellow "Moderate",
    2500-10000 red "Concentrated".

    Args:
        fig: Object exposing ``add_hrect`` that accepts ``secondary_y``
            (a ``make_subplots`` figure in this file).
    """
    bands = (
        {"y0": 0, "y1": 1500, "fillcolor": "green", "annotation_text": "Competitive"},
        {"y0": 1500, "y1": 2500, "fillcolor": "yellow", "annotation_text": "Moderate"},
        {
            "y0": 2500,
            "y1": 10000,
            "fillcolor": "red",
            "annotation_text": "Concentrated",
        },
    )

    for band in bands:
        fig.add_hrect(
            y0=band["y0"],
            y1=band["y1"],
            fillcolor=band["fillcolor"],
            opacity=0.1,
            annotation_text=band["annotation_text"],
            secondary_y=False,
        )


def plot_ownership_concentration(df):
    """Plot HHI and active-provider count per block on a dual-axis figure.

    The HHI series is drawn on the primary y axis and the active-provider
    count on the secondary y axis; HHI zone shading is added last.

    Args:
        df: DataFrame accepted by ``calculate_hhi_metrics``.

    Returns:
        Tuple ``(fig, hhi_df)`` where ``hhi_df`` is the output of
        ``calculate_hhi_metrics(df)``.
    """
    hhi_df = calculate_hhi_metrics(df)
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # (column, legend name, line colour, draw on secondary axis?)
    series_specs = (
        ("hhi", "HHI (Concentration)", "#F46821", False),
        ("active_providers", "Active Providers", "#29BEFD", True),
    )
    for column, trace_name, colour, on_secondary in series_specs:
        trace = go.Scatter(
            x=hhi_df["block"],
            y=hhi_df[column],
            name=trace_name,
            line=dict(color=colour, width=2),
        )
        fig.add_trace(trace, secondary_y=on_secondary)

    fig.update_layout(title="Pool Concentration Analysis", hovermode="x unified")
    fig.update_xaxes(title_text="Block Number")
    fig.update_yaxes(title_text="HHI Score", secondary_y=False)
    fig.update_yaxes(title_text="Number of Providers", secondary_y=True)

    add_hhi_zones(fig)

    return fig, hhi_df


def get_concentration_status(hhi):
    """Map an HHI score to a human-readable concentration label.

    Scores below 1500 are competitive, scores below 2500 are moderately
    concentrated, and everything else is reported as highly concentrated.
    """
    # Upper bounds are exclusive and checked in ascending order.
    labelled_bounds = (
        (1500, "✅ COMPETITIVE (Decentralized)"),
        (2500, "⚠️  MODERATE CONCENTRATION"),
    )
    for upper_bound, label in labelled_bounds:
        if hhi < upper_bound:
            return label
    return "🔴 HIGHLY CONCENTRATED"


def print_liquidity_summary(df):
    """Print each provider's final liquidity and share of the pool.

    For every ``(provider, provider_label)`` pair the most recent
    ``cum_provider`` value is printed, largest first. Providers that have a
    row at the highest block in ``df`` also get their ``share_pct`` at that
    block; the others are reported as ``(exited)``.

    Args:
        df: DataFrame with ``provider``, ``provider_label``, ``block``,
            ``cum_provider`` and ``share_pct`` columns.
    """
    max_block = df["block"].max()

    # ``last()`` is positional, so order rows by block first. The stable sort
    # keeps the incoming order for rows that share a block.
    ordered = df.sort_values("block", kind="stable")
    summary = (
        ordered.groupby(["provider", "provider_label"])["cum_provider"]
        .last()
        .sort_values(ascending=False)
    )

    # Shares at the final block, keyed by the provider value exactly as it is
    # stored in ``df``. Re-normalising the address before comparing would
    # silently miss every row when the column is not in checksum form.
    # Built once here instead of re-filtering ``df`` for every provider.
    final_shares = (
        df.loc[df["block"] == max_block]
        .drop_duplicates("provider")  # keep the first row, like ``values[0]``
        .set_index("provider")["share_pct"]
        .to_dict()
    )

    print("\n" + "=" * 60)
    print("LIQUIDITY SUMMARY")
    print("=" * 60)

    for (provider, label), amount in summary.items():
        if provider in final_shares:
            share = final_shares[provider]
            print(f"{label}: {amount:.6f} tokens ({share:.2f}% of pool)")
        else:
            print(f"{label}: {amount:.6f} tokens (exited)")


def print_concentration_summary(hhi_df):
    """Print headline concentration figures for the pool.

    Reports the mean and last ``hhi`` values, the maximum and last
    ``active_providers`` values, and the status label that
    ``get_concentration_status`` assigns to the last HHI.

    Args:
        hhi_df: DataFrame with ``hhi`` and ``active_providers`` columns;
            the last row is treated as the current state.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("CONCENTRATION METRICS")
    print(rule)

    hhi_values = hhi_df["hhi"]
    print(f"Average HHI: {hhi_values.mean():.2f}")
    latest_hhi = hhi_values.iloc[-1]
    print(f"Current HHI: {latest_hhi:.2f}")

    provider_counts = hhi_df["active_providers"]
    print(f"Max providers at any block: {provider_counts.max()}")
    print(f"Current active providers: {provider_counts.iloc[-1]}")

    print(f"Pool status: {get_concentration_status(latest_hhi)}")
    print(rule)


def analyze_pool_liquidity(
    db_path, pair_address, block_start=None, block_end=None, show_plots=True
):
    """Run the liquidity-distribution analysis for one pair and report it.

    Loads the per-provider liquidity table via
    ``calculate_pool_liquidity_pure_sql``, optionally shows the ownership,
    absolute-liquidity and concentration charts, and prints the liquidity and
    concentration summaries.

    NOTE: a later cell in this file redefines this function with additional
    bubble charts.

    Args:
        db_path: Passed through to ``calculate_pool_liquidity_pure_sql``.
        pair_address: Pair address; converted with ``w3.to_checksum_address``.
        block_start: Optional lower block bound, passed through.
        block_end: Optional upper block bound, passed through.
        show_plots: When true, build and ``show()`` every chart; otherwise
            only the concentration metrics are computed.

    Returns:
        Tuple ``(liquidity_df, concentration_metrics)``.
    """
    checksummed_pair = w3.to_checksum_address(pair_address)

    print("Calculating pool liquidity distribution...")
    liquidity_df = calculate_pool_liquidity_pure_sql(
        db_path=db_path,
        pair_address=checksummed_pair,
        block_start=block_start,
        block_end=block_end,
    )

    print(f"Total rows in liquidity data: {len(liquidity_df)}")
    first_block = liquidity_df["block"].min()
    last_block = liquidity_df["block"].max()
    print(f"Block range: {first_block} to {last_block}")
    print(f"Number of unique providers: {liquidity_df['provider'].nunique()}")

    if not show_plots:
        # The figure is discarded; only the per-block metrics are needed.
        _, concentration_metrics = plot_ownership_concentration(liquidity_df)
    else:
        print("\nGenerating percentage ownership chart...")
        plot_staircase_ownership(liquidity_df).show()

        print("Generating absolute liquidity chart...")
        plot_absolute_liquidity_staircase(liquidity_df).show()

        print("Generating concentration analysis...")
        concentration_fig, concentration_metrics = plot_ownership_concentration(
            liquidity_df
        )
        concentration_fig.show()

    print_liquidity_summary(liquidity_df)
    print_concentration_summary(concentration_metrics)

    return liquidity_df, concentration_metrics

In [None]:
# Run the liquidity analysis for the first entry of ``token_filter`` over the
# START_BLOCK..END_BLOCK range (all defined in earlier cells) and display the
# charts. The results are kept as notebook globals.
liquidity_df, concentration_metrics = analyze_pool_liquidity(
    db_path=DB_PATH,
    pair_address=token_filter[0],
    block_start=START_BLOCK,
    block_end=END_BLOCK,
    show_plots=True
)

In [None]:
def plot_bubble_ownership(df):
    """Draw one bubble per provider for the latest block in ``df``.

    Bubbles are laid out on a single row, ordered by descending
    ``share_pct``; both marker diameter (``share_pct * 10``, minimum 20) and
    colour encode the share, and each bubble is labelled with its share.

    Args:
        df: DataFrame with ``block``, ``provider_label``, ``share_pct`` and
            ``cum_provider`` columns.

    Returns:
        The assembled ``go.Figure``.
    """
    latest_block = df["block"].max()
    snapshot = (
        df[df["block"] == latest_block]
        .copy()
        .sort_values("share_pct", ascending=False)
    )
    shares = snapshot["share_pct"]

    bubble_style = dict(
        size=shares * 10,
        sizemode="diameter",
        sizemin=20,
        color=shares,
        colorscale="Viridis",
        showscale=True,
        colorbar=dict(title="Share (%)", thickness=15, len=0.7),
        line=dict(color="white", width=2),
    )
    hover_text = (
        "<b>%{x}</b><br>"
        "Share: %{customdata[0]:.4f}%<br>"
        "Amount: %{customdata[1]:.6f}"
        "<extra></extra>"
    )

    bubbles = go.Scatter(
        x=snapshot["provider_label"],
        # Every bubble shares the same y so they form one horizontal row.
        y=[1] * len(snapshot),
        mode="markers+text",
        marker=bubble_style,
        text=shares.apply("{:.2f}%".format),
        textposition="middle center",
        textfont=dict(size=14, color="white", family="Arial Black"),
        hovertemplate=hover_text,
        customdata=snapshot[["share_pct", "cum_provider"]].values,
    )

    fig = go.Figure()
    fig.add_trace(bubbles)
    fig.update_layout(
        title=f"Pool Ownership at Block {latest_block} (Bubble Size = Share %)",
        xaxis=dict(title="", tickangle=-45, showgrid=False),
        yaxis=dict(visible=False, range=[0.5, 1.5]),
        height=500,
        showlegend=False,
        hovermode="closest",
        plot_bgcolor="rgba(240, 240, 240, 0.5)",
    )

    return fig


def plot_bubble_ownership_2d(df):
    """Draw the latest-block ownership as bubbles arranged on a square grid.

    Providers are ordered by descending ``share_pct`` and placed row by row
    on a grid that is ``ceil(sqrt(n))`` columns wide. Marker diameter
    (``share_pct * 15``, minimum 30) and colour encode the share; the bubble
    text is the provider label up to the first ``(``, and the share is added
    as an annotation slightly below each bubble centre.

    Args:
        df: DataFrame with ``block``, ``provider_label``, ``share_pct`` and
            ``cum_provider`` columns.

    Returns:
        The assembled ``go.Figure``.
    """
    latest_block = df["block"].max()
    snapshot = df[df["block"] == latest_block].copy()
    snapshot = snapshot.sort_values("share_pct", ascending=False)
    snapshot = snapshot.reset_index(drop=True)

    grid_width = int(np.ceil(np.sqrt(len(snapshot))))

    # After the index reset, the index is the rank; fill the grid row by row.
    snapshot["x_pos"] = snapshot.index % grid_width
    snapshot["y_pos"] = snapshot.index // grid_width

    bubble_style = dict(
        size=snapshot["share_pct"] * 15,
        sizemode="diameter",
        sizemin=30,
        color=snapshot["share_pct"],
        colorscale="RdYlGn_r",
        showscale=True,
        colorbar=dict(title="Share (%)", thickness=20, len=0.7),
        line=dict(color="darkgray", width=3),
        opacity=0.8,
    )
    hover_text = (
        "<b>%{customdata[0]}</b><br>"
        "Share: %{customdata[1]:.4f}%<br>"
        "Amount: %{customdata[2]:.6f}"
        "<extra></extra>"
    )
    hover_columns = ["provider_label", "share_pct", "cum_provider"]

    bubbles = go.Scatter(
        x=snapshot["x_pos"],
        y=snapshot["y_pos"],
        mode="markers+text",
        marker=bubble_style,
        text=snapshot["provider_label"].str.split("(").str[0].str.strip(),
        textposition="middle center",
        textfont=dict(size=12, color="black", family="Arial Black"),
        hovertemplate=hover_text,
        customdata=snapshot[hover_columns].values,
    )

    fig = go.Figure()
    fig.add_trace(bubbles)

    label_font = dict(size=10, color="white", family="Arial Black")
    for _, provider_row in snapshot.iterrows():
        fig.add_annotation(
            x=provider_row["x_pos"],
            y=provider_row["y_pos"] - 0.15,
            text=f"{provider_row['share_pct']:.2f}%",
            showarrow=False,
            font=label_font,
            bgcolor="rgba(0,0,0,0.5)",
            borderpad=2,
        )

    fig.update_layout(
        title=f"Pool Ownership Distribution at Block {latest_block}",
        xaxis=dict(visible=False, range=[-0.5, grid_width - 0.5]),
        yaxis=dict(visible=False, scaleanchor="x", scaleratio=1),
        height=600,
        width=800,
        showlegend=False,
        hovermode="closest",
        plot_bgcolor="white",
    )

    return fig


def analyze_pool_liquidity(
    db_path, pair_address, block_start=None, block_end=None, show_plots=True
):
    """Run the liquidity-distribution analysis for one pair and report it.

    Loads the per-provider liquidity table via
    ``calculate_pool_liquidity_pure_sql``, optionally shows the ownership,
    absolute-liquidity, concentration and both bubble charts, and prints the
    liquidity and concentration summaries.

    Args:
        db_path: Passed through to ``calculate_pool_liquidity_pure_sql``.
        pair_address: Pair address; converted with ``w3.to_checksum_address``.
        block_start: Optional lower block bound, passed through.
        block_end: Optional upper block bound, passed through.
        show_plots: When true, build and ``show()`` every chart; otherwise
            only the concentration metrics are computed.

    Returns:
        Tuple ``(liquidity_df, concentration_metrics)``.
    """
    checksummed_pair = w3.to_checksum_address(pair_address)

    print("Calculating pool liquidity distribution...")
    liquidity_df = calculate_pool_liquidity_pure_sql(
        db_path=db_path,
        pair_address=checksummed_pair,
        block_start=block_start,
        block_end=block_end,
    )

    print(f"Total rows in liquidity data: {len(liquidity_df)}")
    first_block = liquidity_df["block"].min()
    last_block = liquidity_df["block"].max()
    print(f"Block range: {first_block} to {last_block}")
    print(f"Number of unique providers: {liquidity_df['provider'].nunique()}")

    def _announce_and_show(message, build_figure):
        # Print the progress line, then build and display one chart.
        print(message)
        build_figure(liquidity_df).show()

    if not show_plots:
        # The figure is discarded; only the per-block metrics are needed.
        _, concentration_metrics = plot_ownership_concentration(liquidity_df)
    else:
        _announce_and_show(
            "\nGenerating percentage ownership chart...", plot_staircase_ownership
        )
        _announce_and_show(
            "Generating absolute liquidity chart...",
            plot_absolute_liquidity_staircase,
        )

        print("Generating concentration analysis...")
        concentration_fig, concentration_metrics = plot_ownership_concentration(
            liquidity_df
        )
        concentration_fig.show()

        _announce_and_show(
            "\nGenerating bubble ownership chart...", plot_bubble_ownership
        )
        _announce_and_show(
            "\nGenerating 2D bubble ownership chart...", plot_bubble_ownership_2d
        )

    print_liquidity_summary(liquidity_df)
    print_concentration_summary(concentration_metrics)

    return liquidity_df, concentration_metrics


# Re-run the analysis with the ``analyze_pool_liquidity`` defined just above,
# which also shows the two bubble charts. Inputs come from earlier cells; the
# results overwrite the ``liquidity_df`` / ``concentration_metrics`` globals.
liquidity_df, concentration_metrics = analyze_pool_liquidity(
    db_path=DB_PATH,
    pair_address=token_filter[0],
    block_start=START_BLOCK,
    block_end=END_BLOCK,
    show_plots=True,
)