In [1]:
import json
import time
import os
import random
from pathlib import Path
import requests
from requests.exceptions import HTTPError
from web3 import Web3
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures

# === CONFIGURATION ===
# Credentials are taken from the environment when available so they do not have
# to live in the notebook.
# NOTE(review): the literal fallbacks below are kept only so existing runs keep
# working; they are exposed in source and should be rotated and then removed.
INFURA_URL = (
    os.environ.get("INFURA_URL")
    or "https://mainnet.infura.io/v3/3921fc62a7ce4cda98926f47409b3d19"
)
ETHERSCAN_API_KEY = os.environ.get("ETHERSCAN_API_KEY") or "F7K9BTHSSB9EQT9WEGHMG3VFJ54KA8RM1K"
CONTRACT_ADDRESS = POOL_ADDRESS = "0xCBCdF9626bC03E24f779434178A73a0B4bad62eD"
ABI_FILE = "WETH_WBTC_pool.json"  # Load your contract ABI file
BLOCKS_FILE = "blocks_data.json"
TRANSACTIONS_FILE = "transactions.json"
METADATA_FILE = "processed_blocks.json"
BATCH_SIZE = 1000  # Number of transactions to process before writing to disk

# === CONNECT TO ETHEREUM NODE ===
w3 = Web3(Web3.HTTPProvider(INFURA_URL))

# Fail fast: every later cell needs a working JSON-RPC connection.
assert w3.is_connected(), "Web3 provider connection failed"

In [2]:
# --------------------
# Helper Function: Get ABI from Etherscan or Disk
# --------------------
def get_abi(contract_address: str, api_key: str) -> list:
    """
    Retrieves the ABI for a given contract address.
    Checks if the ABI is available in the local 'ABI' folder.
    If not (or if the cached file is not valid JSON), it fetches the ABI from
    Etherscan using the provided API key, then saves it to disk for future use.

    Parameters:
        contract_address (str): The contract address (checksum not required here).
            It is used verbatim in the cache filename.
        api_key (str): Your Etherscan API key.

    Returns:
        list: The ABI loaded as a Python list.

    Raises:
        Exception: If Etherscan answers with a non-200 status code or reports
            a failure (status other than "1").
    """
    # Ensure the ABI folder exists (exist_ok avoids a check-then-create race).
    abi_folder = "ABI"
    os.makedirs(abi_folder, exist_ok=True)

    # Cache file is named after the contract address.
    filename = os.path.join(abi_folder, f"{contract_address}.json")

    # Serve from the cache when possible.
    if os.path.exists(filename):
        try:
            with open(filename, "r") as file:
                return json.load(file)
        except json.JSONDecodeError:
            # A truncated/corrupt cache file would otherwise fail forever; refetch instead.
            print(f"Cached ABI {filename} is not valid JSON; fetching it again.")

    # Let requests build and escape the query string; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    response = requests.get(
        "https://api.etherscan.io/api",
        params={
            "module": "contract",
            "action": "getabi",
            "address": contract_address,
            "apikey": api_key,
        },
        timeout=30,
    )
    if response.status_code != 200:
        raise Exception("Error connecting to the Etherscan API. Status code: " + str(response.status_code))

    data = response.json()
    if data.get("status") != "1":
        raise Exception(f"Error fetching ABI for contract {contract_address}: {data.get('result')}")

    # Etherscan returns the ABI as a JSON-encoded string inside "result".
    abi = json.loads(data["result"])

    # Write to a temporary file and rename, so an interrupted write never
    # leaves a half-written cache entry behind.
    tmp_filename = f"{filename}.tmp"
    with open(tmp_filename, "w") as file:
        json.dump(abi, file)
    os.replace(tmp_filename, filename)
    return abi
    
# -----------------------
# Helper: Convert event to dict
# -----------------------
def event_to_dict(event):
    """
    Return a plain-dict copy of an event.

    The nested "args" entry (when present) is copied into a plain dict, and
    the "transactionHash" / "blockHash" entries (when present) are replaced
    by the result of calling their .hex() method. The input is not modified.
    """
    converted = dict(event)
    if "args" in converted:
        converted["args"] = dict(converted["args"])
    # Both hash fields get the same treatment; handle them in one pass.
    for hash_field in ("transactionHash", "blockHash"):
        if hash_field in converted:
            converted[hash_field] = converted[hash_field].hex()
    return converted

# -----------------------
# Metadata Computation from Events File
# -----------------------
def load_metadata_from_events():
    """
    Build per-chunk event counts by scanning EVENTS_FILE line by line.

    Each non-blank line is parsed as one JSON event. Events are bucketed by
    block number into ranges of CHUNK_SIZE blocks, and counted per event type.

    NOTE(review): EVENTS_FILE and CHUNK_SIZE are read as module-level globals
    but are not defined in the visible part of this notebook — confirm they
    are set before this function is called.

    Returns:
        dict: Maps a chunk key such as "0-9999" to a dict of
        {event type: count}. Empty if EVENTS_FILE does not exist.
    """
    counts_by_chunk = {}
    try:
        with open(EVENTS_FILE, "r") as events_fh:
            for raw_line in events_fh:
                # Blank lines carry no event.
                if not raw_line.strip():
                    continue
                try:
                    record = json.loads(raw_line)
                    block_no = int(record.get("blockNumber", 0))
                    kind = record.get("event", "Unknown")
                    lower = (block_no // CHUNK_SIZE) * CHUNK_SIZE
                    upper = lower + CHUNK_SIZE - 1
                    bucket = counts_by_chunk.setdefault(f"{lower}-{upper}", {})
                    bucket[kind] = bucket.get(kind, 0) + 1
                except Exception as e:
                    # A bad line is reported and skipped; the scan goes on.
                    print(f"Error processing a line in events file: {e}")
    except FileNotFoundError:
        # No events file yet means there is nothing to count.
        pass
    return counts_by_chunk


def get_contract_creation_block(contract_address: str, etherscan_api_key: str) -> int:
    """
    Retrieves the contract creation block from Etherscan.

    Parameters:
        contract_address (str): Address of the contract to look up.
        etherscan_api_key (str): Your Etherscan API key.

    Returns:
        int: The block number reported in the first creation record.

    Raises:
        Exception: If Etherscan reports a failure (status other than "1")
            or returns no creation records.
    """
    # Let requests build the query string; the timeout keeps a stalled
    # connection from hanging the notebook indefinitely.
    response = requests.get(
        "https://api.etherscan.io/api",
        params={
            "module": "contract",
            "action": "getcontractcreation",
            "contractaddresses": contract_address,
            "apikey": etherscan_api_key,
        },
        timeout=30,
    )
    data = response.json()

    if data.get("status") != "1":
        # "result" is not guaranteed to be a string, so format it instead of
        # concatenating (which would raise TypeError and hide the real error).
        detail = data.get("result") or "Unknown error"
        raise Exception(f"Error fetching creation block: {detail}")

    results = data.get("result") or []
    if not results:
        raise Exception("No contract creation data found.")
    return int(results[0]["blockNumber"])


# -- Step 2: Reconstruct an Event’s Signature --
def get_event_signature(event_name: str, abi: list) -> str:
    """
    Given an event name and an ABI, find the event definition and reconstruct its signature.
    For example, for event Transfer(address,address,uint256) this returns its keccak256 hash.

    Struct (tuple) inputs are expanded into their canonical parenthesised
    form, e.g. a `tuple[]` of (address, uint256) becomes `(address,uint256)[]`,
    because the literal word "tuple" never appears in a canonical signature.

    Parameters:
        event_name (str): Name of the event to look up.
        abi (list): Contract ABI as a list of entry dicts.

    Returns:
        str: Hex-encoded keccak256 hash of the signature of the first
        matching event entry.

    Raises:
        ValueError: If no event with that name exists in the ABI.
    """
    from eth_utils import keccak, encode_hex

    def canonical_type(param: dict) -> str:
        # Expand tuple types recursively, keeping any array suffix ("[]", "[3]").
        abi_type = param["type"]
        if abi_type.startswith("tuple"):
            inner = ",".join(canonical_type(component) for component in param.get("components", []))
            return f"({inner}){abi_type[len('tuple'):]}"
        return abi_type

    for item in abi:
        if item.get("type") == "event" and item.get("name") == event_name:
            # Build the signature string: "Transfer(address,address,uint256)"
            types = ",".join(canonical_type(inp) for inp in item.get("inputs", []))
            signature = f"{event_name}({types})"
            return encode_hex(keccak(text=signature))
    raise ValueError(f"Event {event_name} not found in ABI.")

    
# Alternative token address, currently disabled; left here for reference.
#token_address = w3.to_checksum_address("0x86c8bF8532AA2601151c9DbbF4e4C4804e042571")
#token_abi = get_abi(token_address, ETHERSCAN_API_KEY)
#token_contract = w3.eth.contract(address=token_address, abi=token_abi)

# Active token: checksum the address, load its ABI via get_abi() (served from
# the local ABI/ cache when present, otherwise fetched from Etherscan), and
# bind a contract object to it.
# NOTE(review): presumably the WETH token, given the WETH/WBTC pool named in
# ABI_FILE — confirm.
token_address = w3.to_checksum_address("0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2")
token_abi = get_abi(token_address, ETHERSCAN_API_KEY)
token_contract = w3.eth.contract(address=token_address, abi=token_abi)


In [2]:
# Example: get the Transfer event signature.
# The hash is computed from the Transfer entry in token_abi and only printed
# here; nothing else in the visible cells uses transfer_sig.
transfer_sig = get_event_signature("Transfer", token_abi)
print("Transfer signature hash:", transfer_sig)

# -- Step 3: Determine Token Genesis Block and Set Starting Block --
# get_contract_creation_block() (defined in an earlier cell) asks Etherscan for
# the block in which the token contract was created. The scan starts one block
# before that, clamped so it never goes below block 0.
try:
    genesis_block = get_contract_creation_block(token_address, ETHERSCAN_API_KEY)
    start_block = max(genesis_block - 1, 0)
except Exception as e:
    # Best-effort fallback: on any lookup failure the scan starts from block 0,
    # which covers the whole chain and is therefore much slower.
    print("Error retrieving genesis block, defaulting to block 0:", e)
    start_block = 0

# -- Step 4: Fetch Transfer Events and Dump to a File (JSON Serializing) --
def get_transfer_events_paginated(token_contract, from_block: int, to_block: int, chunk_size: int = 5000, max_workers: int = 1) -> list:
    """
    Fetches Transfer events for a token_contract in the block range [from_block, to_block],
    paginating by chunk_size to avoid Infura's result limit. Uses moderate parallelization.

    Events are returned in ascending chunk order regardless of which worker
    finishes first. Chunks that cannot be fetched are skipped (best effort),
    and a summary of the skipped ranges is printed so the gap is not silent.

    Args:
        token_contract: A Web3 contract instance with a loaded ABI.
        from_block (int): The starting block number (inclusive).
        to_block (int): The ending block number (inclusive).
        chunk_size (int): How many blocks to query per chunk (default 5000).
        max_workers (int): Maximum number of parallel workers (default 1).

    Returns:
        List of events. Empty if from_block > to_block.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    max_retries = 5

    # Divide the full range into inclusive (start, end) chunks.
    block_ranges = [
        (start_blk, min(start_blk + chunk_size - 1, to_block))
        for start_blk in range(from_block, to_block + 1, chunk_size)
    ]

    def fetch_range(brange):
        """Fetch one chunk; returns its events, or None if the chunk failed."""
        print(f"Fetching for {brange}")
        start_blk, end_blk = brange
        for _ in range(max_retries):
            try:
                # Add delay to mitigate rate limits.
                time.sleep(random.uniform(1, 3))
                events = token_contract.events.Transfer.get_logs(from_block=start_blk, to_block=end_blk)
                print(len(events))
                return events
            except Exception as e:
                # Only rate-limit responses are retried; the check is a plain
                # substring match on the error text.
                if "429" not in str(e):
                    print(f"Error fetching logs for blocks {start_blk}-{end_blk}: {e}")
                    return None
                sleep_time = random.uniform(1, 5)
                print(f"429 error for blocks {start_blk}-{end_blk}: retrying after {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)
        print(f"Giving up on blocks {start_blk}-{end_blk} after {max_retries} rate-limited attempts.")
        return None

    events_collected = []
    failed_ranges = []

    # Use moderate parallelization. Results are consumed in submission order
    # (not completion order) so the output stays sorted by block range.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_range, brange) for brange in block_ranges]
        for brange, future in zip(block_ranges, futures):
            events = future.result()
            if events is None:
                failed_ranges.append(brange)
            else:
                events_collected.extend(events)

    if failed_ranges:
        print(f"WARNING: {len(failed_ranges)} block range(s) could not be fetched and are missing from the result: {failed_ranges}")

    return events_collected
    
# Custom function to convert a web3 event (and its custom types) to a plain dict.
def serialize_event(event):
    """
    Convert a web3 event (and its custom types) to a plain, JSON-friendly dict.

    Byte-like values (bytes, bytearray, memoryview and their subclasses) are
    replaced by the result of their .hex() method wherever they appear,
    including inside lists, tuples and nested mappings. Other values such as
    ints, floats and strings are passed through unchanged. The input event is
    not modified.

    Parameters:
        event: A mapping (or anything accepted by dict()) describing the event.

    Returns:
        dict: A new dict with nested mappings turned into dicts and
        lists/tuples turned into lists.
    """
    from collections.abc import Mapping

    def to_jsonable(value):
        # Type-check instead of hasattr(value, "hex"): floats also expose
        # .hex() and must not be turned into hex strings.
        if isinstance(value, (bytes, bytearray, memoryview)):
            return value.hex()
        if isinstance(value, Mapping):
            return {key: to_jsonable(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [to_jsonable(item) for item in value]
        return value

    # Convert the AttributeDict to a normal dict.
    event_dict = {key: to_jsonable(value) for key, value in dict(event).items()}
    # "args" is always coerced through dict(), then converted recursively.
    if "args" in event_dict:
        event_dict["args"] = to_jsonable(dict(event_dict["args"]))
    return event_dict

# Fetch logs from start_block to the current block (latest)
latest_block = w3.eth.block_number

event_list = get_transfer_events_paginated(token_contract, start_block, latest_block)

# Convert every event to plain JSON-friendly data *before* opening the output
# file, so a serialization failure cannot leave a truncated file behind.
serialized_events = [serialize_event(event) for event in event_list]

# Dump the result to a file (pretty-printing the JSON).
# The file is named after the token whose Transfer events were fetched.
output_filename = f"transfer_events_{token_address}.json"
with open(output_filename, "w") as f:
    json.dump(serialized_events, f, indent=4)

print(f"Dumped {len(serialized_events)} events to {output_filename}")


Transfer signature hash: 0xddf252ad1be2c89b69c2b068fc378daa952ba7f163c4a11628f55a4df523b3ef
Fetching for (4719567, 4724566)
0
Fetching for (4724567, 4729566)
0
Fetching for (4729567, 4734566)
0
Fetching for (4734567, 4739566)
0
Fetching for (4739567, 4744566)
0
Fetching for (4744567, 4749566)
0
Fetching for (4749567, 4754566)
32
Fetching for (4754567, 4759566)
573
Fetching for (4759567, 4764566)
829
Fetching for (4764567, 4769566)
480
Fetching for (4769567, 4774566)
516
Fetching for (4774567, 4779566)
1290
Fetching for (4779567, 4784566)
525
Fetching for (4784567, 4789566)
829
Fetching for (4789567, 4794566)
468
Fetching for (4794567, 4799566)
409
Fetching for (4799567, 4804566)
376
Fetching for (4804567, 4809566)
269
Fetching for (4809567, 4814566)
510
Fetching for (4814567, 4819566)
222
Fetching for (4819567, 4824566)
397
Fetching for (4824567, 4829566)
348
Fetching for (4829567, 4834566)
244
Fetching for (4834567, 4839566)
476
Fetching for (4839567, 4844566)
462
Fetching for (484456


KeyboardInterrupt


KeyboardInterrupt

