<a href="https://colab.research.google.com/github/Altemir1/crypto-DL-based-trading-system/blob/main/ETH-onchain-data-collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data collection

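Fetch Ethereum mainnet transaction data through the Alchemy API (via `web3.py`). To keep the number of requests small, the notebook samples one block every `BLOCK_STEP` blocks instead of downloading every block.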

In [5]:
!pip install web3

Collecting web3
  Downloading web3-7.8.0-py3-none-any.whl.metadata (5.5 kB)
Collecting eth-abi>=5.0.1 (from web3)
  Downloading eth_abi-5.2.0-py3-none-any.whl.metadata (3.8 kB)
Collecting eth-account>=0.13.1 (from web3)
  Downloading eth_account-0.13.5-py3-none-any.whl.metadata (3.8 kB)
Collecting eth-hash>=0.5.1 (from eth-hash[pycryptodome]>=0.5.1->web3)
  Downloading eth_hash-0.7.1-py3-none-any.whl.metadata (4.2 kB)
Collecting eth-typing>=5.0.0 (from web3)
  Downloading eth_typing-5.2.0-py3-none-any.whl.metadata (3.2 kB)
Collecting eth-utils>=5.0.0 (from web3)
  Downloading eth_utils-5.2.0-py3-none-any.whl.metadata (5.4 kB)
Collecting hexbytes>=1.2.0 (from web3)
  Downloading hexbytes-1.3.0-py3-none-any.whl.metadata (3.3 kB)
Collecting types-requests>=2.0.0 (from web3)
  Downloading types_requests-2.32.0.20250301-py3-none-any.whl.metadata (2.3 kB)
Collecting websockets<14.0.0,>=10.0.0 (from web3)
  Downloading websockets-13.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylin

In [1]:

import os


def load_api_keys(raw):
    """Split a comma-separated string of API keys into a clean list.

    Args:
        raw: text such as ``"key1, key2,key3"``.

    Returns:
        list[str]: the keys with surrounding whitespace removed; empty entries
        (e.g. from a trailing comma or an empty string) are dropped.
    """
    return [key.strip() for key in raw.split(",") if key.strip()]


# List of multiple API keys to rotate.
# Preferred: set the ALCHEMY_API_KEYS environment variable to "key1,key2,..." so
# real keys are never saved inside the notebook.
# Fallback: replace the placeholder below by hand (do not commit real keys).
API_KEYS = load_api_keys(os.environ.get("ALCHEMY_API_KEYS", "")) or [
    'Write your keys here. Multiple keys would be perfect'
]


In [23]:
import asyncio
import aiohttp
import random
import nest_asyncio
from web3 import Web3
from tqdm import tqdm
import time
import pandas as pd

# Fix event loop issue in Jupyter Notebook
# NOTE(review): the functions visible in this notebook are synchronous and do not
# use asyncio/aiohttp, so this patch may be a leftover from an earlier async
# version — confirm nothing else relies on it before removing.
nest_asyncio.apply()

# Function to get a random API key
def get_random_api_key():
    """Return one entry of API_KEYS, picked at random on every call."""
    chosen_key = random.choice(API_KEYS)
    return chosen_key

# Function to connect to Web3 using a random API key
def get_web3():
    """Create a Web3 client for the Alchemy Ethereum-mainnet endpoint.

    A key is drawn with get_random_api_key() on every call, so each client
    returned may be bound to a different API key.

    Returns:
        Web3: a new instance wrapping a Web3.HTTPProvider for the endpoint URL.
    """
    endpoint = "https://eth-mainnet.alchemyapi.io/v2/{}".format(get_random_api_key())
    http_provider = Web3.HTTPProvider(endpoint)
    return Web3(http_provider)

# Define the range of blocks to sample
START_BLOCK = 8000000  # First sampled block (its timestamp falls on 2019-06-21)
END_BLOCK = get_web3().eth.block_number  # Latest block number when this cell runs
BLOCK_STEP = 100000  # Sampling stride: fetch one block out of every 100,000

# Exact number of blocks that range(START_BLOCK, END_BLOCK, BLOCK_STEP) visits.
# Plain floor division undercounts by one whenever the span is not a multiple of
# BLOCK_STEP, which makes the progress bar overrun its total.
total_batches = len(range(START_BLOCK, END_BLOCK, BLOCK_STEP))

# Function to fetch transactions **without async/await**
def fetch_block_transactions(block_number):
    """Download one block and flatten its transactions into plain dicts.

    Args:
        block_number: number of the block to request.

    Returns:
        list[dict]: one record per transaction with the keys ``timestamp``,
        ``from``, ``to``, ``value`` (Wei divided by 10**18), ``gas_price``
        (Wei divided by 10**9) and ``hash`` (hex string). If anything fails
        while fetching or converting, the error is printed and an empty list
        is returned instead of raising.
    """
    client = get_web3()  # new connection per call, so the API key is re-drawn
    records = []
    try:
        block = client.eth.get_block(block_number, full_transactions=True)
        for tx in block.transactions:
            record = {
                "timestamp": block.timestamp,
                "from": tx["from"],
                "to": tx["to"],
                "value": tx["value"] / 10**18,  # Wei -> ETH
                "gas_price": tx["gasPrice"] / 10**9,  # Wei -> Gwei
                "hash": tx["hash"].hex(),
            }
            records.append(record)
    except Exception as e:
        print(f"Error fetching block {block_number}: {e}")
        # Discard any partially built result, exactly like a failed request.
        return []
    return records

# Main function to fetch all sampled blocks, one request after another (sequential)
def fetch_all_transactions(pause_seconds=0.5):
    """Collect the transactions of every sampled block into a single list.

    Visits START_BLOCK, START_BLOCK + BLOCK_STEP, ... (END_BLOCK excluded) and
    fetches each block with fetch_block_transactions. Requests are issued
    sequentially, with a pause after each one.

    Args:
        pause_seconds: seconds to sleep after each request. Defaults to 0.5,
            the value that used to be hard-coded.

    Returns:
        list[dict]: the transaction records of all visited blocks, in block
        order. A block for which fetch_block_transactions returns an empty
        list contributes nothing.
    """
    block_numbers = range(START_BLOCK, END_BLOCK, BLOCK_STEP)
    all_transactions = []
    # Handing the range to tqdm lets it take the exact total from len(), so the
    # progress bar cannot overrun a separately computed estimate.
    for block_number in tqdm(block_numbers, desc="Fetching Blocks", unit="batch"):
        all_transactions.extend(fetch_block_transactions(block_number))

        # Avoid rate limit by sleeping for a short duration
        time.sleep(pause_seconds)

    return all_transactions



In [24]:
# Run the collection. Blocks are requested one at a time with a pause between
# requests, so this cell takes a while to finish.
all_transactions = fetch_all_transactions()

Fetching Blocks: 140batch [01:41,  1.38batch/s]


In [25]:
# One row per transaction record returned by fetch_all_transactions.
df = pd.DataFrame(all_transactions)
df.head()

Unnamed: 0,timestamp,from,to,value,gas_price,hash
0,1561100149,0xBcD44f9795cddD1358dcC3bEF160772FcD607CA4,0x8E71B195D9CC953F46f41aD013bA1147464b621d,3.225223,100.0,ef1ef76dc23bd29ddb64c373897e5b4430766cf08d11f4...
1,1561100149,0xa93b74DA13F39e5E558e2037034A43F0456E7b8B,0x3597bfD533a99c9aa083587B074434E61Eb0A258,0.0,99.0,b4e2d06aff9b0a42662f6b34904aa7229802d230e0c253...
2,1561100149,0x5E032243d507C743b061eF021e2EC7fcc6d3ab89,0xBF37A0D9D0fb49e876cc685afAfD581F8F136306,0.874485,60.0,3027cd8fa04ba932ccbd549cdb3a48e5c3088bf52512e1...
3,1561100149,0x4c8006474754C6D3E14463aa9c863FB66F4ADa22,0x06404399e748CD83F25AB163711F9F4D61cfd0e6,0.0,50.0,458ccc8137dd713d375584015b97b00bca4c60f435fd26...
4,1561100149,0xfeBE9573660c42BC582cFFd1088f3716f4cfc449,0xeF06A35c1928B655AbA17fe637F066CD8EfF9B7e,0.01,41.0,a836106d09c2b7d04d99a9e997bf6d30c5057a2535ff7f...


In [26]:
# (rows, columns) of the collected sample.
df.shape

(23860, 6)

In [27]:
# Replace the Unix "timestamp" column with a calendar "date" column (YYYY-MM-DD).
# The conversion is done in UTC: time.localtime() would depend on the timezone of
# the machine running the kernel, so the same block could get different dates.
# The guard keeps the cell re-runnable once "timestamp" has already been dropped.
if "timestamp" in df.columns:
    df = df.assign(
        date=pd.to_datetime(df["timestamp"], unit="s", utc=True).dt.strftime("%Y-%m-%d")
    ).drop(columns=["timestamp"])

df.head()

Unnamed: 0,from,to,value,gas_price,hash,date
0,0xBcD44f9795cddD1358dcC3bEF160772FcD607CA4,0x8E71B195D9CC953F46f41aD013bA1147464b621d,3.225223,100.0,ef1ef76dc23bd29ddb64c373897e5b4430766cf08d11f4...,2019-06-21
1,0xa93b74DA13F39e5E558e2037034A43F0456E7b8B,0x3597bfD533a99c9aa083587B074434E61Eb0A258,0.0,99.0,b4e2d06aff9b0a42662f6b34904aa7229802d230e0c253...,2019-06-21
2,0x5E032243d507C743b061eF021e2EC7fcc6d3ab89,0xBF37A0D9D0fb49e876cc685afAfD581F8F136306,0.874485,60.0,3027cd8fa04ba932ccbd549cdb3a48e5c3088bf52512e1...,2019-06-21
3,0x4c8006474754C6D3E14463aa9c863FB66F4ADa22,0x06404399e748CD83F25AB163711F9F4D61cfd0e6,0.0,50.0,458ccc8137dd713d375584015b97b00bca4c60f435fd26...,2019-06-21
4,0xfeBE9573660c42BC582cFFd1088f3716f4cfc449,0xeF06A35c1928B655AbA17fe637F066CD8EfF9B7e,0.01,41.0,a836106d09c2b7d04d99a9e997bf6d30c5057a2535ff7f...,2019-06-21


In [29]:
# Persist the sample to disk. With pandas' default index=True the row index is
# written as an unnamed first column.
# NOTE(review): pass index=False if no downstream reader expects that column — confirm.
df.to_csv("eth-onchain-data.csv")