This notebook crawls WalletExplorer.com to download Bitcoin wallet data, 
such as identified wallets, their labels (if they belong to known entities), 
and their associated addresses.


The process involves two steps: 
first, getting the list of labeled Bitcoin wallets (those with known associated entities), and 
second, getting their associated addresses. 


We contacted the author of WalletExplorer.com, 
who confirmed that only the wallets listed on the landing page have labels.

## This is a long-running task & you do not need to rerun it

This crawling task can take several days to complete, 
depending on your internet connection and the responsiveness of WalletExplorer.com.


However, since WalletExplorer's data is not updated frequently, 
we provide the output of this task; 
hence, you do not need to re-run this step and can use the pre-computed data directly.

Download links for the pre-computed data are provided in `step_2_*.ipynb`.

In [None]:
# Uncomment the next line to install the dependencies listed in requirements.txt.
# !pip install -r requirements.txt

In [None]:
import requests
import tqdm
from bs4 import BeautifulSoup
import os
import json
import dataclasses
from tqdm.notebook import tqdm

In [None]:
# Directory that receives the files written by this notebook.
output_path = "./walletexplorer_data"
# exist_ok=True keeps this cell re-runnable: without it, os.makedirs raises
# FileExistsError as soon as the directory exists from a previous run.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Download the WalletExplorer landing page; its HTML is parsed in the next cell.
target_url = "https://www.walletexplorer.com"

# Send a browser-like User-Agent header along with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

response = requests.get(target_url, headers=headers, timeout=10)
# Stop here on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
html_content = response.text

In [None]:
# Parse the landing page into a mapping {category name: [wallet label, ...]}.
wallet_data = {}

soup = BeautifulSoup(html_content, "lxml")
services_table = soup.find("table", class_="serviceslist")
if services_table is None:
    # find() returns None when nothing matches; fail with an actionable
    # message instead of an AttributeError on the next line.
    raise ValueError(
        'Could not find <table class="serviceslist"> in the landing page HTML; '
        "the page layout may have changed or the request may have been blocked."
    )
categories = services_table.find_all("td")

for category_td in categories:
    category_header = category_td.find("h3")
    if not category_header:
        # Cells without an <h3> heading do not describe a category.
        continue

    # The heading text ends with ":", which is not part of the category name.
    category_name = category_header.get_text(strip=True).rstrip(":")

    # Keep only links that point to a wallet page and strip the URL prefix,
    # leaving the wallet label.
    wallet_labels = [
        href.removeprefix("/wallet/")
        for href in (link.get("href") for link in category_td.find_all("a"))
        if href and href.startswith("/wallet/")
    ]

    # Categories without any wallet link are left out of the result.
    if wallet_labels:
        wallet_data[category_name] = wallet_labels

In [None]:
# Save the category -> wallet-label mapping as indented JSON.
wallets_by_category_path = os.path.join(output_path, "wallets_by_category.json")
with open(wallets_by_category_path, "w") as json_file:
    json_file.write(json.dumps(wallet_data, indent=4))

In [None]:
@dataclasses.dataclass
class WalletInfo:
    """Summary of one wallet, built from the wallet-addresses API response.

    Instances are created in `get_wallet_addresses` from the first page of the
    response and converted to plain dicts with `dataclasses.asdict` before
    being written to JSON.
    """
    # Value of the "wallet_id" field of the API response ("" when missing).
    id: str
    # Value of the "label" field of the API response ("" when missing).
    label: str
    # Value of the "addresses_count" field of the API response (0 when missing).
    address_count: int
    # Number of address entries retrieved while paging through the API;
    # initialised to 0 and set once the crawl of the wallet has finished.
    added_address_count: int

In [None]:
def get_wallet_addresses(wallet, output_filename, timeout=60):
    """Download all addresses of one wallet and append them to a TSV file.

    The WalletExplorer ``wallet-addresses`` API is queried page by page
    (1000 addresses per request) until a page without addresses is returned.
    Every address is appended to ``output_filename`` as one tab-separated
    line: the address followed by the wallet id.

    Args:
        wallet: Value sent as the ``wallet`` query parameter of the API.
        output_filename: Path of the TSV file. It is opened in append mode,
            so existing content is kept and new lines are added at the end.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        A ``WalletInfo`` built from the first page of the response, with
        ``added_address_count`` set to the number of address entries retrieved.

    Raises:
        requests.HTTPError: If a request is answered with an error status.
        requests.Timeout: If a request does not complete within ``timeout``.
    """
    url = "https://www.walletexplorer.com/api/1/wallet-addresses"
    pagination_from = 0
    pagination_count = 1000
    wallet_info = None
    retrieved_addresses_count = 0

    pbar = None

    try:
        with open(output_filename, "a") as out_f:
            while True:
                params = {"wallet": wallet, "from": pagination_from, "count": pagination_count}

                response = requests.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                wallet_info_json = response.json()

                if pagination_from == 0:
                    # The wallet metadata and the progress bar are set up once,
                    # from the first page of the response.
                    wallet_info = WalletInfo(
                        id=wallet_info_json.get("wallet_id", ""),
                        label=wallet_info_json.get("label", ""),
                        address_count=wallet_info_json.get("addresses_count", 0),
                        added_address_count=0
                    )

                    pbar = tqdm(total=wallet_info.address_count, desc=f"Processing wallet: {wallet_info.label}", position=1, leave=False, bar_format="\u2003\u2003\u2003{l_bar}{bar}{r_bar}", colour="orange")

                addresses_info = wallet_info_json.get("addresses", [])
                if len(addresses_info) == 0:
                    # An empty page marks the end of the wallet's address list.
                    break

                for address_info in addresses_info:
                    pbar.update(1)
                    address = address_info.get("address")
                    retrieved_addresses_count += 1
                    if not address:
                        # Entries without an address are counted but not written.
                        continue
                    out_f.write(f"{address}\t{wallet_info.id}\n")

                pagination_from += pagination_count
    finally:
        # Always release the per-wallet progress bar, also when a request
        # fails; otherwise one bar per wallet is left open.
        if pbar is not None:
            pbar.close()

    wallet_info.added_address_count = retrieved_addresses_count
    return wallet_info

In [None]:
# Crawl the addresses of every wallet, one category at a time.
# All addresses are appended to a single TSV file, while the per-wallet
# summaries are collected by category in `extended_wallet_data`.

addresses_filename = os.path.join(output_path, "wallet_addresses.tsv")
extended_wallet_data = {}

for category, wallets in wallet_data.items():
    # Register the category before crawling so that summaries gathered so far
    # stay available in the dict even if a later request fails.
    category_summaries = extended_wallet_data[category] = []

    category_progress = tqdm(wallets, desc=f"Processing category: {category}", leave=True, position=0)
    for wallet in category_progress:
        wallet_info = get_wallet_addresses(wallet, addresses_filename)
        category_summaries.append(dataclasses.asdict(wallet_info))

In [None]:
# Save the per-category wallet summaries as indented JSON.
extended_wallets_path = os.path.join(output_path, "extended_wallets_by_category.json")
with open(extended_wallets_path, "w") as json_file:
    json_file.write(json.dumps(extended_wallet_data, indent=4))