In [None]:
import os
import gzip
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import json
import dataclasses
import gdown
from pathlib import Path

In [None]:
# Location of the sampled communities downloaded from Kaggle.
# TODO: Change this to your local path.
raw_sampled_communities_dir = os.path.abspath(
    os.path.expanduser("~") + "/eba_sampled_g/s2s_200k/raw"
)

In [None]:
# Location of the crawled WalletExplorer data, i.e. the output of
# step_1_crawl_walletexplorer_data.ipynb.
# TODO: Change this to your local path.
wallet_explorer_crawled_dir = os.path.abspath(
    os.path.expanduser("~") + "/eba_graph_studio/walletexplorer/crawled/"
)
os.makedirs(wallet_explorer_crawled_dir, exist_ok=True)

# The two input files this notebook reads from that directory.
wallet_addresses_file_path = os.path.join(wallet_explorer_crawled_dir, "wallet_addresses.tsv.gz")
wallet_by_category_file_path = os.path.join(wallet_explorer_crawled_dir, "extended_wallets_by_category.json")

# ** Note **
# Any of the files that is not already present locally is downloaded first.
for _url, _target_path in (
    ("https://drive.google.com/uc?id=17QuNTq2vBL3U0CyVu40nhCmEGPKT4yC4", wallet_addresses_file_path),
    ("https://drive.google.com/uc?id=1a5_xSsIm6zX_07MpQYQaNq12M504Vo5z", wallet_by_category_file_path),
):
    if os.path.isfile(_target_path):
        continue
    gdown.download(_url, _target_path, quiet=False)

In [None]:
# Build the address -> wallet lookup from the gzipped TSV.
# Every non-blank line must hold exactly two tab-separated fields
# (<address>\t<wallet>); anything else still raises ValueError so that
# malformed records are not silently dropped.
addresses = {}
with gzip.open(wallet_addresses_file_path, "rt") as f:
    for line in tqdm(f, desc="Reading wallet addresses"):
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no record; unpacking it would raise ValueError.
            continue
        address, wallet = line.split("\t")
        addresses[address] = wallet

In [None]:
# Load the category -> list-of-wallets mapping.
# The context manager closes the file handle deterministically (the bare
# json.load(open(...)) form left it open until garbage collection), and the
# explicit encoding keeps decoding independent of the platform's locale.
with open(wallet_by_category_file_path, encoding="utf-8") as f:
    wallets_categories = json.load(f)

In [None]:
@dataclasses.dataclass
class Wallet:
    """Descriptive fields kept for a single wallet entry of the category file."""
    # Value of the entry's "label" field.
    label: str
    # Value of the entry's "address_count" field.
    # NOTE(review): annotated as str, but dataclasses do not enforce
    # annotations -- confirm the actual type stored in the JSON.
    address_count: str
    # Key of the category group the entry was listed under.
    category: str

In [None]:
# Flatten the per-category wallet lists into one lookup keyed by wallet id.
# If an id occurs more than once, the entry encountered last wins.
wallets = {
    entry["id"]: Wallet(
        label=entry["label"],
        address_count=entry["address_count"],
        category=category_name,
    )
    for category_name, entries in wallets_categories.items()
    for entry in entries
}

In [None]:
# Read the metadata table of the sampled communities and collect its GraphID column.
metadata_path = os.path.join(raw_sampled_communities_dir, "metadata.tsv")
graphs = pd.read_csv(metadata_path, sep="\t")
graph_ids = graphs["GraphID"].tolist()

In [None]:
def get_wallet_details(address):
    """Look up the wallet annotation for a single address.

    The address is resolved to a wallet id through the module-level
    ``addresses`` mapping, and that id is then resolved through the
    module-level ``wallets`` mapping.

    Args:
        address: Key to look up in ``addresses``.

    Returns:
        A 3-tuple ``(wallet_id, label, category)`` when both lookups yield a
        truthy value; otherwise ``(np.nan, np.nan, np.nan)``.
    """
    not_found = (np.nan, np.nan, np.nan)

    wallet_id = addresses.get(address)
    if not wallet_id:
        return not_found

    wallet_info = wallets.get(wallet_id)
    if not wallet_info:
        return not_found

    return (wallet_id, wallet_info.label, wallet_info.category)

In [None]:
# Annotate the script nodes of every sampled graph with WalletExplorer data and
# write the result next to the input as BitcoinScriptNode_Annotated.tsv.
# Graphs whose BitcoinScriptNode.tsv is missing are reported and skipped.
annotation_columns = ["WalletExplorer_WalletID", "WalletExplorer_WalletLabel", "WalletExplorer_Category"]

for graph_id in tqdm(graph_ids):
    graph_dir = os.path.join(raw_sampled_communities_dir, str(graph_id))
    input_filename = os.path.join(graph_dir, "BitcoinScriptNode.tsv")
    if not os.path.isfile(input_filename):
        print(f"File not found: {input_filename}, skipping.")
        continue

    script_nodes_df = pd.read_csv(input_filename, sep="\t")

    # One (wallet_id, label, category) tuple per row; all-NaN when the address
    # has no known wallet.
    mapped_tuples = script_nodes_df["Address"].map(get_wallet_details)

    # Passing `columns` explicitly keeps the frame three columns wide even when
    # the input has no rows; without it an empty input yields a zero-column
    # frame and the multi-column assignment raises ValueError.
    script_nodes_df[annotation_columns] = pd.DataFrame(
        mapped_tuples.tolist(), index=script_nodes_df.index, columns=annotation_columns
    )

    # Unmatched addresses stay NaN, which to_csv writes as empty fields.
    script_nodes_df.to_csv(os.path.join(graph_dir, "BitcoinScriptNode_Annotated.tsv"), sep="\t", index=False)