## Setting up the environment for the project


In [None]:
# Required libraries
import requests
from bs4 import BeautifulSoup
import urllib.robotparser
import pandas as pd
import matplotlib.pyplot as plt

import time
import re



## Configuration


In [None]:
# Root URL of the WalletExplorer site.
BASE_URL = "https://www.walletexplorer.com"

# Desktop Chrome user-agent string, assembled from adjacent literals.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/114.0.0.0 Safari/537.36"
)

# Browser-like HTTP request headers.
HEADERS = {
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

### Check the robots.txt file


In [None]:

# # Function to check if a URL is allowed by robots.txt
# def is_allowed(url):
#     return rp.can_fetch('*', url)
# # Function to scrape a URL if allowed by robots.txt
# def scrape_url(url):
#     if is_allowed(url):
#         response = requests.get(url)
#         # Process the response
#         print(response.status_code)
#         print(response.text)
#     else:
#         print(f"Scraping blocked by robots.txt: {url}")




# # Parse the robots.txt file 
# rp = urllib.robotparser.RobotFileParser()
# rp.set_url(BASE_URL + '/robots.txt')
# rp.read()

# if not rp.mtime():
#    print("robots.txt could not be read or is not present.")


## Extracting DeepBit.net and DiceOnCrack.com wallet addresses

The two functions below form a small scraping pipeline: the first fetches the WalletExplorer main page, and the second uses that page's search form to look up a service by name and extract the wallet addresses listed for it.


In [None]:
def get_walletexplorer_page():
    """Fetch the WalletExplorer main page.

    Waits 5 seconds before issuing the request, then sends a GET to
    ``BASE_URL`` with ``HEADERS``.

    Returns:
        The ``requests.Response`` of the main page, or ``None`` when the
        request fails, times out or answers with an HTTP error status.
    """
    time.sleep(5)  # Pause for 5 seconds to avoid overwhelming the server
    print("Accessing WalletExplorer main page...")
    try:
        # The timeout keeps a stalled connection from blocking forever; a
        # timeout is reported through RequestException like any other failure.
        main_walletexplore_page = requests.get(BASE_URL, headers=HEADERS, timeout=10)
        main_walletexplore_page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error while accessing WalletExplorer:", e)
        return None
    print("WalletExplorer has been successfully accessed")
    print(main_walletexplore_page.status_code)
    return main_walletexplore_page

In [None]:
def get_wallet_address(html_page, service_name):
    """Scrape WalletExplorer for the wallet addresses listed for a service.

    The function (1) locates the search form (``<form class="main">``) in
    ``html_page``, (2) submits ``service_name`` as the ``wallet`` query
    parameter, (3) follows the link inside ``<span class="showother">`` on the
    result page and (4) reads the first table of that page, collecting the
    text of the link found in the first cell of each row.

    Args:
        html_page: Response of the WalletExplorer main page (its ``.text``
            attribute is parsed).
        service_name: Name of the service to search for.

    Returns:
        A list of address strings, or ``None`` when a request fails or an
        expected page element is missing.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from urllib.parse import urljoin

    # Search the form in the page
    soup = BeautifulSoup(html_page.text, 'html.parser')
    search_form = soup.find('form', {'class': 'main'})
    if search_form is None:
        print("Search form not found in the WalletExplorer main page.")
        return None

    # A form without an action submits to the page itself; urljoin also
    # resolves relative and absolute actions against BASE_URL.
    action_form = search_form.get('action') or ''
    target_url = urljoin(BASE_URL, action_form)

    # Open search page regarding 'service_name'
    try:
        time.sleep(5)  # Pause for 5 seconds to avoid overwhelming the server
        print(f"Accessing the search page for '{service_name}'...")
        search_page = requests.get(
            target_url, headers=HEADERS, params={'wallet': service_name}, timeout=10
        )
        search_page.raise_for_status()
        print(f'Search page for "{service_name}" has been successfully accessed')
    except requests.exceptions.RequestException as e:
        print("Error while accessing the search page:", e)
        return None

    # Find the url of the wallet addresses page
    soup = BeautifulSoup(search_page.text, 'html.parser')
    span = soup.find('span', {'class': 'showother'})
    wallet_anchor = span.find('a', href=True) if span is not None else None
    if wallet_anchor is None:
        print(f"Wallet addresses link not found for '{service_name}'.")
        return None
    wallets_url = urljoin(BASE_URL, wallet_anchor.get('href'))

    try:
        time.sleep(5)  # Pause for 5 seconds to avoid overwhelming the server
        wallet_addr_page = requests.get(wallets_url, headers=HEADERS, timeout=10)
        wallet_addr_page.raise_for_status()
        print(f"Wallet addresses page for '{service_name}' has been successfully accessed")
    except requests.exceptions.RequestException as e:
        print(f"Error while accessing the wallets page: {e}")
        return None

    # Scrape the wallet addresses page extracting the information from the table
    soup = BeautifulSoup(wallet_addr_page.text, 'html.parser')
    wallet_table = soup.find('table')
    if wallet_table is None:
        print(f"Wallet addresses table not found for '{service_name}'.")
        return None

    # NOTE(review): only the page returned by this single request is parsed;
    # if the site paginates the address list, later pages are not fetched.
    wallet_addresses = []
    for row in wallet_table.find_all('tr'):
        first_cell = row.find('td')
        if first_cell is None:
            continue  # rows without a <td> (e.g. header rows) carry no address
        link = first_cell.find('a', href=True)
        if link is not None:
            wallet_addresses.append(link.text.strip())
    return wallet_addresses

In [None]:
deepbit_service = "DeepBit.net"
diceoncrack_service = "DiceOnCrack.com"

# Start from None so the summary prints at the end work even when the main
# page cannot be fetched and no search is attempted.
deepbit_wallet_addresses = None
diceoncrack_wallet_addresses = None

# Open the main page of WalletExplorer
main_walletexplore_page = get_walletexplorer_page()
if main_walletexplore_page is None:
    print("Failed to retrieve the main WalletExplorer page.")
else:
    # Get the wallet addresses for DeepBit.net
    print(f"Searching for wallet addresses of {deepbit_service}...")
    deepbit_wallet_addresses = get_wallet_address(main_walletexplore_page, deepbit_service)
    if deepbit_wallet_addresses is None:
        print(f"Failed to retrieve wallet addresses for {deepbit_service}.")

    # Get the wallet addresses for DiceOnCrack.com
    print(f"Searching for wallet addresses of {diceoncrack_service}...")
    diceoncrack_wallet_addresses = get_wallet_address(main_walletexplore_page, diceoncrack_service)
    if diceoncrack_wallet_addresses is None:
        print(f"Failed to retrieve wallet addresses for {diceoncrack_service}.")

# Print the results (None means the corresponding lookup did not succeed)
print(f"DeepBit.net wallet addresses: {deepbit_wallet_addresses}")
print(f"DiceOnCrack.com wallet addresses: {diceoncrack_wallet_addresses}")


## Deepbit.net's mining pool analysis


## 1. Deepbit.net's mined block distribution


In [None]:

def map_wallet_addresses(wallet_addresses, mapping_df):
    """Map wallet address hashes to their ``addressId`` in the mapping dataset.

    Args:
        wallet_addresses: Iterable of address hashes (list, Series, ...).
        mapping_df: DataFrame with the columns ``hash`` and ``addressId``.

    Returns:
        A set with the ``addressId`` of every address found in ``mapping_df``.
        When a hash occurs more than once in the mapping, its first row is
        used. Addresses without a mapping are reported with a printed message.
    """
    wallet_addresses = list(wallet_addresses)

    # One vectorised pass over the mapping instead of a full scan per address.
    matches = mapping_df[mapping_df['hash'].isin(wallet_addresses)]
    matches = matches.drop_duplicates(subset='hash', keep='first')
    id_by_hash = dict(zip(matches['hash'].values, matches['addressId'].values))

    mapped_addresses = set()
    for address in wallet_addresses:
        if address in id_by_hash:
            mapped_addresses.add(id_by_hash[address])
        else:
            print(f"No mapping found for address: {address}")
    print(f"Mapped addresses : {mapped_addresses}")
    return mapped_addresses

In [None]:
#1) Identify the mapping between wallet addresses on the dataset
try:
    mapping = pd.read_csv('mapping.csv', engine='pyarrow', header=None)
except OSError:
    # read_csv raises on a missing/unreadable file instead of returning None,
    # so the failure has to be reported here before it propagates.
    print("Failed to retrieve the mapping dataset.")
    raise
mapping.columns = ['hash', 'addressId']

# Hard-coded DeepBit.net addresses; this assignment replaces whatever value
# the name held before this cell runs.
deepbit_wallet_addresses = ["1VayNert3x1KzbpzMGt2qdqrAThiRovi8","13NGmRF2SVRg3aKdGNVhXLmhA1JT9p87a8"]

deepbit_mapped_addresses = map_wallet_addresses(deepbit_wallet_addresses, mapping)


In [None]:
#2) Find the transaction patterns for the wallet addresses

# Load datasets
# NOTE(review): unlike mapping.csv, these files are read without header=None,
# so their first line is consumed as a header before the column names are
# overwritten — confirm the files really start with a header row.
transactions = pd.read_csv('transactions.csv', engine='pyarrow')
transactions.columns = ['timestamp', 'blockId', 'txId', 'isCoinbase', 'fee']

outputs = pd.read_csv('outputs.csv', engine='pyarrow')
outputs.columns = ['txId', 'position', 'addressId', 'amount', 'scripttype']

inputs = pd.read_csv('inputs.csv', engine='pyarrow')
inputs.columns = ['txId', 'prevTxId', 'prevTxpos']

# 1. Deepbit address ids come from deepbit_mapped_addresses (computed earlier)

# 2. Find all transactions that have AT LEAST ONE output to a Deepbit address
deepbit_txs = outputs.loc[outputs['addressId'].isin(deepbit_mapped_addresses), 'txId'].unique()

# 3. For these transactions, find INPUTS originating from Coinbase transactions
# 3a. Retrieve all inputs of Deepbit transactions
inputs_of_deepbit_txs = inputs[inputs['txId'].isin(deepbit_txs)]

# 3b. Keep only the inputs whose previous transaction is a Coinbase transaction
coinbase_txids = transactions[transactions['isCoinbase'] == 1]['txId']
inputs_deepbit = inputs_of_deepbit_txs[inputs_of_deepbit_txs['prevTxId'].isin(coinbase_txids)]

# 4. Verify that Deepbit transactions spend EXCLUSIVELY Coinbase outputs
# 4a. Count all inputs of each Deepbit transaction. reindex (rather than .loc)
#     gives a count of 0 to transactions that have no row in `inputs`, where
#     .loc would raise KeyError for the missing labels.
all_inputs_counts = (
    inputs_of_deepbit_txs.groupby('txId').size()
    .reindex(deepbit_txs, fill_value=0)
    .rename('total_inputs')
)

# 4b. Count Coinbase inputs (already filtered in step 3)
coinbase_inputs_counts = inputs_deepbit.groupby('txId').size().rename('coinbase_inputs')

# 4c. Select only transactions where all inputs are from Coinbase
valid_spend = all_inputs_counts.index[
    all_inputs_counts == coinbase_inputs_counts.reindex(all_inputs_counts.index, fill_value=0)
]

# 5. Build final result: one row per (coinbase tx, Deepbit tx spending it) pair
result = (
    inputs_deepbit[inputs_deepbit['txId'].isin(valid_spend)]
    .rename(columns={'prevTxId': 'coinbaseTx', 'txId': 'deepbitSpendTx'})
    [['coinbaseTx', 'deepbitSpendTx']]
    .drop_duplicates()
)

print("Number of identified Deepbit transactions:", len(result))
result.head(20)

In [None]:
# Prepare the data for the Deepbit.net block distribution:
# the transactions whose txId appears among the identified coinbase transactions.
deepbit_mined_blocks_df = transactions[transactions['txId'].isin(result['coinbaseTx'])].copy()

# Convert the second-resolution timestamps to datetimes and index the frame by
# them so that it can be resampled by calendar period.
deepbit_mined_blocks_df['timestamp'] = pd.to_datetime(deepbit_mined_blocks_df['timestamp'], unit='s')
deepbit_mined_blocks_df = deepbit_mined_blocks_df.set_index('timestamp')

# Row counts per day, per week and per month-end bucket
daily, weekly, monthly = (
    deepbit_mined_blocks_df.resample(rule).size() for rule in ('D', 'W', 'ME')
)

# Visualisation: one panel per aggregation level
fig, axs = plt.subplots(3, 1, figsize=(14, 12), sharex=False)
fig.suptitle('Distribuzione dei blocchi minati da Deepbit.net', fontsize=16)

panels = [
    (daily, 'Giornaliero', 'blue', 'Blocchi minati - Giornaliero'),
    (weekly, 'Settimanale', 'green', 'Blocchi minati - Settimanale'),
    (monthly, 'Mensile', 'orange', 'Blocchi minati - Mensile'),
]
for ax, (counts, series_label, line_color, panel_title) in zip(axs, panels):
    ax.plot(counts.index, counts.values, label=series_label, color=line_color)
    ax.set_title(panel_title)
    ax.set_ylabel('Blocchi')
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label
axs[2].set_xlabel('Data')

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


In [None]:
# Assumes `result` holds the Deepbit.net coinbase transactions found earlier and
# `transactions` has the columns ['timestamp', 'blockId', 'txId', 'isCoinbase', 'fee'].

# Prepare the data for the Deepbit.net fee distribution
# 1. Extract the Deepbit.net coinbase transactions
deepbit_coinbase_txs = transactions[transactions['txId'].isin(result['coinbaseTx'])].copy()
# Report the number of rows (the message announces a count, not the column list)
print(f"Number of Deepbit.net coinbase transactions: {len(deepbit_coinbase_txs)}")
# 2. Convert the timestamp to datetime
deepbit_coinbase_txs['datetime'] = pd.to_datetime(deepbit_coinbase_txs['timestamp'], unit='s')
print(deepbit_coinbase_txs.head(20))

# Coinbase transactions whose own `fee` value is positive
a = deepbit_coinbase_txs[ deepbit_coinbase_txs['fee'] > 0 ]

print(f"Number of Deepbit.net coinbase transactions with fee > 0: {len(a)}")

# NOTE(review): the totals below sum the `fee` column of the coinbase rows only.
# If the dataset records fees on the paying transactions of a block rather than
# on its coinbase transaction, the other transactions of each block would have
# to be included — confirm against the dataset description.
total_fee = deepbit_coinbase_txs['fee'].sum()
print(f"Total fee collected by Deepbit.net: {total_fee} satoshis")

fee_per_block = deepbit_coinbase_txs.groupby('blockId')['fee'].sum()

fee_per_block.head(20)


In [None]:
## TODO: still to be tested


def calculate_deepbit_utxo_monthly(transactions, inputs, outputs, deepbit_address_ids):
    """Compute and plot the month-by-month unspent outputs of Deepbit addresses.

    For every calendar month between the first and the last Deepbit-related
    event, an output counts as unspent when its transaction falls in or before
    that month and no input referencing it belongs to a transaction in or
    before that month.

    Args:
        transactions: DataFrame with at least ``txId`` and ``timestamp``
            (``timestamp`` is parsed with ``unit='s'``).
        inputs: DataFrame with ``txId``, ``prevTxId`` and ``prevTxpos``.
        outputs: DataFrame with ``txId``, ``position``, ``addressId`` and
            ``amount`` (``amount`` is divided by 100,000,000 to obtain BTC).
        deepbit_address_ids: Collection of address ids belonging to Deepbit.

    Returns:
        DataFrame with one row per month and the columns ``month``,
        ``utxo_btc`` and ``utxo_count``. It is empty (and nothing is plotted)
        when no output is paid to the given addresses.
    """
    satoshi_per_btc = 100000000
    utxo_columns = ['month', 'utxo_btc', 'utxo_count']
    tx_times = transactions[['txId', 'timestamp']]

    # Outputs paid to Deepbit addresses, stamped with their creation time
    deepbit_outputs = outputs[outputs['addressId'].isin(deepbit_address_ids)].merge(
        tx_times, on='txId'
    )
    deepbit_outputs['datetime'] = pd.to_datetime(deepbit_outputs['timestamp'], unit='s')
    if deepbit_outputs.empty:
        print("No outputs found for the given Deepbit addresses.")
        return pd.DataFrame(columns=utxo_columns)

    # Inputs that consume one of those outputs: an input's
    # (prevTxId, prevTxpos) points at the (txId, position) of the output it spends.
    output_keys = (
        deepbit_outputs[['txId', 'position']]
        .drop_duplicates()
        .rename(columns={'txId': 'prevTxId', 'position': 'prevTxpos'})
    )
    deepbit_inputs = inputs.merge(output_keys, on=['prevTxId', 'prevTxpos'], how='inner')
    deepbit_inputs = deepbit_inputs.merge(tx_times, on='txId')
    deepbit_inputs['datetime'] = pd.to_datetime(deepbit_inputs['timestamp'], unit='s')

    # Earliest spending time per output, attached to the output rows. Outputs
    # never referenced by an input keep NaT in `spent_at`.
    first_spend = (
        deepbit_inputs.groupby(['prevTxId', 'prevTxpos'], as_index=False)['datetime']
        .min()
        .rename(columns={'prevTxId': 'txId', 'prevTxpos': 'position', 'datetime': 'spent_at'})
    )
    deepbit_outputs = deepbit_outputs.merge(first_spend, on=['txId', 'position'], how='left')

    # Month starts covering every event; the range begins at the start of the
    # month containing the earliest event so that month is not skipped.
    events = pd.concat([deepbit_outputs['datetime'], deepbit_outputs['spent_at']])
    first_month = events.min().to_period('M').to_timestamp()
    all_dates = pd.date_range(start=first_month, end=events.max(), freq='MS')

    monthly_utxo = []
    for month_start in all_dates:
        # Everything strictly before the next month start belongs to this month
        next_month = month_start + pd.offsets.MonthBegin(1)
        created = deepbit_outputs['datetime'] < next_month
        spent = deepbit_outputs['spent_at'] < next_month  # NaT compares as False
        unspent_amounts = deepbit_outputs.loc[created & ~spent, 'amount']

        monthly_utxo.append({
            'month': month_start,
            'utxo_btc': unspent_amounts.sum() / satoshi_per_btc,  # Convert to BTC
            'utxo_count': len(unspent_amounts)
        })

    utxo_df = pd.DataFrame(monthly_utxo, columns=utxo_columns)

    # Visualization
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

    # UTXO value in BTC
    ax1.plot(utxo_df['month'], utxo_df['utxo_btc'], marker='o', linewidth=2, markersize=6)
    ax1.set_title('Deepbit.net UTXO Over Time (BTC Value)')
    ax1.set_xlabel('Month')
    ax1.set_ylabel('UTXO (BTC)')
    ax1.grid(True, alpha=0.3)
    ax1.tick_params(axis='x', rotation=45)

    # UTXO count
    ax2.plot(utxo_df['month'], utxo_df['utxo_count'], marker='s', linewidth=2, markersize=6, color='orange')
    ax2.set_title('Deepbit.net UTXO Count Over Time')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('UTXO Count')
    ax2.grid(True, alpha=0.3)
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("\n=== DEEPBIT UTXO STATISTICS ===")
    print(f"Maximum UTXO: {utxo_df['utxo_btc'].max():.6f} BTC")
    print(f"Minimum UTXO: {utxo_df['utxo_btc'].min():.6f} BTC")
    print(f"Average UTXO: {utxo_df['utxo_btc'].mean():.6f} BTC")
    print(f"Final UTXO: {utxo_df['utxo_btc'].iloc[-1]:.6f} BTC")

    return utxo_df

# Entry point that chains the Deepbit.net analyses
def run_deepbit_analysis(deepbit_wallet_addresses):
    """Load the CSV datasets and run the Deepbit.net analyses one after another.

    The address ids and the coinbase transactions are taken from the
    module-level names ``deepbit_mapped_addresses`` and ``result``.

    NOTE(review): ``deepbit_wallet_addresses`` is accepted but never read.
    NOTE(review): ``analyze_deepbit_block_distribution`` and
    ``calculate_deepbit_fees`` are not defined in the part of the file visible
    here — confirm they exist before calling this function.

    Args:
        deepbit_wallet_addresses: Currently unused.

    Returns:
        A dict with the keys ``coinbase_transactions`` (the ``result`` frame)
        and ``analyses`` (a dict of the individual analysis results).
    """

    def load_table(csv_path, column_names):
        # Read one dataset and give it the expected column names
        table = pd.read_csv(csv_path, engine='pyarrow')
        table.columns = column_names
        return table

    def announce(section_title):
        # Section banner: title followed by a 40-character rule
        print(section_title)
        print("=" * 40)

    print("=== STARTING DEEPBIT.NET ANALYSIS ===\n")

    # Datasets, loaded in the order transactions -> outputs -> inputs
    tx_table = load_table('transactions.csv', ['timestamp', 'blockId', 'txId', 'isCoinbase', 'fee'])
    output_table = load_table('outputs.csv', ['txId', 'position', 'addressId', 'amount', 'scripttype'])
    input_table = load_table('inputs.csv', ['txId', 'prevTxId', 'prevTxpos'])

    # Values computed by earlier cells
    address_ids = deepbit_mapped_addresses
    coinbase_df = result

    analyses = {}

    # The block distribution is only analysed when coinbase transactions exist
    if not coinbase_df.empty:
        announce("\n1. BLOCK DISTRIBUTION ANALYSIS")
        analyses['block_distribution'] = analyze_deepbit_block_distribution(coinbase_df)

    announce("\n2. FEE ANALYSIS")
    analyses['fee_analysis'] = calculate_deepbit_fees(coinbase_df, tx_table)

    announce("\n3. MONTHLY UTXO ANALYSIS")
    analyses['utxo_analysis'] = calculate_deepbit_utxo_monthly(tx_table, input_table, output_table, address_ids)

    print("\n=== DEEPBIT.NET ANALYSIS COMPLETED ===")

    return {
        'coinbase_transactions': coinbase_df,
        'analyses': analyses
    }

## DiceOnCrack.com gambling service analysis


In [None]:

# Predefined list of DiceOnCrack.com wallet addresses, usable as a fallback
# when the scraping process fails.
# NOTE(review): the `is None` guard below is commented out, so this assignment
# always runs and replaces any value the name held before this cell.
# if diceoncrack_wallet_addresses is None:
# The Series is named 'hash', the same name as the address column of `mapping`.
diceoncrack_wallet_addresses = pd.Series([
    "12TaAbLWBNKB1NLYH92CPnC1DizQoNK6FN",
    "1CRACkbiJSxfDaLNEoaNsHjNtU4KttwHyo",
    "1CRACKafkXsQzUYmu2fUM3j9c2y4yDhvfh",
    "1CRACKLiwFrZbAQz1yb9w22onHCMLbiMTY",
    "12tAabLFLxvUzC5KuX7VKMM8bYRncbQ84E",
    "1CrAcKt3HE8LNsx4KKDvjqLvcr373wg5ke",
    "1AVFypuG2jUrYzjZa69C7hK59XkWUwvK1m",
    "1CRACK25QvpVdcEmPZVD5ixtf99cMF9stg",
    "1CracksLRtQMcTF4HXNrvPzRgvz7Qr6wNd",
    "13TAabLHjNzwg8Mj7XYn76FuVAqj32s8EM",
    "1CrAckQppdcfiiw4XzpsKrZrf9eDvUok9C",
    "19TAABLQTLxgWHTdm7yNJNstgeQFgxTP4f",
    "14TAAbLiw2QLuRJCGQ3iETYg3vcpweZkTE",
    "15TaABLmhxiRQ9DTX6ZcZ9S9RknVZmP5jX",
    "1tAabLBcZLVL7md9nAnvGMCYdbvq4UVZV",
    "1PipEaL8yRS8n93mUS16wT5SNDiMrMutv5",
    "1PipemCUjxq9LKww7CaLWUMeGVZL3bD3VM",
    "1LQXotaEjfmerkwrGB3dHnheujo7sng6vA",
    "1PipeBMryPGnN3Ms3HfnNjetCS4THmkpkS",
    "1PipeZHgQXcjAYsUQ4WRXyKZn1X3sJNrpk",
    "1PipePezjvE7vBukPyDUkhHEF54qK1nkeu",
    "1Q44t4knYY3PsQZUFAejhd7Wot79ecHe8e",
    "1F4VXTQRzVQfLaGEWcf697xj1g2cKqPire",
    "1Pipeb5iNYmURifrxPZfvwHsTiw9rEb2iu",
    "1PipeZofhJv1hxsxCadEeG1vHAK87f23LE",
    "17ZmFwCULT44K25kWDeYbHiGaJCrWtytjx",
    "13encD1Yagh8M6a9Wgb3YJxKHrHqXnYi8y",
    "1GD2EiVa1rbbXcmFceyM47YN16fzVwn9j"
],name='hash')


# Translate the address hashes into the dataset's addressId values
diceoncrack_mapped_addresses = map_wallet_addresses(diceoncrack_wallet_addresses, mapping)



### 1) Find the DiceOnCrack.com transactions, i.e. the transactions that have at least one input or output address belonging to DiceOnCrack.com


In [None]:
def find_diceoncrack_transactions(diceoncrack_mapped_addresses, transactions, inputs, outputs):
    """Find every transaction that involves a DiceOnCrack.com address.

    A transaction is selected when at least one of its outputs pays a
    DiceOnCrack address, or at least one of its inputs spends an output that
    was paid to a DiceOnCrack address.

    Args:
        diceoncrack_mapped_addresses: Collection of DiceOnCrack address ids.
        transactions: DataFrame with at least ``txId`` and ``blockId``.
        inputs: DataFrame with ``txId``, ``prevTxId`` and ``prevTxpos``.
        outputs: DataFrame with ``txId``, ``position`` and ``addressId``.

    Returns:
        The matching rows of ``transactions`` (duplicates removed), sorted by
        ``blockId`` in ascending order.
    """
    # Outputs paid to a DiceOnCrack address
    diceoncrack_output_transactions = outputs[outputs['addressId'].isin(diceoncrack_mapped_addresses)]

    # Inputs spending those outputs: an input's (prevTxId, prevTxpos) refers to
    # the (txId, position) of the output it consumes. Joining against the
    # already filtered outputs yields the same rows as joining against the
    # whole outputs table and filtering afterwards, without the large merge.
    spent_output_keys = diceoncrack_output_transactions[['txId', 'position']].rename(
        columns={'txId': 'prevTxId', 'position': 'prevTxpos'}
    )
    diceoncrack_inputs_transactions = inputs.merge(spent_output_keys, on=['prevTxId', 'prevTxpos'])

    # Ids of the transactions with at least one DiceOnCrack output or input
    df_union = pd.concat(
        [diceoncrack_output_transactions['txId'], diceoncrack_inputs_transactions['txId']],
        axis=0,
        ignore_index=True,
    ).drop_duplicates()

    diceoncrack_transactions = transactions[transactions['txId'].isin(df_union)].drop_duplicates()

    # Order by blockId in ascending order
    diceoncrack_transactions = diceoncrack_transactions.sort_values(by='blockId', ascending=True)
    return diceoncrack_transactions

# Collect every transaction touching a DiceOnCrack.com address and report how
# many were found.
diceoncrack_transactions = find_diceoncrack_transactions(
    diceoncrack_mapped_addresses,
    transactions,
    inputs,
    outputs,
)
print(f"Number of transactions related to DiceOnCrack.com: {len(diceoncrack_transactions)}")


### 2) Group transactions by block height


In [None]:
def group_diceoncrack_transactions_by_block(diceoncrack_transactions):
    """Group the given transactions on their ``blockId`` column.

    Args:
        diceoncrack_transactions: DataFrame with a ``blockId`` column.

    Returns:
        The pandas GroupBy object, with one group per distinct ``blockId``.
    """
    return diceoncrack_transactions.groupby(by='blockId')


In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# Global cache: address -> wallet identifier (None when the page names no wallet)
address_to_wallet_cache = {}
REQUEST_INTERVAL = 6  # seconds to wait before each request


def _wallet_id_from_heading(heading_text):
    """Return the wallet identifier contained in a page heading, or None.

    The stripped heading is split on single spaces and its third token is
    taken; a surrounding pair of square brackets is removed.
    """
    tokens = [token.strip() for token in heading_text.strip().split(' ')]
    # Index 2 is read below, so at least three tokens are required
    if len(tokens) < 3 or not tokens[2]:
        return None
    wallet_id = tokens[2]
    if wallet_id.startswith('[') and wallet_id.endswith(']'):
        wallet_id = wallet_id[1:-1]
    return wallet_id or None


def get_wallet_hash(address):
    """Look up on WalletExplorer the wallet an address belongs to.

    Results are memoised in ``address_to_wallet_cache``: a cached address is
    answered immediately, without sleeping or issuing a request. Otherwise the
    function waits ``REQUEST_INTERVAL`` seconds, requests the search page for
    the address and reads the wallet identifier from the ``<h2>`` inside
    ``<div id="main">``.

    Args:
        address: Address hash to look up.

    Returns:
        The wallet identifier as a string, or ``None`` when the page contains
        no usable heading or the request fails. Request failures are not
        cached, so a later call retries them.
    """
    if address in address_to_wallet_cache:
        return address_to_wallet_cache[address]

    url = f"https://www.walletexplorer.com/?q={address}"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    time.sleep(REQUEST_INTERVAL)  # wait between requests to avoid being blocked
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Errore durante lo scraping per {address}: {str(e)}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    main_div = soup.find('div', id='main')
    h2_tag = main_div.find('h2') if main_div is not None else None

    wallet_hash = None
    if h2_tag is not None:
        wallet_hash = _wallet_id_from_heading(h2_tag.get_text(separator=' '))
    if wallet_hash is not None:
        print(f"Wallet hash trovato per {address}: {wallet_hash}")

    # Cache both hits and "no wallet on the page" answers
    address_to_wallet_cache[address] = wallet_hash
    return wallet_hash

def cluster_by_wallet(group_info, diceoncrack_wallet_ids):
    """Assign each transaction of a group to a single WalletExplorer wallet.

    For every ``txId`` in ``group_info`` the function collects a set of
    addresses, drops those belonging to DiceOnCrack, resolves the remaining
    ones with ``get_wallet_hash`` and keeps the transaction only when all of
    them resolve to the same wallet. It reads the module-level ``outputs``,
    ``inputs`` and ``mapping`` frames.

    NOTE(review): the addresses collected for a transaction are those of its
    own outputs that are referenced by some input (``tmp`` is filtered on
    ``prevTxId == tx_id``), not the addresses funding the transaction —
    confirm this matches the intended "input wallet" clustering.

    Args:
        group_info: DataFrame of transactions (must contain ``txId``).
        diceoncrack_wallet_ids: Collection of DiceOnCrack address ids to exclude.

    Returns:
        DataFrame with the columns ``txId`` and ``wallet``, one row per
        transaction whose addresses all belong to one wallet.
    """
    cluster_info = []

    # Outputs of the group's transactions, keyed the way inputs reference them
    ranged_renamed_outputs = outputs.merge(group_info, on='txId').rename(
        columns={'txId': 'prevTxId', 'position': 'prevTxpos'}
    )
    # Inputs that reference one of those outputs
    tmp = inputs.merge(ranged_renamed_outputs, on=['prevTxId', 'prevTxpos'])

    for tx_id, _ in group_info.groupby('txId'):
        inputs_mapped_address = tmp.loc[tmp['prevTxId'] == tx_id, 'addressId']

        # Ignore the addresses that belong to DiceOnCrack
        inputs_mapped_address = inputs_mapped_address[~inputs_mapped_address.isin(diceoncrack_wallet_ids)]
        if inputs_mapped_address.empty:
            print(f"Skipping txId {tx_id} due to no valid addresses.")
            continue

        # Translate address ids back to hashes; each distinct hash is looked
        # up only once because every lookup may cost a rate-limited request.
        input_address = mapping.merge(inputs_mapped_address, on='addressId')['hash']
        unique_addresses = input_address.drop_duplicates().tolist()
        if not unique_addresses:
            print(f"Skipping txId {tx_id} due to no valid addresses.")
            continue

        # The wallet of the first address is the reference for the cluster
        cluster_wallet = get_wallet_hash(unique_addresses[0])
        if cluster_wallet is None:
            # Without a reference wallet the cluster cannot be validated
            continue

        # any() stops at the first address resolving to a different wallet
        if any(get_wallet_hash(addr) != cluster_wallet for addr in unique_addresses[1:]):
            print(f"Cluster skipped for txId {tx_id} due to different wallet hashes.")
            continue

        cluster_info.append({'txId': tx_id, 'wallet': cluster_wallet})

    # Convert the list of dictionaries to a DataFrame
    cluster_wallets = pd.DataFrame(cluster_info, columns=['txId', 'wallet'])
    print(f"Number of clusters found: {len(cluster_wallets)}")
    print(cluster_wallets.head(20))
    return cluster_wallets


In [38]:
# Set the period: example starting from 26/12/2012

diceoncrack_transactions['timestamp'] = pd.to_datetime(diceoncrack_transactions['timestamp'], unit='s')

starting_period = pd.to_datetime('2012-12-26')

diceoncrack_transactions = diceoncrack_transactions[diceoncrack_transactions['timestamp'] >= starting_period]

print(f"Number of transactions after filtering by date: {len(diceoncrack_transactions)}")

groups = group_diceoncrack_transactions_by_block(diceoncrack_transactions)
print(f"Number of blocks with DiceOnCrack.com transactions: {len(groups)}")

# Cluster the transactions of each block and tag every cluster row with the
# block it came from. The per-block frames are collected in a list and
# concatenated once at the end instead of growing a DataFrame inside the loop.
cluster_columns = ['blockId', 'txId', 'wallet']
cluster_frames = []

for block_id, group in groups:
    clusters = cluster_by_wallet(
        group_info=group,
        diceoncrack_wallet_ids=diceoncrack_mapped_addresses
    )
    if not clusters.empty:
        cluster_frames.append(clusters.assign(blockId=block_id))

if cluster_frames:
    all_clusters = pd.concat(cluster_frames, ignore_index=True)[cluster_columns]
else:
    all_clusters = pd.DataFrame(columns=cluster_columns)

print(all_clusters.head(20))

Number of transactions after filtering by date: 6
Number of blocks with DiceOnCrack.com transactions: 3


KeyboardInterrupt: 