In [27]:
import pandas as pd
import re
import hashlib
from hashlib import sha256

In [2]:
# Load processed dataset
df = pd.read_csv("../Data_Preprocessing/processed_dataset.csv")

In [3]:
# Address length
df['address_length'] = df['address'].apply(len)

In [4]:
# Address prefix
df['prefix'] = df['address'].apply(lambda x: x[:2])

In [5]:
# Hexadecimal character count (common in Ethereum and Binance Coin addresses)
df['hex_char_count'] = df['address'].apply(lambda x: sum(c in '0123456789abcdefABCDEF' for c in x))

In [8]:
# Numeric character count
df['numeric_char_count'] = df['address'].apply(lambda x: sum(c.isdigit() for c in x))

In [9]:
df.head()

Unnamed: 0,chain,address,address_length,prefix,hex_char_count,numeric_char_count
0,ethereum,0xe26ebb18144cd2d8dcb14ce87fdcfbeb81bacad4,42,0x,41,17
1,bitcoin,19e6aqs6ru2ei5r3cuzcfmcklq78uksmry,34,19,16,9
2,bitcoin,1a8zvt1dumw3ttotpnsrxxlhy4rm7lic4y,34,1a,10,7
3,bitcoin,1p9tsktsfuulgbhx7aff29s7dtnvanoksz,34,1p,13,6
4,bitcoin,1ntacq8uef5p2rbnrkhsksw1f1pdxez3se,34,1n,16,7


In [29]:
digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def decode_base58(bc, length):
    n = 0
    for char in bc:
        n = n * 58 + digits58.index(char)
    return n.to_bytes(length, 'big')
def check_bitcoin(bc):
    try:
        bcbytes = decode_base58(bc, 25)
        return bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
    except Exception:
        return False

In [49]:
print(check_bitcoin("1MQdUquQ77DJdKfxLPYSN6vcgph8LkBLZS"))

True


In [30]:
# Bitcoin checksum validation
df['is_valid_bitcoin_checksum'] = df['address'].apply(lambda x: check_bitcoin(x))

In [45]:
df.head()

Unnamed: 0,chain,address,address_length,prefix,hex_char_count,numeric_char_count,is_valid_ethereum_checksum,is_valid_bitcoin_checksum
0,ethereum,0xe26ebb18144cd2d8dcb14ce87fdcfbeb81bacad4,42,0x,41,17,False,False
1,bitcoin,19e6aqs6ru2ei5r3cuzcfmcklq78uksmry,34,19,16,9,False,False
2,bitcoin,1a8zvt1dumw3ttotpnsrxxlhy4rm7lic4y,34,1a,10,7,False,False
3,bitcoin,1p9tsktsfuulgbhx7aff29s7dtnvanoksz,34,1p,13,6,False,False
4,bitcoin,1ntacq8uef5p2rbnrkhsksw1f1pdxez3se,34,1n,16,7,False,False


In [47]:
df_bitcoin_valid = df.loc[(df['chain'] == 'bitcoin') & (df['is_valid_bitcoin_checksum'] == True)]

In [48]:
df_bitcoin_valid

Unnamed: 0,chain,address,address_length,prefix,hex_char_count,numeric_char_count,is_valid_ethereum_checksum,is_valid_bitcoin_checksum
8823146,bitcoin,13h3zhfknqfnvt3g35hpuvs584xjdhr9u9,34,13,14,11,False,True


In [41]:
df[df["chain"] == "bitcoin" lambda x: x == True for x in ]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [11]:
def is_valid_ethereum_checksum(address):
    if address[:2] != '0x' or len(address) != 42:
        return False
    address = address[2:]
    address_hash = hashlib.sha3_256(address.lower().encode('utf-8')).hexdigest()
    for i in range(40):
        if (address[i].isdigit() or address[i].islower()) and int(address_hash[i], 16) >= 8:
            return False
        if address[i].isupper() and int(address_hash[i], 16) < 8:
            return False
    return True

In [12]:
# Ethereum checksum validation
df['is_valid_ethereum_checksum'] = df['address'].apply(lambda x: is_valid_ethereum_checksum(x) if x.startswith('0x') else False)

In [16]:
# Bitcoin checksum validation
df['is_valid_bitcoin_checksum'] = df['address'].apply(lambda x: is_valid_bitcoin_checksum(x) if x[0] in '13' or x.startswith('bc1') else False)

In [43]:
df.head()

Unnamed: 0,chain,address,address_length,prefix,hex_char_count,numeric_char_count,is_valid_ethereum_checksum,is_valid_bitcoin_checksum
0,ethereum,0xe26ebb18144cd2d8dcb14ce87fdcfbeb81bacad4,42,0x,41,17,False,False
1,bitcoin,19e6aqs6ru2ei5r3cuzcfmcklq78uksmry,34,19,16,9,False,False
2,bitcoin,1a8zvt1dumw3ttotpnsrxxlhy4rm7lic4y,34,1a,10,7,False,False
3,bitcoin,1p9tsktsfuulgbhx7aff29s7dtnvanoksz,34,1p,13,6,False,False
4,bitcoin,1ntacq8uef5p2rbnrkhsksw1f1pdxez3se,34,1n,16,7,False,False
