In [1]:
import logging
# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Specify the log message format
    datefmt='%Y-%m-%d %H:%M:%S',  # Specify the date format
    handlers=[
        #logging.FileHandler('app.log'),  # Log messages to a file
        logging.StreamHandler()  # Also output log messages to the console
    ]
)
logger = logging.getLogger(__name__)
import pickle 
import networkx as nx
import time
from datetime import datetime, timezone
import os
import sys
import pandas as pd

In [2]:
"""The data stored in pickle format with version: 0.7.5 (python 3.7).
The type of graph object：networkx.classes.multidigraph.MultiDiGraph
Numbers of nodes: 2973489
Numbers of edges: 13551303
Average degree:   4.5574
Nodes' features：
    // The label. 1 means fishing mark node, otherwise 0.
    G.nodes[nodeName]['isp']；

Edges' features:
    G[node1][node2][0]['amount']        // The amount mount of the transaction.
    G[node1][node2][0]['timestamp']     // The timestamp of the transaction.				
							
* Notes * 
"""



def load_pickle(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)
logger.info("Loading the graph")
start = time.time()
G = load_pickle('/mnt/data/ethereum-phishing-transaction-network/MulDiGraph.pkl')
logger.info(f"Graph loaded in {time.time()-start} seconds.")
logger.info(f'Number of nodes: {G.number_of_nodes()}')
logger.info(f'Number of edges: {G.number_of_edges()}')


# save nodes and their labels to a csv
logger.info("Saving nodes and their labels to a csv")
start = time.time()
nodes = []
labels = []
for node in G.nodes():
    nodes.append(node)
    labels.append(G.nodes[node]['isp'])
dfnodes = pd.DataFrame({'node': nodes, 'label': labels})

2024-08-27 11:19:51 - INFO - Loading the graph
2024-08-27 11:20:12 - INFO - Graph loaded in 20.831660270690918 seconds.
2024-08-27 11:20:12 - INFO - Number of nodes: 2973489
2024-08-27 11:20:16 - INFO - Number of edges: 13551303
2024-08-27 11:20:16 - INFO - Saving nodes and their labels to a csv


In [39]:
logger.info("Processing the graph")
uniq = set()
start = time.time()
min_time = 1e20
max_time = 0
from_address = []
to_address = []
time_stamps = []
amounts = []
max_deg = 0
max_deg_address = ''
num_larger_10K = 0

for node in G.nodes():
    edges = G.edges(node, data=True)
    if len(edges) > max_deg:
        max_deg = len(edges)
        max_deg_address = node
    if len(edges) > 10000:
        num_larger_10K += 1
    for edge in edges:
        u = node
        v = edge[1]
        tim = edge[2]['timestamp']
        amo = edge[2]['amount']
        uniq.add((u, v, amo, tim))
        min_time = min(min_time, tim)
        max_time = max(max_time, tim)

print(f"Max degree: {max_deg}")
print(f"Address with max degree: {max_deg_address}")
print(f"Number of addresses with degree larger than 10K: {num_larger_10K}")
min_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(min_time))
max_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(max_time))
# print(min_time, max_time)
# print(len(uniq))
logger.info(f"Graph processed in {time.time()-start} seconds.")
logger.info(f'Number of unique transactions: {len(uniq)}')
#print an element from uniq
logger.info(f"An element from uniq: {list(uniq)[0]}")
logger.info(f'Minimum timestamp: {min_time}')
logger.info(f'Maximum timestamp: {max_time}')

2024-08-27 12:44:52 - INFO - Processing the graph
2024-08-27 12:45:24 - INFO - Graph processed in 28.486971378326416 seconds.
2024-08-27 12:45:24 - INFO - Number of unique transactions: 12060024
2024-08-27 12:45:25 - INFO - An element from uniq: ('0x0e9ea9519226094daa42cd4aac2678941b5ccd99', '0x491c9a23db85623eed455a8efdd6aba9b911c5df', 0.0, 1526094538.0)


Max degree: 102480
Address with max degree: 0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98
Number of addresses with degree larger than 10K: 70


2024-08-27 12:45:25 - INFO - Minimum timestamp: 2015-08-07 07:01:09
2024-08-27 12:45:25 - INFO - Maximum timestamp: 2019-01-19 10:24:22


In [16]:
logger.info(f"An element from uniq: {list(uniq)[:10]}")

2024-08-27 11:36:41 - INFO - An element from uniq: [('0x0e9ea9519226094daa42cd4aac2678941b5ccd99', '0x491c9a23db85623eed455a8efdd6aba9b911c5df', 0.0, 1526094538.0), ('0x91a921281bd4f5578988b9b80e38a881cc945f1e', '0x19fffd124cd9089e21026d10da97f8cd6b442bff', 0.0, 1532772862.0), ('0xc60abc229fb94569d483c9248e3b8be016c8432a', '0x049399a6b048d52971f7d122ae21a1532722285f', 0.0, 1524594093.0), ('0x2649139637cce189653ec1e95486b797efdfec50', '0xd8d48e52f39ab2d169c8b562c53589e6c71ac4d3', 0.0, 1529129293.0), ('0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98', '0xf487d016b2ea5bebdeea03fa12adcab51237bc25', 1.0, 1522175188.0), ('0x3926f43ad0fbda08175e9a314d3e972e4fe34416', '0x667118a44f8e4f3e144fe70bdfe6f54851781c98', 0.001, 1529405922.0), ('0xba111af1e365510b71c6460b3b833447f73e759d', '0xea38eaa3c86c8f9b751533ba2e562deb9acded40', 0.0, 1509399147.0), ('0x1e150ffc8953b0cf3b0d883b7ad0e41710c85087', '0xe21bc8d4f47dbd9e4dd29229333d1999c33d0e87', 2.999412, 1520424786.0), ('0x31da1575cb933d776b22295756369b103

In [17]:
import os
transactions_path = '/mnt/data/ethereum-phishing-transaction-network/transactions-c.csv'
if os.path.exists(transactions_path):
    columns_to_read = ['from_address', 'to_address', 'value', 'block_timestamp']
    dtype_dict = {
        'from_address': 'str',
        'to_address': 'str',
        'value': 'str',
        'block_timestamp': 'str'  # assuming block_timestamp is initially read as string
    }
    df = pd.read_csv(transactions_path, usecols=columns_to_read, dtype=dtype_dict) 
    print(f"Read {len(df)} transactions from the csv")
    #df['value'] = df['value'].astype(float)
    df['block_timestamp'] = pd.to_datetime(df['block_timestamp']).view('int64') / 10**9
    df['value'] = df['value'].astype(float) / 1e18
    # print a row of the dataframe
    print(f"A row from the dataframe: {df.iloc[0]}")
    exists = set(zip(df['from_address'], df['to_address'], df['value'], df['block_timestamp']))
    #exists = set(zip(df['from_address'], df['to_address'], df['block_timestamp']))
    diff = uniq - exists

Read 5256894 transactions from the csv


  df['block_timestamp'] = pd.to_datetime(df['block_timestamp']).view('int64') / 10**9


A row from the dataframe: from_address       0xc3d9c17d7f6988c0fe7ebe929c47efccbd92be13
to_address         0x0d64b06929f797d641064521705714498618fe1e
value                                                    5.55
block_timestamp                                  1494731739.0
Name: 0, dtype: object


In [18]:
# compare the transactions in the graph with the transactions in the csv
print("Number of transactions in the csv: ", len(exists))
print("Number of transactions in the graph but not in the csv: ", len(diff))
print("Number of transactions in the csv but not in the graph: ", len(exists - uniq))

Number of transactions in the csv:  5245263
Number of transactions in the graph but not in the csv:  6876384
Number of transactions in the csv but not in the graph:  61623


In [None]:
# compare the nodes in the graph with the from_address and to_address in the csv
nodes = set(G.nodes())
from_addresses = set(df['from_address'])
to_addresses = set(df['to_address'])
csv_nodes = from_addresses.union(to_addresses)
print(f"Number of nodes in the graph: {len(nodes)}")
print(f"Number of nodes in the csv: {len(csv_nodes)}")
print(f"Number of nodes in the graph but not in the csv: {len(nodes - csv_nodes)}")
print(f"Number of nodes in the csv but not in the graph: {len(csv_nodes - nodes)}")
# what is the percentage of labels of the nodes in the graph but not in the csv
labels = [G.nodes[node]['isp'] for node in (nodes - csv_nodes)]
print(f"Percentage of labels in the graph but not in the csv: {sum(labels)/len(labels)}")
print(f"Ilicit missing nodes {sum(labels)}")


In [None]:
transactions_path = '/mnt/data/ethereum-phishing-transaction-network/transactions.csv'
names = [
    'nonce',
    'from_address',
    'to_address',
    'transaction_index',
    'value',
    'gas',
    'gas_price',
    #'receipt_status',
    'block_timestamp',
    # 'phishing',
]
dtypes = {
    'nonce': 'float64',
    'from_address': 'str',
    'to_address': 'str',
    'transaction_index': 'category',
    'value': 'float64',
    'gas': 'float64',
    'gas_price': 'float64',
    #'receipt_status': 'category',
}
import numpy as np
# Define converter function
def to_float_or_nan(value):
    try:
        return float(value)
    except ValueError:
        return np.nan
df = pd.read_csv(transactions_path, usecols=names, dtype=dtypes, parse_dates=['block_timestamp'], converters={'receipt_gas_used': to_float_or_nan})
# print statistics for each column of the dataframe
for col in df.columns:
    print(f"Statistics for column {col}")
    print(df[col].describe())
    # print the number of nans in the column
    print(f"Number of nans in column {col}: {df[col].isna().sum()}")
    print()

In [None]:
df['gas_price'].hist(bins=100)

In [None]:
np.log1p(df['gas_price']).hist(bins=100)

In [None]:
import numpy as np
# renumber from_address and to_address starting from 0 and update the dataframe
unique_ids = pd.concat([df['from_address'], df['to_address']]).unique()
# create a mapping from the original ID to a new ID
id_map = {old_id: new_id for new_id, old_id in enumerate(unique_ids)}
# replace the original IDs with the new IDs
df['from_address'] = df['from_address'].apply(lambda x: id_map[x])
df['to_address'] = df['to_address'].apply(lambda x: id_map[x])
# convert timestamp to UNIX
df['block_timestamp'] = pd.to_datetime(df['block_timestamp']).astype('int64') / 10**9
#remove nodes that are not in the transactions
dfnodes = dfnodes[dfnodes['node'].isin(unique_ids)]
dfnodes['node'] = dfnodes['node'].apply(lambda x: id_map[x])

# normalize numerical columns
for col in df.columns:
    print(col)
    if col in ['from_address', 'to_address', 'phishing', 'receipt_status', 'block_timestamp', 'transaction_index']:
        continue
    df[col] = np.log1p(df[col])
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

In [None]:
print(unique_ids[:10])

In [None]:
print(dfnodes.head())
len(dfnodes)

In [None]:
# save the dataframe to a csv file
# if not os.path.exists('/mnt/data/ethereum-phishing-transaction-network/ethereum-phishing-transaction-network.csv'):
dfnodes.to_csv('/mnt/data/ethereum-phishing-transaction-network/nodes.csv', index=False)
df.to_csv('/mnt/data/ethereum-phishing-transaction-network/ethereum-phishing-transaction-network.csv', index=False)

In [None]:
names = [
    'nonce',
    'from_address',
    'to_address',
    #'transaction_index',
    'value',
    'gas',
    'gas_price',
    #'receipt_status',
    'block_timestamp',
    # 'phishing',
]
dtypes = {
    'nonce': 'float64',
    'from_address': 'str',
    'to_address': 'str',
    'transaction_index': 'category',
    'value': 'float64',
    'gas': 'float64',
    'gas_price': 'float64',
    #'receipt_status': 'category',
}
df = pd.read_csv('/mnt/data/ethereum-phishing-transaction-network/ethereum-phishing-transaction-network.csv', usecols=names, dtype=dtypes,  parse_dates=['block_timestamp'])
# print statistics for each column of the dataframe
for col in df.columns:
    print(f"Statistics for column {col}")
    print(df[col].describe())
    # print the number of nans in the column
    print(f"Number of nans in column {col}: {df[col].isna().sum()}")
    print()

In [None]:
%load_ext autoreload
%autoreload 2
# add parent directory to the path
import sys
sys.path.append('../')
from src.datasets import EthereumPhishingTransactions, EthereumPhishingNodes
from src.datasets.util.mask import PretrainType
import pandas as pd

In [None]:
dataset = EthereumPhishingTransactions(
    root='/mnt/data/ethereum-phishing-transaction-network/ethereum-phishing-transaction-network-c.csv', 
    pretrain={PretrainType.MASK, PretrainType.LINK_PRED},
    split_type='temporal',
    
    splits=[0.6, 0.2, 0.2], 
    khop_neighbors=[100, 100]
)
start = time.time()
dataset.materialize()
logger.info(f"Dataset materialized in {time.time()-start} seconds.")

In [None]:
nodes = EthereumPhishingNodes(root='/mnt/data/ethereum-phishing-transaction-network/nodes.csv')
nodes.materialize()

In [None]:
from torch_frame.data import DataLoader
train_dataset, val_dataset, test_dataset = nodes.split()
tensor_frame = nodes.tensor_frame 
train_loader = DataLoader(train_dataset.tensor_frame, batch_size=2048, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset.tensor_frame, batch_size=2048, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset.tensor_frame, batch_size=2048, shuffle=False, num_workers=4)

In [None]:
batch = next(iter(train_loader))
print(batch)
print(batch.y)
print(sum(batch.y))

In [None]:
df = pd.read_csv('/mnt/data/ethereum-phishing-transaction-network/dummy-c.csv')
m = max(df['from_address'].max(), df['to_address'].max())
dummynodes = pd.DataFrame({'node': range(m+1), 'label': 0})
dummynodes.to_csv('/mnt/data/ethereum-phishing-transaction-network/dummy-nodes.csv', index=False)

In [None]:
edges = pd.read_csv('/mnt/data/ethereum-phishing-transaction-network/ethereum-phishing-transaction-network.csv')
nodes= pd.read_csv('/mnt/data/ethereum-phishing-transaction-network/nodes.csv')

In [None]:
edges = pd.read_csv('/mnt/data/ethereum-phishing-transaction-network/dummy.csv')
nodes= pd.read_csv('/mnt/data/ethereum-phishing-transaction-network/dummy-nodes.csv')

In [None]:
import pandas as pd

# Assuming nodes and edges are pandas DataFrames

# Get the first transaction timestamp for each from_address
from_min_timestamp = edges.groupby('from_address')['block_timestamp'].min().reset_index()
from_min_timestamp.columns = ['node', 'first_transaction']

# Get the first transaction timestamp for each to_address
to_min_timestamp = edges.groupby('to_address')['block_timestamp'].min().reset_index()
to_min_timestamp.columns = ['node', 'first_transaction']

# Concatenate both results and take the minimum timestamp for each node
combined = pd.concat([from_min_timestamp, to_min_timestamp])

# Group by node and take the earliest transaction
first_transactions = combined.groupby('node')['first_transaction'].min().reset_index()

# Merge with the nodes DataFrame
nodes = nodes.merge(first_transactions, on='node', how='left')


nodes.head()

In [None]:
# Save the result
nodes.to_csv('/mnt/data/ethereum-phishing-transaction-network/dummy-nodes.csv', index=False)

In [32]:
print(max_deg_address)

0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98


In [44]:
import requests

# Define the API endpoint and parameters
url = "https://api.etherscan.io/api"
params = {
    "module": "account",
    "action": "txlist",
    "address": max_deg_address,
    "startblock": 0,
    "endblock": 99999999,
    "page": 2,
    # "offset": 10,
    "sort": "asc",
    "apikey": "GKC6BUADY2UHP99QKRG4CSPZR3B13WVG3W"  # Replace with your actual API key
}

# Send the GET request to the Etherscan API
response = requests.get(url, params=params)
print(response)

# Check the status of the response and print the result
if response.status_code == 200:
    data = response.json()  # Parse the JSON response
    print(data.keys())
    print(data['status'])
    print(data['message'])
    df = pd.DataFrame(data['result'])
else:
    print(f"Error: {response.status_code}")



2024-08-27 12:53:37 - DEBUG - Starting new HTTPS connection (1): api.etherscan.io:443
2024-08-27 12:53:39 - DEBUG - https://api.etherscan.io:443 "GET /api?module=account&action=txlist&address=0xfbb1b73c4f0bda4f67dca266ce6ef42f520fbb98&startblock=0&endblock=99999999&page=2&sort=asc&apikey=GKC6BUADY2UHP99QKRG4CSPZR3B13WVG3W HTTP/1.1" 200 None


<Response [200]>
dict_keys(['status', 'message', 'result'])
1
OK


In [41]:
# data.result to a pandas dataframe
print(df.head())
print(len(df))

  blockNumber                                          blockHash   timeStamp  \
0       61558  0x6a734fa758d46b9916ef26d32b0bd8279d139133490e...  1439174352   
1       61696  0x48788b654e284173a516fc814d8a56e20807db5ecf54...  1439176611   
2       61835  0x50b1f309ba5efa185f6ceec265937f467328afd1cf99...  1439178989   
3       64638  0x0969c970f65cc7b01f8b2d060cbaddf048d7d2409d67...  1439223994   
4       65407  0x0b14125ff9fdb36bef8a566c1a3cf6aa1356a366b2f4...  1439236394   

                                                hash nonce transactionIndex  \
0  0x1929132f5b00e59fb817cf37defb14bba19bb955a4fc...   141                0   
1  0xa132135c879b733dec65d32c19db87a4762e5d281898...   142                1   
2  0x56a07d54516a02a66b2a5e95f3cf5985815062dbb12a...     0                0   
3  0xe5f3f47f1a8e607294f6ba52fb4ba6f89126171f49d4...   181                1   
4  0x3dc804035ebfcfde8ac771b4c58f38bb4865fc8a03b2...     1                0   

                                         fro

In [45]:
# data.result to a pandas dataframe
print(df.head())
print(len(df))

  blockNumber                                          blockHash   timeStamp  \
0       61558  0x6a734fa758d46b9916ef26d32b0bd8279d139133490e...  1439174352   
1       61696  0x48788b654e284173a516fc814d8a56e20807db5ecf54...  1439176611   
2       61835  0x50b1f309ba5efa185f6ceec265937f467328afd1cf99...  1439178989   
3       64638  0x0969c970f65cc7b01f8b2d060cbaddf048d7d2409d67...  1439223994   
4       65407  0x0b14125ff9fdb36bef8a566c1a3cf6aa1356a366b2f4...  1439236394   

                                                hash nonce transactionIndex  \
0  0x1929132f5b00e59fb817cf37defb14bba19bb955a4fc...   141                0   
1  0xa132135c879b733dec65d32c19db87a4762e5d281898...   142                1   
2  0x56a07d54516a02a66b2a5e95f3cf5985815062dbb12a...     0                0   
3  0xe5f3f47f1a8e607294f6ba52fb4ba6f89126171f49d4...   181                1   
4  0x3dc804035ebfcfde8ac771b4c58f38bb4865fc8a03b2...     1                0   

                                         fro

In [28]:

# File path
file_path = '/mnt/data/ethereum-phishing-transaction-network/lol.csv'
# Check if the file exists, then decide whether to write the header
if os.path.exists(file_path):
    df.to_csv(file_path, mode='a', index=False, header=False)
else:
    df.to_csv(file_path, mode='w', index=False, header=True)