In [1]:
import logging
# Configure logging
logging.basicConfig(
    level=logging.DEBUG,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Specify the log message format
    datefmt='%Y-%m-%d %H:%M:%S',  # Specify the date format
    handlers=[
        #logging.FileHandler('app.log'),  # Log messages to a file
        logging.StreamHandler()  # Also output log messages to the console
    ]
)
logger = logging.getLogger(__name__)
import pickle 
import networkx as nx
import time
from datetime import datetime, timezone
import os
import sys
import pandas as pd

In [2]:
"""The data stored in pickle format with version: 0.7.5 (python 3.7).
The type of graph object：networkx.classes.multidigraph.MultiDiGraph
Numbers of nodes: 2973489
Numbers of edges: 13551303
Average degree:   4.5574
Nodes' features：
    // The label. 1 means fishing mark node, otherwise 0.
    G.nodes[nodeName]['isp']；

Edges' features:
    G[node1][node2][0]['amount']        // The amount mount of the transaction.
    G[node1][node2][0]['timestamp']     // The timestamp of the transaction.				
							
* Notes * 
"""

def load_pickle(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)
logger.info("Loading the graph")
start = time.time()
G = load_pickle('/mnt/data/ethereum-phishing-transaction-network/MulDiGraph.pkl')
logger.info(f"Graph loaded in {time.time()-start} seconds.")
logger.info(f'Number of nodes: {G.number_of_nodes()}')
logger.info(f'Number of edges: {G.number_of_edges()}')


# save nodes and their labels to a csv
logger.info("Saving nodes and their labels to a csv")
start = time.time()
nodes = []
labels = []
for node in G.nodes():
    nodes.append(node)
    labels.append(G.nodes[node]['isp'])
dfnodes = pd.DataFrame({'node': nodes, 'label': labels})

2024-08-11 14:08:07 - INFO - Loading the graph
2024-08-11 14:08:22 - INFO - Graph loaded in 14.570035457611084 seconds.
2024-08-11 14:08:22 - INFO - Number of nodes: 2973489
2024-08-11 14:08:24 - INFO - Number of edges: 13551303
2024-08-11 14:08:24 - INFO - Saving nodes and their labels to a csv


In [3]:
logger.info("Processing the graph")
uniq = set()
start = time.time()
min_time = 1e20
max_time = 0
from_address = []
to_address = []
time_stamps = []
amounts = []

for ind, edge in enumerate(nx.edges(G)):
    (u, v) = edge
    eg = G[u][v][0]
    amo, tim = eg['amount'], eg['timestamp']
    uniq.add((u, v, tim))
    #uniq.add((u, v, amo, tim))
    min_time = min(min_time, tim)
    max_time = max(max_time, tim)
min_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(min_time))
max_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(max_time))
# print(min_time, max_time)
# print(len(uniq))
logger.info(f"Graph processed in {time.time()-start} seconds.")
logger.info(f'Number of unique transactions: {len(uniq)}')
#print an element from uniq
logger.info(f"An element from uniq: {list(uniq)[0]}")
logger.info(f'Minimum timestamp: {min_time}')
logger.info(f'Maximum timestamp: {max_time}')

2024-08-11 14:08:26 - INFO - Processing the graph
2024-08-11 14:08:38 - INFO - Graph processed in 12.710413932800293 seconds.
2024-08-11 14:08:38 - INFO - Number of unique transactions: 5355155
2024-08-11 14:08:38 - INFO - An element from uniq: ('0x5b2c0ea6fb7130a43bd70bef163f4d7ec72a69f1', '0xb6ee9668771a79be7967ee29a63d4184f8097143', 1517497803.0)
2024-08-11 14:08:38 - INFO - Minimum timestamp: 2015-08-07 07:01:09
2024-08-11 14:08:38 - INFO - Maximum timestamp: 2019-01-19 09:32:09


In [4]:
import os
transactions_path = '/mnt/data/ethereum-phishing-transaction-network/transactions-c.csv'
if os.path.exists(transactions_path):
    columns_to_read = ['from_address', 'to_address', 'nonce', 'gas', 'gas_price', 'value', 'block_timestamp']
    edges = pd.read_csv(transactions_path, usecols=columns_to_read) 
    print(f"Read {len(edges)} transactions from the csv")
    #df['value'] = df['value'].astype(float)
    edges['block_timestamp'] = pd.to_datetime(edges['block_timestamp']).view('int64') / 10**9
    # print a row of the dataframe
    print(f"A row from the dataframe: {edges.iloc[0]}")
    #exists = set(zip(df['from_address'], df['to_address'], df['value'], df['block_timestamp']))
    exists = set(zip(edges['from_address'], edges['to_address'], edges['block_timestamp']))
    diff = uniq - exists

Read 5256894 transactions from the csv
A row from the dataframe: nonce                                                    1746
from_address       0xc3d9c17d7f6988c0fe7ebe929c47efccbd92be13
to_address         0x0d64b06929f797d641064521705714498618fe1e
value                                     5550000000000000000
gas                                                     90000
gas_price                                         20417458463
block_timestamp                                  1494731739.0
Name: 0, dtype: object


  edges['block_timestamp'] = pd.to_datetime(edges['block_timestamp']).view('int64') / 10**9


In [5]:
# compare the transactions in the graph with the transactions in the csv
print("Number of transactions in the csv: ", len(exists))
print("Number of transactions in the graph but not in the csv: ", len(diff))
print("Number of transactions in the csv but not in the graph: ", len(exists - uniq))

Number of transactions in the csv:  5242514
Number of transactions in the graph but not in the csv:  136344
Number of transactions in the csv but not in the graph:  23703


In [6]:
# compare the nodes in the graph with the from_address and to_address in the csv
nodes = set(G.nodes())
from_addresses = set(edges['from_address'])
to_addresses = set(edges['to_address'])
csv_nodes = from_addresses.union(to_addresses)
print(f"Number of nodes in the graph: {len(nodes)}")
print(f"Number of nodes in the csv: {len(csv_nodes)}")
print(f"Number of nodes in the graph but not in the csv: {len(nodes - csv_nodes)}")
print(f"Number of nodes in the csv but not in the graph: {len(csv_nodes - nodes)}")
# what is the percentage of labels of the nodes in the graph but not in the csv
missing_nodes = list(nodes-csv_nodes)
missing_labels = [G.nodes[node]['isp'] for node in missing_nodes]
print(f"Percentage of labels in the graph but not in the csv: {sum(missing_labels)/len(missing_labels)}")
print(f"Ilicit missing nodes {sum(missing_labels)}")
labels = [G.nodes[node]['isp'] for node in csv_nodes] 


Number of nodes in the graph: 2973489
Number of nodes in the csv: 2890207
Number of nodes in the graph but not in the csv: 83282
Number of nodes in the csv but not in the graph: 0
Percentage of labels in the graph but not in the csv: 7.204437933767201e-05
Ilicit missing nodes 6


In [7]:
df_nodes = pd.DataFrame({'node': list(csv_nodes), 'label': labels})

In [8]:
import numpy as np
# renumber from_address and to_address starting from 0 and update the dataframe
unique_ids = pd.concat([edges['from_address'], edges['to_address']]).unique()
# create a mapping from the original ID to a new ID
id_map = {old_id: new_id for new_id, old_id in enumerate(unique_ids)}
# replace the original IDs with the new IDs
edges['from_address'] = edges['from_address'].apply(lambda x: id_map[x])
edges['to_address'] = edges['to_address'].apply(lambda x: id_map[x])
# convert timestamp to UNIX
#df['block_timestamp'] = pd.to_datetime(df['block_timestamp']).astype('int64') / 10**9

# re label dfnodes with the new ids
df_nodes['node'] = df_nodes['node'].apply(lambda x: id_map[x])

# normalize numerical columns
for col in edges.columns:
    print(col)
    if col in ['from_address', 'to_address', 'phishing', 'receipt_status', 'block_timestamp', 'transaction_index']:
        continue
    edges[col] = edges[col].astype(float)
    edges[col] = np.log1p(edges[col])
    edges[col] = (edges[col] - edges[col].min()) / (edges[col].max() - edges[col].min())

nonce
from_address
to_address
value
gas
gas_price
block_timestamp


In [9]:
print(df_nodes.head())
len(df_nodes)

      node  label
0  2779686      0
1   760873      0
2  1496063      0
3   200774      0
4  2336759      0


2890207

In [10]:
import pandas as pd

# Assuming nodes and edges are pandas DataFrames

# Get the first transaction timestamp for each from_address
from_min_timestamp = edges.groupby('from_address')['block_timestamp'].min().reset_index()
from_min_timestamp.columns = ['node', 'first_transaction']

# Get the first transaction timestamp for each to_address
to_min_timestamp = edges.groupby('to_address')['block_timestamp'].min().reset_index()
to_min_timestamp.columns = ['node', 'first_transaction']

# Concatenate both results and take the minimum timestamp for each node
combined = pd.concat([from_min_timestamp, to_min_timestamp])

# Group by node and take the earliest transaction
first_transactions = combined.groupby('node')['first_transaction'].min().reset_index()

# Merge with the nodes DataFrame
df_nodes = df_nodes.merge(first_transactions, on='node', how='left')
df_nodes.head()

Unnamed: 0,node,label,first_transaction
0,2779686,0,1520490000.0
1,760873,0,1517382000.0
2,1496063,0,1498833000.0
3,200774,0,1534206000.0
4,2336759,0,1515501000.0


In [11]:
df_nodes.to_csv('/mnt/data/ethereum-phishing-transaction-network/nodes.csv', index=False)
edges.to_csv('/mnt/data/ethereum-phishing-transaction-network/edges.csv', index=False)

In [17]:
dummy_edges = edges.head(1000)
unique_ids = pd.concat([dummy_edges['from_address'], dummy_edges['to_address']]).unique()
id_map = {old_id: new_id for new_id, old_id in enumerate(unique_ids)}
dummy_edges['from_address'] = dummy_edges['from_address'].apply(lambda x: id_map[x])
dummy_edges['to_address'] = dummy_edges['to_address'].apply(lambda x: id_map[x])

mask = df_nodes["node"].isin(unique_ids)
dummy_nodes = df_nodes[mask]
dummy_nodes['node'] = dummy_nodes['node'].apply(lambda x: id_map[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dummy_edges['from_address'] = dummy_edges['from_address'].apply(lambda x: id_map[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dummy_edges['to_address'] = dummy_edges['to_address'].apply(lambda x: id_map[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dummy_nodes['node'] = dummy_nodes['node

In [18]:
dummy_nodes.to_csv('/mnt/data/ethereum-phishing-transaction-network-dummy/nodes.csv', index=False)
dummy_edges.to_csv('/mnt/data/ethereum-phishing-transaction-network-dummy/edges.csv', index=False)

In [2]:
%load_ext autoreload
%autoreload 2
# add parent directory to the path
import sys
sys.path.append('../')
from src.datasets import EthereumPhishingTransactions, EthereumPhishingNodes
from src.datasets.util.mask import PretrainType
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
dataset = EthereumPhishingTransactions(
    root='/mnt/data/ethereum-phishing-transaction-network/ethereum-phishing-transaction-network-c.csv', 
    pretrain={PretrainType.MASK, PretrainType.LINK_PRED},
    split_type='temporal',
    
    splits=[0.6, 0.2, 0.2], 
    khop_neighbors=[100, 100]
)
start = time.time()
dataset.materialize()
logger.info(f"Dataset materialized in {time.time()-start} seconds.")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [9]:
nodes = EthereumPhishingNodes(root='/mnt/data/ethereum-phishing-transaction-network/nodes.csv')
nodes.materialize()

Masked applied
Tensor frame created


EthereumPhishingNodes()

In [14]:
from torch_frame.data import DataLoader
train_dataset, val_dataset, test_dataset = nodes.split()
tensor_frame = nodes.tensor_frame 
train_loader = DataLoader(train_dataset.tensor_frame, batch_size=2048, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset.tensor_frame, batch_size=2048, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset.tensor_frame, batch_size=2048, shuffle=False, num_workers=4)

In [16]:
batch = next(iter(train_loader))
print(batch)
print(batch.y)
print(sum(batch.y))

TensorFrame(
  num_cols=1,
  num_rows=2048,
  numerical (1): ['node'],
  has_target=True,
  device='cpu',
)
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor(2)
