In [None]:
import requests

API_KEY = "BRI6G5PQGWG5VX1R8CIKP1MW1HJIXZE8GC"

def get_transactions(address):
    url = "https://api.etherscan.io/v2/api"

    params = {
        "chainid": 1,                
        "module": "account",
        "action": "txlist",
        "address": address,
        "startblock": 0,
        "endblock": 99999999,
        "sort": "asc",
        "apikey": API_KEY
    }

    response = requests.get(url, params=params).json()

    if response.get("status") != "1":
        print(f"{address} is not having any transection record")
        return []

    return response["result"]


In [None]:
import pandas as pd
from collections import deque
import time, random
from tqdm import tqdm

MAX_DEPTH = 2
MAX_VISITED = 300              
MAX_TX_PER_ADDRESS = 20       
NEIGHBOR_SAMPLE_SIZE = 2       
SLEEP_TIME = 0.15

phishing = pd.read_csv("phishingaddress.csv", header=None, names=["address"])
start_nodes = phishing["address"].str.lower().unique()[:10]

visited = set()
queue = deque([(addr, 0) for addr in start_nodes])
transactions = []

pbar = tqdm(total=MAX_VISITED, desc="Graph Expansion")

while queue and len(visited) < MAX_VISITED:
    addr, depth = queue.popleft()

    if addr in visited or depth > MAX_DEPTH:
        continue

    visited.add(addr)
    pbar.update(1)
    pbar.set_description(
        f"Depth {depth} | Visited {len(visited)} | Queue {len(queue)} | Txs {len(transactions)}"
    )

    try:
        txs = get_transactions(addr)

        if not isinstance(txs, list) or len(txs) == 0:
            continue

        if len(txs) > 100:
            continue

        txs = txs[:MAX_TX_PER_ADDRESS]

        for tx in txs:
            if not isinstance(tx, dict):
                continue

            frm = tx.get("from", "").lower()
            to = tx.get("to", "")
            to = to.lower() if to else ""

            transactions.append(tx)

            if depth < MAX_DEPTH:
                neighbors = [n for n in (frm, to) if n and n not in visited]
                neighbors = random.sample(
                    neighbors, min(len(neighbors), NEIGHBOR_SAMPLE_SIZE)
                )

                for n in neighbors:
                    queue.append((n, depth + 1))

        time.sleep(SLEEP_TIME)

    except Exception as e:
        print(f"Error processing {addr}: {e}")

pbar.close()

if transactions:
    pd.DataFrame(transactions).to_csv(
        "transactions_expanded.csv", index=False
    )
    print(f"Done: {len(transactions)} txs, {len(visited)} addresses")
else:
    print("No transactions collected")



Depth 2 | Visited 193 | Queue 339 | Txs 1123:  32%|███▏      | 193/600 [04:36<09:43,  1.43s/it]

Depth 0 | Visited 1 | Queue 9 | Txs 0:   0%|          | 1/300 [00:00<00:02, 129.57it/s][A
Depth 0 | Visited 1 | Queue 9 | Txs 0:   1%|          | 2/300 [00:00<01:27,  3.39it/s] [A
Depth 0 | Visited 2 | Queue 28 | Txs 20:   1%|          | 2/300 [00:00<01:27,  3.39it/s][A
Depth 0 | Visited 2 | Queue 28 | Txs 20:   1%|          | 3/300 [00:01<01:59,  2.49it/s][A
Depth 0 | Visited 3 | Queue 29 | Txs 22:   1%|          | 3/300 [00:01<01:59,  2.49it/s][A
Depth 0 | Visited 3 | Queue 29 | Txs 22:   1%|▏         | 4/300 [00:01<02:21,  2.09it/s][A
Depth 0 | Visited 4 | Queue 48 | Txs 42:   1%|▏         | 4/300 [00:01<02:21,  2.09it/s][A
Depth 0 | Visited 4 | Queue 48 | Txs 42:   2%|▏         | 5/300 [00:02<02:25,  2.03it/s][A
Depth 0 | Visited 5 | Queue 56 | Txs 51:   2%|▏         | 5/300 [00:02<02:25,  2.03it/s][A
Depth 0 | Visited 5 | Queue 56 | Txs 51:   2%|▏         | 6/300 [00:02<02:39

0x11c058c3efbf53939fb6872b09a2b5cf2410a1e2c3f3c867664e43a626d878c0 is not having any transection record



Depth 0 | Visited 8 | Queue 93 | Txs 91:   3%|▎         | 9/300 [00:04<02:32,  1.90it/s][A
Depth 0 | Visited 9 | Queue 92 | Txs 91:   3%|▎         | 9/300 [00:04<02:32,  1.90it/s][A
Depth 0 | Visited 9 | Queue 92 | Txs 91:   3%|▎         | 10/300 [00:05<02:55,  1.65it/s][A
Depth 0 | Visited 10 | Queue 91 | Txs 91:   3%|▎         | 10/300 [00:05<02:55,  1.65it/s][A
Depth 0 | Visited 10 | Queue 91 | Txs 91:   4%|▎         | 11/300 [00:05<02:53,  1.67it/s][A
Depth 1 | Visited 11 | Queue 90 | Txs 91:   4%|▎         | 11/300 [00:05<02:53,  1.67it/s][A
Depth 1 | Visited 11 | Queue 90 | Txs 91:   4%|▍         | 12/300 [00:06<02:40,  1.79it/s][A
Depth 1 | Visited 12 | Queue 90 | Txs 93:   4%|▍         | 12/300 [00:06<02:40,  1.79it/s][A
Depth 1 | Visited 12 | Queue 90 | Txs 93:   4%|▍         | 13/300 [00:06<02:33,  1.87it/s][A
Depth 1 | Visited 13 | Queue 91 | Txs 96:   4%|▍         | 13/300 [00:06<02:33,  1.87it/s][A
Depth 1 | Visited 13 | Queue 91 | Txs 96:   5%|▍         | 14/30

✅ Done: 1906 txs, 300 addresses





In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("transactions_expanded.csv")

df = df[["from", "to", "value", "timeStamp", "blockNumber"]]

df = df.dropna(subset=["from", "to"])

df = df[df["from"] != df["to"]]

df["value"] = pd.to_numeric(df["value"], errors="coerce")
df = df[df["value"] > 0]

df["value"] = df["value"] / 1e18

df["value"] = np.log1p(df["value"])

df["timeStamp"] = df["timeStamp"].astype(int)
df["blockNumber"] = df["blockNumber"].astype(int)

df["from"] = df["from"].str.lower()
df["to"] = df["to"].str.lower()

df.to_csv("transactions_clean.csv", index=False)

print(f"Cleaned transactions saved: {len(df)} rows")


✅ Cleaned transactions saved: 1471 rows


In [None]:
import pandas as pd
import networkx as nx

df = pd.read_csv("transactions_clean.csv")

G = nx.DiGraph()

for _, row in df.iterrows():
    G.add_edge(
        row["from"],
        row["to"],
        value=row["value"],
        time=row["timeStamp"]
    )

print("Graph built")
print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())


✅ Graph built
Nodes: 667
Edges: 871


In [None]:
import pandas as pd
import networkx as nx

df = pd.read_csv("transactions_clean.csv")

G = nx.DiGraph()
for _, row in df.iterrows():
    G.add_edge(row["from"], row["to"], value=row["value"])

rows = []

for node in G.nodes():
    incoming = sum(G[u][node]["value"] for u in G.predecessors(node))
    outgoing = sum(G[node][v]["value"] for v in G.successors(node))

    rows.append({
        "address": node,
        "in_degree": G.in_degree(node),
        "out_degree": G.out_degree(node),
        "total_in": incoming,
        "total_out": outgoing,
        "balance": incoming - outgoing
    })

node_df = pd.DataFrame(rows)
node_df.to_csv("node_features.csv", index=False)

print(f"Node features saved: {len(node_df)} nodes")


✅ Node features saved: 667 nodes


In [None]:
import pandas as pd

nodes = pd.read_csv("node_features.csv")

phishing = pd.read_csv(
    "phishingaddress.csv",
    header=None,
    names=["address"]
)

phishing_set = set(phishing["address"].str.lower())

nodes["label"] = nodes["address"].apply(
    lambda x: 1 if x in phishing_set else 0
)

nodes.to_csv("node_features_labeled.csv", index=False)

print("Labels assigned")
print(nodes["label"].value_counts())


✅ Labels assigned
label
0    657
1     10
Name: count, dtype: int64


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

df = pd.read_csv("node_features_labeled.csv")

X = df[["in_degree", "out_degree", "total_in", "total_out", "balance"]]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       198
           1       0.00      0.00      0.00         3

    accuracy                           0.98       201
   macro avg       0.49      0.50      0.49       201
weighted avg       0.97      0.98      0.98       201

